Unverified Commit 48d99025 authored by z55250825, committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all parrots files
add new parrots op impl for bbox_overlaps

* support the first op with the new parrots impl (bbox_overlaps), test succeeded

* add box_iou_rotated op, test succeeded

* add carafe and carafe_naive ops, test succeeded (one parrots bug needs a fix)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test succeeded (one open question remains)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset);
void bbox_overlaps_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int mode, offset;
bool aligned;
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = ins[0];
const auto& bboxes2 = ins[1];
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
#endif
auto& ious = outs[0];
void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
if (bboxes1.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(bboxes1);
CHECK_CUDA_INPUT(bboxes2);
CHECK_CUDA_INPUT(ious);
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset,
stream);
bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
#else
AT_ERROR("bbox_overlaps is not compiled with GPU support");
#endif
} else {
AT_ERROR("bbox_overlaps is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_cuda)
.done();
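For orientation, here is a minimal host-side sketch of how this dispatcher can be exercised from C++. It assumes a CUDA build linked against the compiled ops; the box values are placeholders, boxes are (x1, y1, x2, y2), and mode 0 selects plain IoU.

#include <torch/torch.h>

// Declaration of the dispatcher above; resolved at link time.
void bbox_overlaps(const at::Tensor bboxes1, const at::Tensor bboxes2,
                   at::Tensor ious, const int mode, const bool aligned,
                   const int offset);

int main() {
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  auto bboxes1 = torch::rand({8, 4}, opts);   // placeholder boxes
  auto bboxes2 = torch::rand({16, 4}, opts);
  auto ious = torch::zeros({8, 16}, opts);    // filled in-place by the op
  bbox_overlaps(bboxes1, bboxes2, ious, /*mode=*/0, /*aligned=*/false,
                /*offset=*/0);
  return 0;
}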
#include "bbox_overlaps_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream) {
int output_size = ious.size();
int num_bbox1 = bboxes1.dim(0);
int num_bbox2 = bboxes2.dim(0);
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset) {
int output_size = ious.numel();
int num_bbox1 = bboxes1.size(0);
int num_bbox2 = bboxes2.size(0);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(bboxes1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.scalar_type(), "bbox_overlaps_cuda_kernel", ([&] {
bbox_overlaps_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
bboxes1.ptr<scalar_t>(), bboxes2.ptr<scalar_t>(),
ious.ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),
ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
offset);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
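The GET_BLOCKS / THREADS_PER_BLOCK helpers used in this launch come from the shared CUDA helper header. As a rough sketch (the constant and the exact definition are assumptions here, and the real helper may additionally cap the block count), the grid size amounts to one thread per output element rounded up to whole blocks:

// Sketch of the assumed helper; the real definition lives in the shared
// CUDA helper header and may clamp the result to a maximum block count.
#define THREADS_PER_BLOCK 512

inline int GET_BLOCKS(const int N) {
  return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
}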
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "bbox_overlaps_pytorch.h"
using namespace parrots;
/*
* void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor
* ious, const int mode, const bool aligned, const int offset);
*/
void bbox_overlaps_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int mode, offset;
bool aligned;
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = buildATensor(ctx, ins[0]);
const auto& bboxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
}
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_parrots)
.done();
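The file above is the bridge template this PR repeats for every op: read scalar attributes with SSAttrs, wrap the DArray inputs and outputs as ATen tensors with buildATensor, and forward to the shared Tensor-based implementation. A minimal sketch of the same pattern for a hypothetical op (the op name, attribute, and input/output arity below are illustrative only):

#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

#include <torch/extension.h>

using namespace parrots;

// Hypothetical op used only to illustrate the bridge pattern.
void my_op_cuda(const at::Tensor input, at::Tensor output, int some_attr);

void my_op_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                        const OperatorBase::in_list_t& ins,
                        OperatorBase::out_list_t& outs) {
  int some_attr;
  SSAttrs(attr).get<int>("some_attr", some_attr).done();

  const auto& input = buildATensor(ctx, ins[0]);
  auto output = buildATensor(ctx, outs[0]);
  my_op_cuda(input, output, some_attr);
}

PARROTS_EXTENSION_REGISTER(my_op)
    .attr("some_attr")
    .input(1)
    .output(1)
    .apply(my_op_cuda_parrots)
    .done();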
#ifndef BBOX_OVERLAPS_PYTORCH_H
#define BBOX_OVERLAPS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset);
#endif // BBOX_OVERLAPS_PYTORCH_H
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void box_iou_rotated_cpu_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
void box_iou_rotated_cuda_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
const int mode_flag, const bool aligned,
cudaStream_t stream);
void box_iou_rotated_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& boxes1 = ins[0];
const auto& boxes2 = ins[1];
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
auto& ious = outs[0];
box_iou_rotated_cpu_launcher(boxes1, boxes2, ious, mode_flag, aligned);
}
void box_iou_rotated_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& boxes1 = ins[0];
const auto& boxes2 = ins[1];
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
auto& ious = outs[0];
box_iou_rotated_cuda_launcher(boxes1, boxes2, ious, mode_flag, aligned,
stream);
}
#ifdef MMCV_WITH_CUDA
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#endif
PARROTS_EXTENSION_REGISTER(box_iou_rotated)
.attr("aligned")
.attr("mode_flag")
.input(2)
.output(1)
.apply(box_iou_rotated_cpu)
#ifdef PARROTS_USE_CUDA
.apply(box_iou_rotated_cuda)
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
if (boxes1.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);
#else
AT_ERROR("Not compiled with GPU support");
#endif
.done();
} else {
box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);
}
}
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename T>
void box_iou_rotated_cpu_kernel(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
const int mode_flag, const bool aligned) {
int output_size = ious.size();
int num_boxes1 = boxes1.dim(0);
int num_boxes2 = boxes2.dim(0);
auto ious_ptr = ious.ptr<float>();
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
Tensor ious, const int mode_flag,
const bool aligned) {
int output_size = ious.numel();
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
if (aligned) {
for (int i = 0; i < output_size; i++) {
ious_ptr[i] = single_box_iou_rotated<T>(boxes1[i].ptr<T>(),
boxes2[i].ptr<T>(), mode_flag);
ious[i] = single_box_iou_rotated<T>(boxes1[i].data_ptr<T>(),
boxes2[i].data_ptr<T>(), mode_flag);
}
} else {
for (int i = 0; i < num_boxes1; i++) {
for (int j = 0; j < num_boxes2; j++) {
ious_ptr[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].ptr<T>(), boxes2[j].ptr<T>(), mode_flag);
ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);
}
}
}
}
void box_iou_rotated_cpu_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}
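A small CPU-side sanity sketch for the kernel above. It assumes libtorch and linking against the compiled op; the (cx, cy, w, h, angle) box layout and mode_flag 0 meaning IoU are assumptions based on the rotated-IoU utilities. Two identical boxes should yield an IoU of 1.

#include <torch/torch.h>

void box_iou_rotated_cpu(const at::Tensor boxes1, const at::Tensor boxes2,
                         at::Tensor ious, const int mode_flag,
                         const bool aligned);

int main() {
  auto boxes1 = torch::zeros({1, 5});
  boxes1[0][2] = 4.0f;  // width
  boxes1[0][3] = 2.0f;  // height
  auto boxes2 = boxes1.clone();   // identical box, so IoU should be 1
  auto ious = torch::zeros({1});  // flattened (n1 * n2) output for aligned=false
  box_iou_rotated_cpu(boxes1, boxes2, ious, /*mode_flag=*/0, /*aligned=*/false);
  return 0;
}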
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
#include "box_iou_rotated_cuda.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void box_iou_rotated_cuda_launcher(const DArrayLite boxes1,
const DArrayLite boxes2, DArrayLite ious,
const int mode_flag, const bool aligned,
cudaStream_t stream) {
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
using scalar_t = float;
AT_ASSERTM(boxes1.type().is_cuda(), "boxes1 must be a CUDA tensor");
AT_ASSERTM(boxes2.type().is_cuda(), "boxes2 must be a CUDA tensor");
int output_size = ious.size();
int num_boxes1 = boxes1.dim(0);
int num_boxes2 = boxes2.dim(0);
int output_size = ious.numel();
int num_boxes1 = boxes1.size(0);
int num_boxes2 = boxes2.size(0);
at::cuda::CUDAGuard device_guard(boxes1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
box_iou_rotated_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
num_boxes1, num_boxes2, boxes1.ptr<scalar_t>(),
boxes2.ptr<scalar_t>(), (scalar_t*)ious.ptr<scalar_t>(), mode_flag,
aligned);
PARROTS_CUDA_CHECK(cudaGetLastError());
num_boxes1, num_boxes2, boxes1.data_ptr<scalar_t>(),
boxes2.data_ptr<scalar_t>(), (scalar_t*)ious.data_ptr<scalar_t>(),
mode_flag, aligned);
AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "box_iou_rotated_pytorch.h"
using namespace parrots;
/*
* void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor
* ious, const int mode_flag, const bool aligned);
*/
void box_iou_rotated_cpu_parrots(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
const auto& boxes1 = buildATensor(ctx, ins[0]);
const auto& boxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);
}
#ifdef MMCV_WITH_CUDA
/*
* void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor
* ious, const int mode_flag, const bool aligned);
*/
void box_iou_rotated_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
bool aligned;
int mode_flag;
SSAttrs(attr)
.get<bool>("aligned", aligned)
.get<int>("mode_flag", mode_flag)
.done();
const auto& boxes1 = buildATensor(ctx, ins[0]);
const auto& boxes2 = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);
}
#endif
PARROTS_EXTENSION_REGISTER(box_iou_rotated)
.attr("aligned")
.attr("mode_flag")
.input(2)
.output(1)
.apply(box_iou_rotated_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(box_iou_rotated_cuda_parrots)
#endif
.done();
#ifndef BOX_IOU_ROTATED_PYTORCH_H
#define BOX_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#ifdef MMCV_WITH_CUDA
void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned);
#endif
#endif // BOX_IOU_ROTATED_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
Tensor rfeatures, Tensor routput,
Tensor rmasks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream);
const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
const int kernel_size, const int group_size, const int scale_factor);
void carafe_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& rfeatures = outs[0];
auto& routput = outs[1];
auto& rmasks = outs[2];
auto& output = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
output, kernel_size, group_size, scale_factor,
stream);
output, kernel_size, group_size,
scale_factor);
}
void carafe_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& rfeatures = ins[1];
const auto& masks = ins[2];
auto& rtop_grad = outs[0];
auto rbottom_grad_hs = outs[1];
auto& rbottom_grad = outs[2];
auto& rmask_grad = outs[3];
auto& bottom_grad = outs[4];
auto& mask_grad = outs[5];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor) {
CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size,
group_size, scale_factor, stream);
group_size, scale_factor);
}
#endif
PARROTS_EXTENSION_REGISTER(carafe_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(4)
.apply(carafe_forward_cuda)
.done();
void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
if (features.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(routput);
CHECK_CUDA_INPUT(rmasks);
CHECK_CUDA_INPUT(output);
carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(carafe_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(6)
.apply(carafe_backward_cuda)
.done();
void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
Tensor mask_grad, int kernel_size, int group_size,
int scale_factor) {
if (top_grad.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(rfeatures);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(rtop_grad);
CHECK_CUDA_INPUT(rbottom_grad_hs);
CHECK_CUDA_INPUT(rbottom_grad);
CHECK_CUDA_INPUT(rmask_grad);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
#else
AT_ERROR("Carafe is not compiled with GPU support");
#endif
} else {
AT_ERROR("Carafe is not implemented on CPU");
}
}
#include "carafe_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
const int batch_size = output.dim(0);
const int channels = output.dim(1);
const int output_height = output.dim(2);
const int output_width = output.dim(3);
void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
Tensor rfeatures, Tensor routput,
Tensor rmasks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor) {
const int batch_size = output.size(0);
const int channels = output.size(1);
const int output_height = output.size(2);
const int output_width = output.size(3);
const int input_height = features.dim(2);
const int input_width = features.dim(3);
const int input_height = features.size(2);
const int input_width = features.size(3);
const int mask_channels = masks.dim(1);
const int mask_channels = masks.size(1);
rfeatures.resize_({batch_size, input_height, input_width, channels});
routput.resize_({batch_size, output_height, output_width, channels});
rmasks.resize_({batch_size, output_height, output_width, mask_channels});
// one warp per pixel
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(features.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NCHW2NHWC_Feature", ([&] {
const scalar_t *bottom_data = features.data_ptr<scalar_t>();
scalar_t *top_data = rfeatures.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(input_height * input_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, input_height * input_width, dh, dw,
features.ptr<scalar_t>(), rfeatures.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NCHW2NHWC_Masks", ([&] {
const scalar_t *bottom_data = masks.data_ptr<scalar_t>();
scalar_t *top_data = rmasks.data_ptr<scalar_t>();
const int dh = divideUP(mask_channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, mask_channels, output_height * output_width, dh, dw,
masks.ptr<scalar_t>(), rmasks.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "CARAFELaucherForward", ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();
scalar_t *top_data = routput.data_ptr<scalar_t>();
CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rfeatures.ptr<scalar_t>(), rmasks.ptr<scalar_t>(),
kernel_size, group_size, scale_factor, channels, input_height,
input_width, output_height, output_width, mask_channels,
routput.ptr<scalar_t>());
num_kernels, bottom_data, bottom_masks, kernel_size, group_size,
scale_factor, channels, input_height, input_width, output_height,
output_width, mask_channels, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "NHWC2NCHW", ([&] {
const scalar_t *bottom_data = routput.data_ptr<scalar_t>();
scalar_t *top_data = output.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, channels, dh, dw,
routput.ptr<scalar_t>(), output.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream) {
const int batch_size = top_grad.dim(0);
const int channels = top_grad.dim(1);
const int output_height = top_grad.dim(2);
const int output_width = top_grad.dim(3);
const int input_height = bottom_grad.dim(2);
const int input_width = bottom_grad.dim(3);
const int mask_channels = masks.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
const int kernel_size, const int group_size, const int scale_factor) {
const int batch_size = top_grad.size(0);
const int channels = top_grad.size(1);
const int output_height = top_grad.size(2);
const int output_width = top_grad.size(3);
const int input_height = bottom_grad.size(2);
const int input_width = bottom_grad.size(3);
const int mask_channels = masks.size(1);
rtop_grad.resize_({batch_size, output_height, output_width, channels});
rbottom_grad.resize_({batch_size, input_height, input_width, channels});
rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels});
rmask_grad.resize_({batch_size, output_height, output_width, mask_channels});
at::cuda::CUDAGuard device_guard(top_grad.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] {
const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();
scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, output_height * output_width, dh, dw,
top_grad.ptr<scalar_t>(), rtop_grad.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();
CARAFEBackward_Feature<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
masks.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rbottom_grad_hs.ptr<scalar_t>());
stream>>>(num_kernels, top_diff, bottom_masks, kernel_size,
group_size, scale_factor, channels, input_height,
input_width, output_height, output_width,
mask_channels, bottom_diff);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "FeatureSum", ([&] {
const int num_kernels =
batch_size * input_height * input_width * THREADS_PER_PIXEL;
const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();
scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();
FeatureSum<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rbottom_grad_hs.ptr<scalar_t>(), scale_factor,
channels, input_height, input_width, rbottom_grad.ptr<scalar_t>());
FeatureSum<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, bottom_diff_hs, scale_factor, channels,
input_height, input_width, bottom_diff);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] {
const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();
scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();
const int dh = divideUP(input_height * input_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, input_height * input_width, channels, dh, dw,
rbottom_grad.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] {
const int num_kernels = batch_size * output_height * output_width *
mask_channels * WARP_SIZE;
const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();
CARAFEBackward_Mask<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
rfeatures.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rmask_grad.ptr<scalar_t>());
stream>>>(num_kernels, top_diff, bottom_data, kernel_size,
group_size, scale_factor, channels, input_height,
input_width, output_height, output_width,
mask_channels, mask_diff);
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] {
const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();
scalar_t *top_data = mask_grad.data_ptr<scalar_t>();
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(mask_channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, mask_channels, dh, dw,
rmask_grad.ptr<scalar_t>(), mask_grad.ptr<scalar_t>());
bottom_data, top_data);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const Tensor masks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor);
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
const Tensor top_grad, const Tensor features, const Tensor masks,
Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int group_size, const int scale_factor);
void carafe_naive_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor) {
CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
group_size, scale_factor, stream);
group_size, scale_factor);
}
void carafe_naive_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& features = ins[1];
const auto& masks = ins[2];
auto& bottom_grad = outs[0];
auto& mask_grad = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor) {
CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor, stream);
scale_factor);
}
#endif
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(1)
.apply(carafe_naive_forward_cuda)
.done();
void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size, int scale_factor) {
if (features.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(output);
carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(2)
.apply(carafe_naive_backward_cuda)
.done();
void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size, int scale_factor) {
if (top_grad.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(features);
CHECK_CUDA_INPUT(masks);
CHECK_CUDA_INPUT(bottom_grad);
CHECK_CUDA_INPUT(mask_grad);
carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor);
#else
AT_ERROR("CarafeNaive is not compiled with GPU support");
#endif
} else {
AT_ERROR("CarafeNaive is not implemented on CPU");
}
}
#include "carafe_naive_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = output.size();
int channels = output.dim(1);
int height = output.dim(2);
int width = output.dim(3);
void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
const Tensor masks, Tensor output,
const int kernel_size,
const int group_size,
const int scale_factor) {
int output_size = output.numel();
int channels = output.size(1);
int height = output.size(2);
int width = output.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(features.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "CARAFENAIVEForward", ([&] {
carafe_naive_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, features.ptr<scalar_t>(), masks.ptr<scalar_t>(),
output.ptr<scalar_t>(), kernel_size, group_size, scale_factor,
channels, height, width);
output_size, features.data_ptr<scalar_t>(),
masks.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
kernel_size, group_size, scale_factor, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = top_grad.size();
int channels = top_grad.dim(1);
int height = top_grad.dim(2);
int width = top_grad.dim(3);
const Tensor top_grad, const Tensor features, const Tensor masks,
Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
const int group_size, const int scale_factor) {
int output_size = top_grad.numel();
int channels = top_grad.size(1);
int height = top_grad.size(2);
int width = top_grad.size(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
at::cuda::CUDAGuard device_guard(top_grad.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.scalar_type(), "CARAFENAIVEBackward", ([&] {
carafe_naive_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, top_grad.ptr<scalar_t>(), features.ptr<scalar_t>(),
masks.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>(),
mask_grad.ptr<scalar_t>(), kernel_size, group_size,
output_size, top_grad.data_ptr<scalar_t>(),
features.data_ptr<scalar_t>(), masks.data_ptr<scalar_t>(),
bottom_grad.data_ptr<scalar_t>(),
mask_grad.data_ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "carafe_naive_pytorch.h"
using namespace parrots;
/*void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
* int kernel_size, int group_size,
* int scale_factor)
*/
void carafe_naive_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = buildATensor(ctx, ins[0]);
const auto& masks = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
scale_factor);
}
/*void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor
* masks, Tensor bottom_grad, Tensor mask_grad, int kernel_size, int group_size,
* int scale_factor);
*/
void carafe_naive_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = buildATensor(ctx, ins[0]);
const auto& features = buildATensor(ctx, ins[1]);
const auto& masks = buildATensor(ctx, ins[2]);
auto bottom_grad = buildATensor(ctx, outs[0]);
auto mask_grad = buildATensor(ctx, outs[1]);
carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(1)
.apply(carafe_naive_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(2)
.apply(carafe_naive_backward_cuda_parrots)
.done();
#ifndef CARAFE_NAIVE_PYTORCH_H
#define CARAFE_NAIVE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
int kernel_size, int group_size,
int scale_factor);
void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
Tensor bottom_grad, Tensor mask_grad,
int kernel_size, int group_size,
int scale_factor);
#endif // CARAFE_NAIVE_PYTORCH_H
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "carafe_pytorch.h"
using namespace parrots;
/*
* void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
* Tensor routput, Tensor rmasks, Tensor output,
* int kernel_size, int group_size, int scale_factor);
*/
void carafe_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = buildATensor(ctx, ins[0]);
const auto& masks = buildATensor(ctx, ins[1]);
auto rfeatures = buildATensor(ctx, outs[0]);
auto routput = buildATensor(ctx, outs[1]);
auto rmasks = buildATensor(ctx, outs[2]);
auto output = buildATensor(ctx, outs[3]);
carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
kernel_size, group_size, scale_factor);
}
/*
* void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
* Tensor rtop_grad, Tensor rbottom_grad_hs,
* Tensor rbottom_grad, Tensor rmask_grad,
* Tensor bottom_grad, Tensor mask_grad, int
* kernel_size, int group_size, int scale_factor);
*/
void carafe_backward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = buildATensor(ctx, ins[0]);
const auto& rfeatures = buildATensor(ctx, ins[1]);
const auto& masks = buildATensor(ctx, ins[2]);
auto rtop_grad = buildATensor(ctx, outs[0]);
auto rbottom_grad_hs = buildATensor(ctx, outs[1]);
auto rbottom_grad = buildATensor(ctx, outs[2]);
auto rmask_grad = buildATensor(ctx, outs[3]);
auto bottom_grad = buildATensor(ctx, outs[4]);
auto mask_grad = buildATensor(ctx, outs[5]);
carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
rbottom_grad, rmask_grad, bottom_grad, mask_grad,
kernel_size, group_size, scale_factor);
}
PARROTS_EXTENSION_REGISTER(carafe_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(4)
.apply(carafe_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(carafe_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(6)
.apply(carafe_backward_cuda_parrots)
.done();
#ifndef CARAFE_PYTORCH_H
#define CARAFE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
Tensor routput, Tensor rmasks, Tensor output,
int kernel_size, int group_size, int scale_factor);
void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
Tensor rtop_grad, Tensor rbottom_grad_hs,
Tensor rbottom_grad, Tensor rmask_grad,
Tensor bottom_grad, Tensor mask_grad, int kernel_size,
int group_size, int scale_factor);
#endif // CARAFE_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream);
#ifdef MMCV_WITH_CUDA
void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f, Tensor weight);
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream);
void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const Tensor f, Tensor dt, Tensor df);
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream);
void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
Tensor out);
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream);
void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg);
void ca_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &t = ins[0];
const auto &f = ins[1];
auto &weight = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAForwardCUDAKernelLauncher(t, f, weight, ctx, stream);
void ca_forward_cuda(const Tensor t, const Tensor f, Tensor weight) {
CAForwardCUDAKernelLauncher(t, f, weight);
}
void ca_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dw = ins[0];
const auto &t = ins[1];
const auto &f = ins[2];
auto &dt = outs[0];
auto &df = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CABackwardCUDAKernelLauncher(dw, t, f, dt, df, ctx, stream);
void ca_backward_cuda(const Tensor dw, const Tensor t, const Tensor f,
Tensor dt, Tensor df) {
CABackwardCUDAKernelLauncher(dw, t, f, dt, df);
}
void ca_map_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &weight = ins[0];
const auto &g = ins[1];
auto &out = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapForwardCUDAKernelLauncher(weight, g, out, ctx, stream);
void ca_map_forward_cuda(const Tensor weight, const Tensor g, Tensor out) {
CAMapForwardCUDAKernelLauncher(weight, g, out);
}
void ca_map_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dout = ins[0];
const auto &weight = ins[1];
const auto &g = ins[2];
auto &dw = outs[0];
auto &dg = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg, ctx, stream);
void ca_map_backward_cuda(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg) {
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg);
}
#endif
PARROTS_EXTENSION_REGISTER(ca_forward)
.input(2)
.output(1)
.apply(ca_forward_cuda)
.done();
void ca_forward(const Tensor t, const Tensor f, Tensor weight) {
if (t.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(t);
CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(weight);
ca_forward_cuda(t, f, weight);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_backward)
.input(3)
.output(2)
.apply(ca_backward_cuda)
.done();
void ca_backward(const Tensor dw, const Tensor t, const Tensor f, Tensor dt,
Tensor df) {
if (dw.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(t);
CHECK_CUDA_INPUT(f);
CHECK_CUDA_INPUT(dt);
CHECK_CUDA_INPUT(df);
ca_backward_cuda(dw, t, f, dt, df);
#else
AT_ERROR("ca is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_map_forward)
.input(2)
.output(1)
.apply(ca_map_forward_cuda)
.done();
void ca_map_forward(const Tensor weight, const Tensor g, Tensor out) {
if (weight.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(out);
ca_map_forward_cuda(weight, g, out);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
PARROTS_EXTENSION_REGISTER(ca_map_backward)
.input(3)
.output(2)
.apply(ca_map_backward_cuda)
.done();
void ca_map_backward(const Tensor dout, const Tensor weight, const Tensor g,
Tensor dw, Tensor dg) {
if (dout.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(dout);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(g);
CHECK_CUDA_INPUT(dw);
CHECK_CUDA_INPUT(dg);
ca_map_backward_cuda(dout, weight, g, dw, dg);
#else
AT_ERROR("ca_map is not compiled with GPU support");
#endif
} else {
AT_ERROR("ca is not implemented on the CPU");
}
}
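A host-side sketch of how the criss-cross attention forward wrapper is typically driven. It assumes a CUDA build linked against the compiled ops, and assumes the criss-cross layout in which weight stores H + W - 1 affinities per spatial position; the shapes below are illustrative only.

#include <torch/torch.h>

void ca_forward(const at::Tensor t, const at::Tensor f, at::Tensor weight);

int main() {
  auto opts = torch::dtype(torch::kFloat32).device(torch::kCUDA);
  const int64_t n = 2, c = 8, h = 16, w = 16;
  auto t = torch::rand({n, c, h, w}, opts);  // query projection
  auto f = torch::rand({n, c, h, w}, opts);  // key projection
  // Affinities for each pixel's row and column (assumed layout).
  auto weight = torch::zeros({n, h + w - 1, h, w}, opts);
  ca_forward(t, f, weight);
  return 0;
}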
// Modified from
// https://github.com/LikeLy-Journey/SegmenTron/blob/master/segmentron/modules/csrc/criss_cross_attention/ca_cuda.cu
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include "cc_attention_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const Tensor t, const Tensor f,
Tensor weight) {
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_forward", [&] {
ca_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CABackwardCUDAKernelLauncher(const Tensor dw, const Tensor t,
const Tensor f, Tensor dt, Tensor df) {
AT_ASSERTM(dw.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(t.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(f.device().is_cuda(), "input must be a CUDA tensor");
auto n = t.size(0);
auto c = t.size(1);
auto h = t.size(2);
auto w = t.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(t.scalar_type(), "ca_backward_kernel_t", [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
dt.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(f.scalar_type(), "ca_backward_kernel_f", [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.contiguous().data_ptr<scalar_t>(),
t.contiguous().data_ptr<scalar_t>(),
f.contiguous().data_ptr<scalar_t>(),
df.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const Tensor weight, const Tensor g,
Tensor out) {
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = g.size(0);
auto c = g.size(1);
auto h = g.size(2);
auto w = g.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_forward", [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
out.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const Tensor dout, const Tensor weight,
const Tensor g, Tensor dw, Tensor dg) {
AT_ASSERTM(dout.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(weight.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(g.device().is_cuda(), "input must be a CUDA tensor");
auto n = dout.size(0);
auto c = dout.size(1);
auto h = dout.size(2);
auto w = dout.size(3);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
AT_DISPATCH_FLOATING_TYPES(
weight.scalar_type(), "ca_map_backward_kernel_w", [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dw.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
AT_DISPATCH_FLOATING_TYPES(g.scalar_type(), "ca_map_backward_kernel_g", [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.contiguous().data_ptr<scalar_t>(),
weight.contiguous().data_ptr<scalar_t>(),
g.contiguous().data_ptr<scalar_t>(),
dg.contiguous().data_ptr<scalar_t>(), n, c, h, w);
});
THCudaCheck(cudaGetLastError());
}
#include "cc_attention_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_forward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(t.ptr<scalar_t>(), f.ptr<scalar_t>(),
weight.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
dt.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(f.elemType().prim(), [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
df.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream) {
auto n = g.dim(0);
auto c = g.dim(1);
auto h = g.dim(2);
auto w = g.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.ptr<scalar_t>(), g.ptr<scalar_t>(), out.ptr<scalar_t>(), n, c, h,
w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream) {
auto n = dout.dim(0);
auto c = dout.dim(1);
auto h = dout.dim(2);
auto w = dout.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(weight.elemType().prim(), [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dw.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dg.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}