Commit 6f3c5f1c authored by limm

support v1.4.0

parent 6f674c7e
@@ -2,19 +2,36 @@
 // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

-void ball_query_forward_impl(int b, int n, int m, float min_radius,
+#ifdef MMCV_WITH_CUDA
+void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
+                                        float max_radius, int nsample,
+                                        const Tensor new_xyz, const Tensor xyz,
+                                        Tensor idx);
+
+void ball_query_forward_cuda(int b, int n, int m, float min_radius,
                              float max_radius, int nsample,
                              const Tensor new_xyz, const Tensor xyz,
                              Tensor idx) {
-  DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,
-                       nsample, new_xyz, xyz, idx);
-}
+  BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample,
+                                     new_xyz, xyz, idx);
+};
+#endif

 void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
                         Tensor idx_tensor, int b, int n, int m,
                         float min_radius, float max_radius, int nsample) {
-  ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,
-                          new_xyz_tensor, xyz_tensor, idx_tensor);
+  if (new_xyz_tensor.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(new_xyz_tensor);
+    CHECK_CUDA_INPUT(xyz_tensor);
+    ball_query_forward_cuda(b, n, m, min_radius, max_radius, nsample,
+                            new_xyz_tensor, xyz_tensor, idx_tensor);
+#else
+    AT_ERROR("ball_query is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("ball_query is not implemented on CPU");
+  }
 }
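Editor's note: every file in this commit repeats the same substitution, the generic device-registry dispatch (`DISPATCH_DEVICE_IMPL`) is replaced by a hand-rolled guard that routes CUDA tensors to a kernel launcher and rejects everything else. Below is a minimal self-contained sketch of that guard pattern, using a hypothetical `my_op` and a simplified stand-in for the `CHECK_CUDA_INPUT` macro that mmcv's `pytorch_cpp_helper.hpp` actually provides:

```cpp
// Sketch of the dispatch pattern used throughout this commit (hypothetical op).
// CHECK_CUDA_INPUT below is a simplified stand-in for the real macro
// (device check + contiguity check) defined in pytorch_cpp_helper.hpp.
#include <torch/extension.h>
using at::Tensor;

#define CHECK_CUDA_INPUT(x)                                       \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor"); \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")

#ifdef MMCV_WITH_CUDA
// Defined in a .cu translation unit; only declared here.
void MyOpForwardCUDAKernelLauncher(Tensor input, Tensor output);

void my_op_forward_cuda(Tensor input, Tensor output) {
  MyOpForwardCUDAKernelLauncher(input, output);
}
#endif

void my_op_forward(Tensor input, Tensor output) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(output);
    my_op_forward_cuda(input, output);
#else
    AT_ERROR("my_op is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("my_op is not implemented on CPU");
  }
}
```

The same shape recurs below for bbox_overlaps, border_align, carafe, carafe_naive and correlation, differing only in argument lists and error strings.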
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

-void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
+#ifdef MMCV_WITH_CUDA
+void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
+                                    Tensor ious, const int mode,
+                                    const bool aligned, const int offset);
+
+void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                         const int mode, const bool aligned, const int offset) {
-  DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,
-                       aligned, offset);
+  BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
 }
+#endif

 void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                    const int mode, const bool aligned, const int offset) {
-  bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
+  if (bboxes1.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(bboxes1);
+    CHECK_CUDA_INPUT(bboxes2);
+    CHECK_CUDA_INPUT(ious);
+    bbox_overlaps_cuda(bboxes1, bboxes2, ious, mode, aligned, offset);
+#else
+    AT_ERROR("bbox_overlaps is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("bbox_overlaps is not implemented on CPU");
+  }
 }
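For reference (semantics taken from mmcv's bbox_overlaps documentation, not stated in this diff): `mode` 0 computes IoU and `mode` 1 IoF, with `offset` (0 or 1) added to widths and heights when computing areas. In symbols, for boxes A (from `bboxes1`) and B (from `bboxes2`):

```latex
\mathrm{IoU}(A,B)=\frac{|A\cap B|}{|A|+|B|-|A\cap B|},
\qquad
\mathrm{IoF}(A,B)=\frac{|A\cap B|}{|A|}
```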
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

-void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
+#ifdef MMCV_WITH_CUDA
+void BorderAlignForwardCUDAKernelLauncher(const Tensor &input,
+                                          const Tensor &boxes, Tensor output,
+                                          Tensor argmax_idx,
+                                          const int pool_size);
+
+void BorderAlignBackwardCUDAKernelLauncher(const Tensor &grad_output,
+                                           const Tensor &boxes,
+                                           const Tensor &argmax_idx,
+                                           Tensor grad_input,
+                                           const int pool_size);
+
+void border_align_forward_cuda(const Tensor &input, const Tensor &boxes,
                                Tensor output, Tensor argmax_idx,
                                const int pool_size) {
-  DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,
-                       argmax_idx, pool_size);
+  BorderAlignForwardCUDAKernelLauncher(input, boxes, output, argmax_idx,
+                                       pool_size);
 }

-void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
+void border_align_backward_cuda(const Tensor &grad_output, const Tensor &boxes,
                                 const Tensor &argmax_idx, Tensor grad_input,
                                 const int pool_size) {
-  DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,
-                       argmax_idx, grad_input, pool_size);
+  BorderAlignBackwardCUDAKernelLauncher(grad_output, boxes, argmax_idx,
+                                        grad_input, pool_size);
 }
+#endif

 void border_align_forward(const Tensor &input, const Tensor &boxes,
                           Tensor output, Tensor argmax_idx,
                           const int pool_size) {
-  border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);
+  if (input.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(input);
+    CHECK_CUDA_INPUT(boxes);
+    CHECK_CUDA_INPUT(output);
+    CHECK_CUDA_INPUT(argmax_idx);
+    border_align_forward_cuda(input, boxes, output, argmax_idx, pool_size);
+#else
+    AT_ERROR("BorderAlign is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("BorderAlign is not implemented on CPU");
+  }
 }

 void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
                            const Tensor &argmax_idx, Tensor grad_input,
                            const int pool_size) {
-  border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,
-                             pool_size);
+  if (grad_output.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(grad_output);
+    CHECK_CUDA_INPUT(boxes);
+    CHECK_CUDA_INPUT(argmax_idx);
+    CHECK_CUDA_INPUT(grad_input);
+    border_align_backward_cuda(grad_output, boxes, argmax_idx, grad_input,
+                               pool_size);
+#else
+    AT_ERROR("BorderAlign is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("BorderAlign is not implemented on CPU");
+  }
 }
@@ -7,7 +7,6 @@
 using namespace parrots;

-#ifdef MMCV_WITH_CUDA
 void border_align_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
@@ -50,4 +49,3 @@ PARROTS_EXTENSION_REGISTER(border_align_backward)
     .output(1)
     .apply(border_align_backward_cuda_parrots)
     .done();
-#endif
@@ -2,18 +2,28 @@
 // modified from
 // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

-void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
-                          const int mode_flag, const bool aligned) {
-  DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,
-                       aligned);
-}
+void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
+                         const int mode_flag, const bool aligned);
+
+#ifdef MMCV_WITH_CUDA
+void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious,
+                          const int mode_flag, const bool aligned);
+#endif

 // Interface for Python
 // inline is needed to prevent multiple function definitions when this header is
 // included by different cpps
 void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                      const int mode_flag, const bool aligned) {
-  box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);
+  assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
+  if (boxes1.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    box_iou_rotated_cuda(boxes1, boxes2, ious, mode_flag, aligned);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  } else {
+    box_iou_rotated_cpu(boxes1, boxes2, ious, mode_flag, aligned);
+  }
 }
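A small caller-side sketch of this dispatcher. The (N, 5) box layout in (cx, cy, w, h, angle) format matches what mmcv's rotated-IoU kernels consume, but the shapes and the helper itself are illustrative, not part of this diff:

```cpp
// Illustrative caller for box_iou_rotated (hypothetical helper).
#include <torch/extension.h>
using at::Tensor;

void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                     const int mode_flag, const bool aligned);

Tensor pairwise_rotated_iou(const Tensor& boxes1, const Tensor& boxes2) {
  // boxes1: (N, 5), boxes2: (M, 5) float tensors in (cx, cy, w, h, angle).
  // Non-aligned mode fills a flat buffer of N * M IoU values.
  auto ious = at::zeros({boxes1.size(0) * boxes2.size(0)}, boxes1.options());
  box_iou_rotated(boxes1, boxes2, ious, /*mode_flag=*/0, /*aligned=*/false);
  return ious.view({boxes1.size(0), boxes2.size(0)});
}
```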
 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
 // modified from
 // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
 #include "box_iou_rotated_utils.hpp"
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

 template <typename T>
-void box_iou_quadri_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
-                               Tensor ious, const int mode_flag,
-                               const bool aligned) {
+void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
+                                Tensor ious, const int mode_flag,
+                                const bool aligned) {
   int output_size = ious.numel();
   auto num_boxes1 = boxes1.size(0);
   auto num_boxes2 = boxes2.size(0);

   if (aligned) {
     for (int i = 0; i < output_size; i++) {
-      ious[i] = single_box_iou_quadri<T>(boxes1[i].data_ptr<T>(),
-                                         boxes2[i].data_ptr<T>(), mode_flag);
+      ious[i] = single_box_iou_rotated<T>(boxes1[i].data_ptr<T>(),
+                                          boxes2[i].data_ptr<T>(), mode_flag);
     }
   } else {
     for (int i = 0; i < num_boxes1; i++) {
       for (int j = 0; j < num_boxes2; j++) {
-        ious[i * num_boxes2 + j] = single_box_iou_quadri<T>(
+        ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
             boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);
       }
     }
   }
 }

-void box_iou_quadri_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
-                        const int mode_flag, const bool aligned) {
-  box_iou_quadri_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
+void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
+                         const int mode_flag, const bool aligned) {
+  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
 }
-
-void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
-                         const int mode_flag, const bool aligned);
-REGISTER_DEVICE_IMPL(box_iou_quadri_impl, CPU, box_iou_quadri_cpu);
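The two branches of the kernel above encode the op's two layouts; writing N = `num_boxes1` and M = `num_boxes2` (my summary, not spelled out in the diff):

```latex
\text{aligned } (N = M):\quad \mathrm{ious}_i = \mathrm{IoU}\big(b^{(1)}_i,\ b^{(2)}_i\big),
\qquad
\text{pairwise}:\quad \mathrm{ious}_{i M + j} = \mathrm{IoU}\big(b^{(1)}_i,\ b^{(2)}_j\big)
```

so `ious` is a flat buffer of N entries in aligned mode and N·M entries otherwise.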
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

-void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
+#ifdef MMCV_WITH_CUDA
+void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks,
+                                     Tensor rfeatures, Tensor routput,
+                                     Tensor rmasks, Tensor output,
+                                     const int kernel_size,
+                                     const int group_size,
+                                     const int scale_factor);
+
+void CARAFEBackwardCUDAKernelLauncher(
+    const Tensor top_grad, const Tensor rfeatures, const Tensor masks,
+    Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad,
+    Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad,
+    const int kernel_size, const int group_size, const int scale_factor);
+
+void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures,
                          Tensor routput, Tensor rmasks, Tensor output,
                          int kernel_size, int group_size, int scale_factor) {
-  DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,
-                       rmasks, output, kernel_size, group_size, scale_factor);
+  CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
+                                  output, kernel_size, group_size,
+                                  scale_factor);
 }

-void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
+void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks,
                           Tensor rtop_grad, Tensor rbottom_grad_hs,
                           Tensor rbottom_grad, Tensor rmask_grad,
                           Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                           int group_size, int scale_factor) {
-  DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,
-                       rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
-                       bottom_grad, mask_grad, kernel_size, group_size,
-                       scale_factor);
+  CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
+                                   rbottom_grad_hs, rbottom_grad, rmask_grad,
+                                   bottom_grad, mask_grad, kernel_size,
+                                   group_size, scale_factor);
 }
+#endif

 void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
                     Tensor routput, Tensor rmasks, Tensor output,
                     int kernel_size, int group_size, int scale_factor) {
-  carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,
-                      kernel_size, group_size, scale_factor);
+  if (features.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(features);
+    CHECK_CUDA_INPUT(masks);
+    CHECK_CUDA_INPUT(rfeatures);
+    CHECK_CUDA_INPUT(routput);
+    CHECK_CUDA_INPUT(rmasks);
+    CHECK_CUDA_INPUT(output);
+    carafe_forward_cuda(features, masks, rfeatures, routput, rmasks, output,
+                        kernel_size, group_size, scale_factor);
+#else
+    AT_ERROR("Carafe is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("Carafe is not implemented on CPU");
+  }
 }

 void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
@@ -32,7 +61,24 @@ void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
                      Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
                      Tensor mask_grad, int kernel_size, int group_size,
                      int scale_factor) {
-  carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
-                       rbottom_grad, rmask_grad, bottom_grad, mask_grad,
-                       kernel_size, group_size, scale_factor);
+  if (top_grad.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(top_grad);
+    CHECK_CUDA_INPUT(rfeatures);
+    CHECK_CUDA_INPUT(masks);
+    CHECK_CUDA_INPUT(rtop_grad);
+    CHECK_CUDA_INPUT(rbottom_grad_hs);
+    CHECK_CUDA_INPUT(rbottom_grad);
+    CHECK_CUDA_INPUT(rmask_grad);
+    CHECK_CUDA_INPUT(bottom_grad);
+    CHECK_CUDA_INPUT(mask_grad);
+    carafe_backward_cuda(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
+                         rbottom_grad, rmask_grad, bottom_grad, mask_grad,
+                         kernel_size, group_size, scale_factor);
+#else
+    AT_ERROR("Carafe is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("Carafe is not implemented on CPU");
+  }
 }
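For context on what the two launchers compute: CARAFE (Content-Aware ReAssembly of FEatures) upsamples a feature map X by `scale_factor` σ, producing each output pixel as a predicted, per-location mask M (normalized over the k×k window Ω_k, k = `kernel_size`) applied to the corresponding source neighbourhood. Roughly, in my notation, per group of `group_size` channels:

```latex
\mathrm{output}(c,\,i,\,j)\;=\;\sum_{(p,\,q)\,\in\,\Omega_{k}}
M_{i,j}(p,\,q)\;X\big(c,\ \lfloor i/\sigma\rfloor + p,\ \lfloor j/\sigma\rfloor + q\big)
```

The r-prefixed tensors (rfeatures, routput, rmasks, ...) are rearranged scratch buffers the CUDA kernels use internally.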
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

-void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
+#ifdef MMCV_WITH_CUDA
+void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features,
+                                          const Tensor masks, Tensor output,
+                                          const int kernel_size,
+                                          const int group_size,
+                                          const int scale_factor);
+
+void CARAFENAIVEBackwardCUDAKernelLauncher(
+    const Tensor top_grad, const Tensor features, const Tensor masks,
+    Tensor bottom_grad, Tensor mask_grad, const int kernel_size,
+    const int group_size, const int scale_factor);
+
+void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output,
                                int kernel_size, int group_size,
                                int scale_factor) {
-  DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,
-                       kernel_size, group_size, scale_factor);
+  CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
+                                       group_size, scale_factor);
 }

-void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
+void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks,
                                 Tensor bottom_grad, Tensor mask_grad,
                                 int kernel_size, int group_size,
                                 int scale_factor) {
-  DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,
-                       bottom_grad, mask_grad, kernel_size, group_size,
-                       scale_factor);
+  CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
+                                        mask_grad, kernel_size, group_size,
+                                        scale_factor);
 }
+#endif

 void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
                           int kernel_size, int group_size, int scale_factor) {
-  carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,
-                            scale_factor);
+  if (features.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(features);
+    CHECK_CUDA_INPUT(masks);
+    CHECK_CUDA_INPUT(output);
+    carafe_naive_forward_cuda(features, masks, output, kernel_size, group_size,
+                              scale_factor);
+#else
+    AT_ERROR("CarafeNaive is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("CarafeNaive is not implemented on CPU");
+  }
 }

 void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
                            Tensor bottom_grad, Tensor mask_grad,
                            int kernel_size, int group_size, int scale_factor) {
-  carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,
-                             kernel_size, group_size, scale_factor);
+  if (top_grad.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(top_grad);
+    CHECK_CUDA_INPUT(features);
+    CHECK_CUDA_INPUT(masks);
+    CHECK_CUDA_INPUT(bottom_grad);
+    CHECK_CUDA_INPUT(mask_grad);
+    carafe_naive_backward_cuda(top_grad, features, masks, bottom_grad,
+                               mask_grad, kernel_size, group_size,
+                               scale_factor);
+#else
+    AT_ERROR("CarafeNaive is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("CarafeNaive is not implemented on CPU");
+  }
 }
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx2) {
DISPATCH_DEVICE_IMPL(chamfer_distance_forward_impl, xyz1, xyz2, dist1, dist2,
idx1, idx2);
}
void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
Tensor idx1, Tensor idx2, Tensor graddist1,
Tensor graddist2, Tensor gradxyz1,
Tensor gradxyz2) {
DISPATCH_DEVICE_IMPL(chamfer_distance_backward_impl, xyz1, xyz2, idx1, idx2,
graddist1, graddist2, gradxyz1, gradxyz2);
}
void chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx2) {
chamfer_distance_forward_impl(xyz1, xyz2, dist1, dist2, idx1, idx2);
}
void chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,
Tensor idx1, Tensor idx2, Tensor graddist1,
Tensor graddist2, Tensor gradxyz1,
Tensor gradxyz2) {
chamfer_distance_backward_impl(xyz1, xyz2, idx1, idx2, graddist1, graddist2,
gradxyz1, gradxyz2);
}
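As a reminder of the quantities being dispatched here (the standard Chamfer definition, not spelled out in this diff): for point sets `xyz1` and `xyz2`, `dist1`/`idx1` hold the squared distance to, and the index of, each point's nearest neighbour in the other set:

```latex
\mathrm{dist1}_i=\min_{j}\big\lVert \mathrm{xyz1}_i-\mathrm{xyz2}_j\big\rVert_2^2,
\qquad
\mathrm{idx1}_i=\operatorname*{arg\,min}_{j}\big\lVert \mathrm{xyz1}_i-\mathrm{xyz2}_j\big\rVert_2^2
```

and symmetrically for `dist2`/`idx2` with the roles of the two sets swapped.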
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "chamfer_distance_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void chamfer_distance_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto xyz1 = buildATensor(ctx, ins[0]);
auto xyz2 = buildATensor(ctx, ins[1]);
auto dist1 = buildATensor(ctx, outs[0]);
auto dist2 = buildATensor(ctx, outs[1]);
auto idx1 = buildATensor(ctx, outs[2]);
auto idx2 = buildATensor(ctx, outs[3]);
chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1, idx2);
}
void chamfer_distance_backward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto xyz1 = buildATensor(ctx, ins[0]);
auto xyz2 = buildATensor(ctx, ins[1]);
auto idx1 = buildATensor(ctx, ins[2]);
auto idx2 = buildATensor(ctx, ins[3]);
auto graddist1 = buildATensor(ctx, ins[4]);
auto graddist2 = buildATensor(ctx, ins[5]);
auto gradxyz1 = buildATensor(ctx, outs[0]);
auto gradxyz2 = buildATensor(ctx, outs[1]);
chamfer_distance_backward(xyz1, xyz2, idx1, idx2, graddist1, graddist2,
gradxyz1, gradxyz2);
}
PARROTS_EXTENSION_REGISTER(chamfer_distance_forward)
.input(2)
.output(4)
.apply(chamfer_distance_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(chamfer_distance_backward)
.input(6)
.output(2)
.apply(chamfer_distance_backward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ACTIVE_CHAMFER_DISTANCE_PYTORCH_H
#define ACTIVE_CHAMFER_DISTANCE_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void chamfer_distance_forward(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx);
void chamfer_distance_backward(const Tensor xyz1, const Tensor xyz2,
Tensor idx1, Tensor idx2, Tensor graddist1,
Tensor graddist2, Tensor gradxyz1,
Tensor gradxyz2);
#endif // ACTIVE_CHAMFER_DISTANCE_PYTORCH_H
@@ -102,6 +102,7 @@ std::vector<std::vector<int>> contour_expand(Tensor kernel_mask,
   IntArrayRef data_shape = kernel_mask.sizes();
   auto data_label_map = internal_kernel_label.data_ptr<int32_t>();
   IntArrayRef label_map_shape = internal_kernel_label.sizes();
+  vector<vector<int>> text_line;
   kernel_dilate(ptr_data, data_shape, data_label_map, kernel_num,
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/tree/main/mmdet/ops/iou/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
Tensor ious) {
DISPATCH_DEVICE_IMPL(convex_iou_impl, pointsets, polygons, ious);
}
void convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious) {
convex_iou_impl(pointsets, polygons, ious);
}
void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
Tensor output) {
DISPATCH_DEVICE_IMPL(convex_giou_impl, pointsets, polygons, output);
}
void convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output) {
convex_giou_impl(pointsets, polygons, output);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "convex_iou_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void convex_iou_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto pointsets = buildATensor(ctx, ins[0]);
auto polygons = buildATensor(ctx, ins[1]);
auto ious = buildATensor(ctx, outs[0]);
convex_iou(pointsets, polygons, ious);
}
void convex_giou_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto pointsets = buildATensor(ctx, ins[0]);
auto polygons = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
convex_giou(pointsets, polygons, output);
}
PARROTS_EXTENSION_REGISTER(convex_iou)
.input(2)
.output(1)
.apply(convex_iou_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(convex_giou)
.input(2)
.output(1)
.apply(convex_giou_forward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONVEX_IOU_PYTORCH_H
#define CONVEX_IOU_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void convex_iou(const Tensor pointsets, const Tensor polygons, Tensor ious);
void convex_giou(const Tensor pointsets, const Tensor polygons, Tensor output);
#endif  // CONVEX_IOU_PYTORCH_H
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "pytorch_cpp_helper.hpp"
Tensor bottom_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, ind, height);
Tensor cur_temp = at::slice(output, 2, ind, height).clone();
Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(2, 0);
auto grad_output_temp = grad_output.select(2, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < height - 1; ++ind) {
input_temp = input.select(2, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
Tensor left_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, 0, width - ind);
Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
Tensor next_temp = at::slice(output, 3, ind, width).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor left_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, ind, width);
Tensor cur_temp = at::slice(output, 3, ind, width).clone();
Tensor next_temp = at::slice(output, 3, 0, width - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
Tensor output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(3, 0);
auto grad_output_temp = grad_output.select(3, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < width - 1; ++ind) {
input_temp = input.select(3, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor top_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, 0, height - ind);
Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone();
Tensor next_temp = at::slice(output, 2, ind, height).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, height - 1);
max_val.copy_(input_temp);
max_ind.fill_(height - 1);
auto output_temp = output.select(2, height - 1);
auto grad_output_temp = grad_output.select(2, height - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < height; ++ind) {
input_temp = input.select(2, height - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, height - ind - 1);
grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
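The `ind <<= 1` loops in the forward passes above compute a directional running max in O(log n) slice-wise steps instead of O(n): after the pass with shift s, each position holds the max over a window of length 2s in the pooling direction, so ceil(log2(n)) passes cover the whole row or column. A standalone sketch of the same doubling trick on a 1-D vector (illustrative only, not part of the diff):

```cpp
// Prefix max via doubling, mirroring bottom_pool_forward's slice loop:
// each pass does out[i] = max(out[i], out[i - s]) for all i >= s, which is
// what at::max_out over the shifted slices computes for every column at once.
#include <algorithm>
#include <vector>

std::vector<float> prefix_max(std::vector<float> out) {
  const int n = static_cast<int>(out.size());
  for (int s = 1; s < n; s <<= 1) {
    // Iterate downward so out[i - s] is still the pre-pass value,
    // matching the .clone() of the slices in the tensor version.
    for (int i = n - 1; i >= s; --i) out[i] = std::max(out[i], out[i - s]);
  }
  return out;  // out[i] == max(in[0..i])
}
```

With n = height this is exactly `output[i] = max(input[0..i])` down each column, the bottom-pool response; top, left and right pools run the same recurrence in the other three directions. The backward passes instead walk the pooling direction once, tracking a running max and its argmax per column, and `scatter_add_` each gradient row to the current argmax.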
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "corner_pool_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void bottom_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = bottom_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void bottom_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = bottom_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void left_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = left_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void left_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = left_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void right_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = right_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void right_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = right_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void top_pool_forward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = top_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void top_pool_backward_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = top_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
#endif
void bottom_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = bottom_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void bottom_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = bottom_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void left_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = left_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void left_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = left_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void right_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = right_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void right_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = right_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
void top_pool_forward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input;
input = buildATensor(ctx, ins[0]);
auto out = top_pool_forward(input);
updateDArray(ctx, out, outs[0]);
}
void top_pool_backward_parrots_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor input, grad_output;
input = buildATensor(ctx, ins[0]);
grad_output = buildATensor(ctx, ins[1]);
auto out = top_pool_backward(input, grad_output);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(bottom_pool_forward_parrots)
#endif
.apply(bottom_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(bottom_pool_backward_parrots)
#endif
.apply(bottom_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(top_pool_forward_parrots)
#endif
.apply(top_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(top_pool_backward_parrots)
#endif
.apply(top_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(left_pool_forward_parrots)
#endif
.apply(left_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(left_pool_backward_parrots)
#endif
.apply(left_pool_backward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_forward)
.input(1)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(right_pool_forward_parrots)
#endif
.apply(right_pool_forward_parrots_cpu)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_backward)
.input(2)
.output(1)
#ifdef MMCV_WITH_CUDA
.apply(right_pool_backward_parrots)
#endif
.apply(right_pool_backward_parrots_cpu)
.done();
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CORNER_POOL_PYTORCH_H
#define CORNER_POOL_PYTORCH_H
#include <torch/extension.h>
at::Tensor bottom_pool_forward(at::Tensor input);
at::Tensor bottom_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor left_pool_forward(at::Tensor input);
at::Tensor left_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor right_pool_forward(at::Tensor input);
at::Tensor right_pool_backward(at::Tensor input, at::Tensor grad_output);
at::Tensor top_pool_forward(at::Tensor input);
at::Tensor top_pool_backward(at::Tensor input, at::Tensor grad_output);
#endif // CORNER_POOL_PYTORCH_H
@@ -2,37 +2,65 @@
 #include <iostream>

 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"

-void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
+#ifdef MMCV_WITH_CUDA
+void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,
+                                          Tensor output, int kH, int kW,
+                                          int patchH, int patchW, int padH,
+                                          int padW, int dilationH,
+                                          int dilationW, int dilation_patchH,
+                                          int dilation_patchW, int dH, int dW);
+
+void CorrelationBackwardCUDAKernelLauncher(Tensor grad_output, Tensor input1,
+                                           Tensor input2, Tensor grad_input1,
+                                           Tensor grad_input2, int kH, int kW,
+                                           int patchH, int patchW, int padH,
+                                           int padW, int dilationH,
+                                           int dilationW, int dilation_patchH,
+                                           int dilation_patchW, int dH, int dW);
+
+void correlation_cuda_forward(Tensor input1, Tensor input2, Tensor output,
                               int kH, int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
-  DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,
-                       patchH, patchW, padH, padW, dilationH, dilationW,
-                       dilation_patchH, dilation_patchW, dH, dW);
+  CorrelationForwardCUDAKernelLauncher(
+      input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH,
+      dilationW, dilation_patchH, dilation_patchW, dH, dW);
 }

-void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
+void correlation_cuda_backward(Tensor grad_output, Tensor input1, Tensor input2,
                                Tensor grad_input1, Tensor grad_input2, int kH,
                                int kW, int patchH, int patchW, int padH,
                                int padW, int dilationH, int dilationW,
                                int dilation_patchH, int dilation_patchW, int dH,
                                int dW) {
-  DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,
-                       grad_input1, grad_input2, kH, kW, patchH, patchW, padH,
-                       padW, dilationH, dilationW, dilation_patchH,
-                       dilation_patchW, dH, dW);
+  CorrelationBackwardCUDAKernelLauncher(
+      grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH,
+      patchW, padH, padW, dilationH, dilationW, dilation_patchH,
+      dilation_patchW, dH, dW);
 }
+#endif

 void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
                          int kW, int patchH, int patchW, int padH, int padW,
                          int dilationH, int dilationW, int dilation_patchH,
                          int dilation_patchW, int dH, int dW) {
-  correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,
-                           padW, dilationH, dilationW, dilation_patchH,
-                           dilation_patchW, dH, dW);
+  if (input1.device().is_cuda() && input2.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(input1);
+    CHECK_CUDA_INPUT(input2);
+    correlation_cuda_forward(input1, input2, output, kH, kW, patchH, patchW,
+                             padH, padW, dilationH, dilationW, dilation_patchH,
+                             dilation_patchW, dH, dW);
+#else
+    AT_ERROR("Correlation is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("Correlation is not implemented on CPU");
+  }
 }

 void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
@@ -40,8 +68,20 @@ void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
                           int kW, int patchH, int patchW, int padH, int padW,
                           int dilationH, int dilationW, int dilation_patchH,
                           int dilation_patchW, int dH, int dW) {
-  correlation_backward_impl(grad_output, input1, input2, grad_input1,
-                            grad_input2, kH, kW, patchH, patchW, padH, padW,
-                            dilationH, dilationW, dilation_patchH,
-                            dilation_patchW, dH, dW);
+  if (input1.device().is_cuda() && input2.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CUDA_INPUT(grad_output);
+    CHECK_CUDA_INPUT(input1);
+    CHECK_CUDA_INPUT(input2);
+    correlation_cuda_backward(grad_output, input1, input2, grad_input1,
+                              grad_input2, kH, kW, patchH, patchW, padH, padW,
+                              dilationH, dilationW, dilation_patchH,
+                              dilation_patchW, dH, dW);
+#else
+    AT_ERROR("Correlation is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("Correlation is not implemented on CPU");
+  }
 }
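For orientation (my assumption about the op's geometry, not stated in this diff): the forward launcher fills a cost volume comparing each `input1` location against a patchH x patchW displacement neighbourhood in `input2`, with the output spatial extent following the usual convolution arithmetic over kH/kW, padding, dilation and stride. A sketch of the shape computation under that assumption:

```cpp
// Assumed correlation cost-volume geometry:
// output is (batch, patchH, patchW, outH, outW), with outH/outW from
// standard convolution output-size arithmetic.
#include <cstdio>

int conv_out(int size, int k, int pad, int dilation, int stride) {
  return (size + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
}

int main() {
  int H = 64, W = 64;                       // input spatial size
  int kH = 1, kW = 1, padH = 0, padW = 0;   // kernel and padding
  int dilationH = 1, dilationW = 1;         // kernel dilation
  int dH = 1, dW = 1;                       // stride
  int patchH = 9, patchW = 9;               // displacement window
  std::printf("output: (B, %d, %d, %d, %d)\n", patchH, patchW,
              conv_out(H, kH, padH, dilationH, dH),
              conv_out(W, kW, padW, dilationW, dW));
  return 0;
}
```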