Unverified commit a4dc2a72, authored by pc, committed by GitHub

support device dispatch in parrots (#1588)

parent 0bcbeadb
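
The change below removes the per-backend wrappers that were guarded by #ifdef MMCV_WITH_CUDA and replaces them with generic *_impl functions that forward through DISPATCH_DEVICE_IMPL from pytorch_device_registry.hpp, so parrots and other backends can plug in their own kernels. As orientation only, a backend is expected to register a function with the same signature under the matching dispatch key; the sketch below assumes a REGISTER_DEVICE_IMPL(key, device, func) macro provided by the registry header, which is not shown in this diff.

// Hypothetical sketch, not part of this commit: how a CUDA backend could hook
// its kernel into the registry so that
// DISPATCH_DEVICE_IMPL(ball_query_forward_impl, ...) reaches it for CUDA
// tensors. REGISTER_DEVICE_IMPL is assumed to be provided by
// pytorch_device_registry.hpp.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
                                        float max_radius, int nsample,
                                        const Tensor new_xyz, const Tensor xyz,
                                        Tensor idx);

// Thin wrapper with the same signature as ball_query_forward_impl.
void ball_query_forward_cuda(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample,
                                     new_xyz, xyz, idx);
}

// Associate the wrapper with the ball_query_forward_impl dispatch key for the
// CUDA device type.
REGISTER_DEVICE_IMPL(ball_query_forward_impl, CUDA, ball_query_forward_cuda);

With a registration like this in place, the dispatcher files below stay device-agnostic, and the old runtime is_cuda() branching and AT_ERROR fallbacks become unnecessary.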
// Modified from
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
                                     int aggregate, const Tensor& points,
                                     const Tensor& centers,
                                     const Tensor& scores,
                                     const Tensor& knn_idx, Tensor& output) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
                       aggregate, points, centers, scores, knn_idx, output);
}

void assign_score_withk_backward_impl(
    int B, int N0, int N1, int M, int K, int O, int aggregate,
    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
    Tensor& grad_centers, Tensor& grad_scores) {
  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
                       aggregate, grad_out, points, centers, scores, knn_idx,
                       grad_points, grad_centers, grad_scores);
}

void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
                                const Tensor& scores, const Tensor& knn_idx,
                                Tensor& output, int B, int N0, int N1, int M,
                                int K, int O, int aggregate) {
  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
                                  centers, scores, knn_idx, output);
}

void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
@@ -62,24 +36,7 @@ void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
                                 Tensor& grad_centers, Tensor& grad_scores,
                                 int B, int N0, int N1, int M, int K, int O,
                                 int aggregate) {
  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
                                   points, centers, scores, knn_idx,
                                   grad_points, grad_centers, grad_scores);
}
@@ -2,36 +2,19 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void ball_query_forward_impl(int b, int n, int m, float min_radius,
                             float max_radius, int nsample,
                             const Tensor new_xyz, const Tensor xyz,
                             Tensor idx) {
  DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius,
                       nsample, new_xyz, xyz, idx);
}

void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor,
                        Tensor idx_tensor, int b, int n, int m,
                        float min_radius, float max_radius, int nsample) {
  ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample,
                          new_xyz_tensor, xyz_tensor, idx_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset) {
  DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode,
                       aligned, offset);
}

void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                   const int mode, const bool aligned, const int offset) {
  bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void border_align_forward_impl(const Tensor &input, const Tensor &boxes,
                               Tensor output, Tensor argmax_idx,
                               const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output,
                       argmax_idx, pool_size);
}

void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
                                const Tensor &argmax_idx, Tensor grad_input,
                                const int pool_size) {
  DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes,
                       argmax_idx, grad_input, pool_size);
}

void border_align_forward(const Tensor &input, const Tensor &boxes,
                          Tensor output, Tensor argmax_idx,
                          const int pool_size) {
  border_align_forward_impl(input, boxes, output, argmax_idx, pool_size);
}

void border_align_backward(const Tensor &grad_output, const Tensor &boxes,
                           const Tensor &argmax_idx, Tensor grad_input,
                           const int pool_size) {
  border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input,
                             pool_size);
}
@@ -2,28 +2,18 @@
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
  DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag,
                       aligned);
}

// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                     const int mode_flag, const bool aligned) {
  box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned);
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename T>
void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
Tensor ious, const int mode_flag,
const bool aligned) {
int output_size = ious.numel();
auto num_boxes1 = boxes1.size(0);
auto num_boxes2 = boxes2.size(0);
if (aligned) {
for (int i = 0; i < output_size; i++) {
ious[i] = single_box_iou_rotated<T>(boxes1[i].data_ptr<T>(),
boxes2[i].data_ptr<T>(), mode_flag);
}
} else {
for (int i = 0; i < num_boxes1; i++) {
for (int j = 0; j < num_boxes2; j++) {
ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>(), mode_flag);
}
}
}
}
void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
const int mode_flag, const bool aligned) {
box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious, mode_flag, aligned);
}
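
For the dispatcher above to reach this CPU kernel through box_iou_rotated_impl, the CPU entry point presumably has to be registered with the same device registry. This is only a hedged sketch of that wiring, assuming the same REGISTER_DEVICE_IMPL macro as before; the actual registration is not shown in this excerpt.

// Hypothetical registration, not shown in this excerpt: expose the CPU kernel
// above through the device registry so that
// DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, ...) can resolve it for CPU
// tensors.
#include "pytorch_device_registry.hpp"

void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                         const int mode_flag, const bool aligned);

REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CPU, box_iou_rotated_cpu);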
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures,
                         Tensor routput, Tensor rmasks, Tensor output,
                         int kernel_size, int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput,
                       rmasks, output, kernel_size, group_size, scale_factor);
}

void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks,
                          Tensor rtop_grad, Tensor rbottom_grad_hs,
                          Tensor rbottom_grad, Tensor rmask_grad,
                          Tensor bottom_grad, Tensor mask_grad, int kernel_size,
                          int group_size, int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks,
                       rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures,
                    Tensor routput, Tensor rmasks, Tensor output,
                    int kernel_size, int group_size, int scale_factor) {
  carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output,
                      kernel_size, group_size, scale_factor);
}

void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
@@ -61,24 +32,7 @@ void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks,
                     Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad,
                     Tensor mask_grad, int kernel_size, int group_size,
                     int scale_factor) {
  carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs,
                       rbottom_grad, rmask_grad, bottom_grad, mask_grad,
                       kernel_size, group_size, scale_factor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output,
                               int kernel_size, int group_size,
                               int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output,
                       kernel_size, group_size, scale_factor);
}

void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks,
                                Tensor bottom_grad, Tensor mask_grad,
                                int kernel_size, int group_size,
                                int scale_factor) {
  DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks,
                       bottom_grad, mask_grad, kernel_size, group_size,
                       scale_factor);
}

void carafe_naive_forward(Tensor features, Tensor masks, Tensor output,
                          int kernel_size, int group_size, int scale_factor) {
  carafe_naive_forward_impl(features, masks, output, kernel_size, group_size,
                            scale_factor);
}

void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks,
                           Tensor bottom_grad, Tensor mask_grad,
                           int kernel_size, int group_size, int scale_factor) {
  carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad,
                             kernel_size, group_size, scale_factor);
}
@@ -2,65 +2,37 @@
#include <iostream>

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output,
                              int kH, int kW, int patchH, int patchW, int padH,
                              int padW, int dilationH, int dilationW,
                              int dilation_patchH, int dilation_patchW, int dH,
                              int dW) {
  DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW,
                       patchH, patchW, padH, padW, dilationH, dilationW,
                       dilation_patchH, dilation_patchW, dH, dW);
}

void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2,
                               Tensor grad_input1, Tensor grad_input2, int kH,
                               int kW, int patchH, int patchW, int padH,
                               int padW, int dilationH, int dilationW,
                               int dilation_patchH, int dilation_patchW, int dH,
                               int dW) {
  DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2,
                       grad_input1, grad_input2, kH, kW, patchH, patchW, padH,
                       padW, dilationH, dilationW, dilation_patchH,
                       dilation_patchW, dH, dW);
}

void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH,
                         int kW, int patchH, int patchW, int padH, int padW,
                         int dilationH, int dilationW, int dilation_patchH,
                         int dilation_patchW, int dH, int dW) {
  correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH,
                           padW, dilationH, dilationW, dilation_patchH,
                           dilation_patchW, dH, dW);
}

void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
@@ -68,20 +40,8 @@ void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2,
                          int kW, int patchH, int patchW, int padH, int padW,
                          int dilationH, int dilationW, int dilation_patchH,
                          int dilation_patchW, int dH, int dW) {
  correlation_backward_impl(grad_output, input1, input2, grad_input1,
                            grad_input2, kH, kW, patchH, patchW, padH, padW,
                            dilationH, dilationW, dilation_patchH,
                            dilation_patchW, dH, dW);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col) {
  DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, data_col);
}

void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, grad_im);
}

void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
                       data_offset, channels, height, width, ksize_h, ksize_w,
                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                       parallel_imgs, deformable_group, grad_offset);
}

void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
                             at::Tensor *gradOutput, at::Tensor weight, int kH,
@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
       output_buffer.size(2), output_buffer.size(3)});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});

    deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
                                 inputHeight, inputWidth, kH, kW, padH, padW,
                                 dH, dW, dilationH, dilationW, im2col_step,
                                 deformable_group, gradOffset[elt]);

    deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group,
                           gradInput[elt]);

    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
                        deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    // divide into group
    gradOutputBuffer = gradOutputBuffer.view(
...
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
const int height, const int width, T h, T w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
T get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
const int height, const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
T get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename T>
void deformable_im2col_cpu_kernel(
const int n, const T *data_im, const T *data_offset, const int height,
const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
for (int index = 0; index < n; index++) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = deformable_im2col_bilinear_cpu(data_im_ptr, width, height,
width, h_im, w_im);
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
void deformable_col2im_cpu_kernel(
const int n, const T *data_col, const T *data_offset, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
for (int index = 0; index < n; index++) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;
}
}
}
}
}
template <typename T>
void deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T *grad_offset) {
for (int index = 0; index < n; index++) {
T val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
const T weight = get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_cpu", [&] {
deformable_im2col_cpu_kernel<scalar_t>(
num_kernels, data_im.data_ptr<scalar_t>(),
data_offset.data_ptr<scalar_t>(), height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col,
data_col.data_ptr<scalar_t>());
});
}
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, parallel_imgs,
deformable_group, height_col, width_col, grad_im_);
}));
}
void deformable_col2im_coord_cpu(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
}
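
The CPU kernels above sample the input at fractional locations with deformable_im2col_bilinear_cpu. The following standalone snippet is only an illustration of that bilinear weighting on a tiny 2x2 grid; it is independent of the code above and uses no MMCV headers.

// Standalone illustration of the bilinear weighting used by
// deformable_im2col_bilinear_cpu: sample a 2x2 image at (h, w) = (0.25, 0.75).
#include <cstdio>

int main() {
  // Row-major 2x2 image: v1 = img[0], v2 = img[1], v3 = img[2], v4 = img[3].
  const float img[4] = {1.f, 2.f, 3.f, 4.f};
  const float h = 0.25f, w = 0.75f;
  const float lh = h, lw = w;                // h_low = w_low = 0 for this point
  const float hh = 1.f - lh, hw = 1.f - lw;  // complementary weights
  // Same weight layout as the kernel: w1 = hh*hw, w2 = hh*lw, w3 = lh*hw, w4 = lh*lw.
  const float val =
      hh * hw * img[0] + hh * lw * img[1] + lh * hw * img[2] + lh * lw * img[3];
  std::printf("interpolated value = %f\n", val);  // prints 2.250000
  return 0;
}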
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
                       output, pooled_height, pooled_width, spatial_scale,
                       sampling_ratio, gamma);
}

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
                       offset, grad_input, grad_offset, pooled_height,
                       pooled_width, spatial_scale, sampling_ratio, gamma);
}

void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                             Tensor output, int pooled_height, int pooled_width,
                             float spatial_scale, int sampling_ratio,
                             float gamma) {
  deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
                               pooled_width, spatial_scale, sampling_ratio,
                               gamma);
}

void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                              Tensor grad_offset, int pooled_height,
                              int pooled_width, float spatial_scale,
                              int sampling_ratio, float gamma) {
  deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
                                grad_offset, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
                       grad_input, gamma, alpha);
}

void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
                       buff, grad_input, gamma, alpha);
}

void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha) {
  sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
                                   alpha);
}

void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha) {
  softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
                                   gamma, alpha);
}
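The *_impl functions above only route through the device registry; each backend still has to register a concrete implementation for the impl symbol. A minimal sketch of a CUDA-side binding, assuming the registry header also provides a REGISTER_DEVICE_IMPL macro, and reusing the kernel launcher that the old CUDA-only path declared:

// Illustrative only, not part of this diff.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Kernel launcher as declared by the previous CUDA-only code path.
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha);

void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha);
}

// Bind the CUDA function to the generic impl symbol so that
// DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, ...) can resolve it
// when the input tensors live on a CUDA device.
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, CUDA,
                     sigmoid_focal_loss_forward_cuda);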
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
                       temp_tensor, idx_tensor, b, n, m);
}

void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
                       points_tensor, temp_tensor, idx_tensor, b, n, m);
}

void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
                                     Tensor idx_tensor, int b, int n, int m) {
  furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
                                       b, n, m);
}

void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
                                               Tensor temp_tensor,
                                               Tensor idx_tensor, int b, int n,
                                               int m) {
  furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
                                                 idx_tensor, b, n, m);
}
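Note that the new impl functions take tensors rather than the raw pointers the removed *_cuda wrappers consumed, so pointer extraction moves into the device-specific binding. A hedged sketch of what that could look like for CUDA, again assuming the REGISTER_DEVICE_IMPL macro from the sketch above and the launcher the old code declared:

// Illustrative only, not part of this diff.
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
                                                    const float *dataset,
                                                    float *temp, int *idxs);

void furthest_point_sampling_forward_cuda(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  // The pointer unpacking previously done in furthest_point_sampling_forward.
  const float *dataset = points_tensor.data_ptr<float>();
  float *temp = temp_tensor.data_ptr<float>();
  int *idxs = idx_tensor.data_ptr<int>();
  FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);
}

REGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, CUDA,
                     furthest_point_sampling_forward_cuda);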
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp

/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.

NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================
1. Definitions
"Licensor" means any person or entity that distributes its Work.
"Software" means the original work of authorship made available under
this License.
"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.
The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.
Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.
2. License Grants
2.1 Copyright Grant. Subject to the terms and conditions of this
License, each Licensor grants to you a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform,
sublicense and distribute its Work and any resulting derivative
works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only
if (a) you do so under this License, (b) you include a complete
copy of this License with your distribution, and (c) you retain
without modification any copyright, patent, trademark, or
attribution notices that are present in the Work.
3.2 Derivative Works. You may specify that additional or different
terms apply to the use, reproduction, and distribution of your
derivative works of the Work ("Your Terms") only if (a) Your Terms
provide that the use limitation in Section 3.3 applies to your
derivative works, and (b) you identify the specific derivative
works that are subject to Your Terms. Notwithstanding Your Terms,
this License (including the redistribution requirements in Section
3.1) will continue to apply to the Work itself.
3.3 Use Limitation. The Work and any derivative works thereof only
may be used or intended for use non-commercially. Notwithstanding
the foregoing, NVIDIA and its affiliates may use the Work and any
derivative works commercially. As used herein, "non-commercially"
means for research or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim
against any Licensor (including any claim, cross-claim or
counterclaim in a lawsuit) to enforce any patents that you allege
are infringed by any Work, then your rights under this License from
such Licensor (including the grant in Section 2.1) will terminate
immediately.
3.5 Trademarks. This License does not grant any rights to use any
Licensor’s or its affiliates’ names, logos, or trademarks, except
as necessary to reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your
rights under this License (including the grant in Section 2.1) will
terminate immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.
=======================================================================
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}

torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}
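Unlike the void-returning ops above, this dispatch forwards a Tensor return value. A sketch, under the same REGISTER_DEVICE_IMPL assumption, of how a CUDA build could bind the operator that the old code declared behind MMCV_WITH_CUDA:

// Illustrative only, not part of this diff.
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,
                                      const torch::Tensor& bias,
                                      const torch::Tensor& refer, int act,
                                      int grad, float alpha, float scale);

// DISPATCH_DEVICE_IMPL then returns the CUDA operator's result directly when
// the inputs are CUDA tensors.
REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,
                     fused_bias_leakyrelu_op);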
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void gather_points_forward_impl(int b, int c, int n, int npoints,
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx, const Tensor points, const Tensor idx,
Tensor out) { Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out); DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
}; idx, out);
}
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void gather_points_backward_cuda(int b, int c, int n, int npoints, void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx, const Tensor grad_out, const Tensor idx,
Tensor grad_points) { Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx, DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
grad_points); idx, grad_points);
}; }
#endif
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor, void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n, Tensor out_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (points_tensor.device().is_cuda()) { gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA out_tensor);
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n, Tensor grad_points_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (grad_out_tensor.device().is_cuda()) { gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA grad_points_tensor);
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points, idx, out);
}

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
                       grad_out, idx, grad_points);
}

void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points_tensor, idx_tensor, out_tensor);
}

void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
#include <cuda_runtime_api.h>
int get_cudart_version() { return CUDART_VERSION; }
#endif
#endif
std::string get_compiling_cuda_version() {
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
std::ostringstream oss;
// copied from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
auto printCudaStyleVersion = [&](int v) {
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << "." << (v % 10);
}
};
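// For example, CUDART_VERSION 11030 is printed as "11.3" and 10010 as "10.1";
// a non-zero final digit would add a third component, e.g. 11031 -> "11.3.1".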
printCudaStyleVersion(get_cudart_version());
return oss.str();
#else
return std::string("rocm not available");
#endif
#else
return std::string("not available");
#endif
}
// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
{ ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif
#if defined(__clang_major__)
{
ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
<< __clang_patchlevel__;
}
#endif
#if defined(_MSC_VER)
{ ss << "MSVC " << _MSC_FULL_VER; }
#endif
return ss.str();
}
All Rights Reserved 2019-2020.
*/

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
                       num_b, boxes_b, ans_overlap);
}

void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                      const int num_b, const Tensor boxes_b,
                                      Tensor ans_iou) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
                       boxes_b, ans_iou);
}

void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
                            int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
                                   int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
                                       ans_overlap);
}

void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                 Tensor ans_iou) {
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
}

void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                       float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
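The host-side decode above (and its twin in the normal-NMS variant below) treats mask as boxes_num rows of col_blocks 64-bit words; as the loop assumes, a set bit j in word k of row i means box k * 64 + j overlaps box i above the threshold and is suppressed once box i is kept. A small worked example of the index arithmetic:

// Worked example (illustrative): with boxes_num = 100,
// col_blocks = DIVUP(100, 64) = 2, so mask is a (100 x 2) int64 tensor.
// Box i = 70 is tracked in word nblock = 70 / 64 = 1 at bit inblock = 70 % 64 = 6;
// it is kept only if that bit of remv_cpu[1] is still zero, and once kept its
// two mask words are ORed into remv_cpu so every box it suppresses is skipped.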
void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                              float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
                                nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  *keep_num_data = num_to_keep;
}