Unverified commit 230f9a3b authored by q.yao, committed by GitHub

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move cpu implement to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
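
The commits above replace the per-op "#ifdef MMCV_WITH_CUDA / AT_ERROR" branches with a device registry consulted through a DISPATCH_DEVICE_IMPL macro. As orientation only, here is a minimal, self-contained C++ sketch of that dispatch idea; every identifier in it (Device, DeviceRegistry, scale_op_*) is illustrative and is not taken from pytorch_device_registry.hpp.

// Minimal sketch of a device-dispatch registry, assuming made-up names.
#include <cstdio>
#include <utility>
#include <vector>

enum class Device { CPU, CUDA };

// Implementations are kept in a plain list of (device, function) pairs and
// scanned at dispatch time (cf. "list instead of map" in the commit message).
template <typename FnPtr>
struct DeviceRegistry {
  static DeviceRegistry& instance() {
    static DeviceRegistry inst;
    return inst;
  }
  void add(Device dev, FnPtr fn) { impls.push_back({dev, fn}); }
  FnPtr find(Device dev) const {
    for (const auto& entry : impls)
      if (entry.first == dev) return entry.second;
    return nullptr;  // no backend registered for this device
  }
  std::vector<std::pair<Device, FnPtr>> impls;
};

// Device-agnostic entry point: look up the backend for the requested device.
int scale_op_impl(Device dev, int x) {
  auto fn = DeviceRegistry<int (*)(Device, int)>::instance().find(dev);
  if (fn == nullptr) {
    std::fprintf(stderr, "scale_op is not implemented on this device\n");
    return 0;
  }
  return fn(dev, x);
}

// A CPU backend registers itself once, at static-initialization time.
int scale_op_cpu(Device /*dev*/, int x) { return x * 2; }
struct ScaleOpCpuRegistrar {
  ScaleOpCpuRegistrar() {
    DeviceRegistry<int (*)(Device, int)>::instance().add(Device::CPU,
                                                         scale_op_cpu);
  }
} scale_op_cpu_registrar;

int main() {
  std::printf("%d\n", scale_op_impl(Device::CPU, 21));   // prints 42
  std::printf("%d\n", scale_op_impl(Device::CUDA, 21));  // no backend -> 0
  return 0;
}

The refactored op wrappers below follow the same shape: a device-agnostic *_impl function forwards its arguments to the dispatcher, and the public op simply calls the *_impl entry point.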
@@ -100,25 +100,20 @@ THE POSSIBILITY OF SUCH DAMAGES.
 */

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}

torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}
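
The hunk above shows only the dispatch side; the matching registration lives in the per-backend sources, which are not part of this excerpt. A hedged sketch of what the CUDA hook-up presumably looks like (the REGISTER_DEVICE_IMPL macro and the fused_bias_leakyrelu_op_cuda symbol are assumptions, not shown in this diff):

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Assumed CUDA kernel wrapper, declared in the CUDA binding sources.
torch::Tensor fused_bias_leakyrelu_op_cuda(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale);

// Assumed registration macro from pytorch_device_registry.hpp: binds the CUDA
// wrapper to the fused_bias_leakyrelu_op_impl dispatch key used above.
REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,
                     fused_bias_leakyrelu_op_cuda);
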
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void gather_points_forward_impl(int b, int c, int n, int npoints,
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx, const Tensor points, const Tensor idx,
Tensor out) { Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out); DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
}; idx, out);
}
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void gather_points_backward_cuda(int b, int c, int n, int npoints, void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx, const Tensor grad_out, const Tensor idx,
Tensor grad_points) { Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx, DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
grad_points); idx, grad_points);
}; }
#endif
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor, void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n, Tensor out_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (points_tensor.device().is_cuda()) { gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor); out_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n, Tensor grad_points_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (grad_out_tensor.device().is_cuda()) { gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor); grad_points_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points, idx, out);
}

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
                       grad_out, idx, grad_points);
}

void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points_tensor, idx_tensor, out_tensor);
}

void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}
@@ -8,68 +8,35 @@ All Rights Reserved 2019-2020.
 */

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
                       num_b, boxes_b, ans_overlap);
}

void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                      const int num_b, const Tensor boxes_b,
                                      Tensor ans_iou) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
                       boxes_b, ans_iou);
}

void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
                            int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
                                   int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
@@ -77,23 +44,11 @@ void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
                                       ans_overlap);
}

void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
@@ -101,33 +56,17 @@ void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
}

void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                       float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
@@ -140,7 +79,7 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
@@ -162,16 +101,7 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
@@ -180,9 +110,7 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
@@ -195,7 +123,7 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
                                nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
@@ -219,14 +147,5 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
    }
  }

  *keep_num_data = num_to_keep;
}
@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
                       dist2);
}

void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
                       col, kernel_h, kernel_w, pad_h, pad_w);
}

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}

void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}

void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, data_col);
}

void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, grad_im);
}

void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
                       data_im, data_offset, data_mask, batch_size, channels,
                       height_im, width_im, height_col, width_col, kernel_h,
                       kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
                       dilation_w, deformable_group, grad_offset, grad_mask);
}

void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
                        output.size(2), output.size(3)});

  for (int b = 0; b < batch; b++) {
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    // divide into group
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});

    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_impl(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_impl(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);
    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
@@ -10,43 +10,39 @@
 */

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step) {
  return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
                              spatial_shapes, level_start_index, sampling_loc,
                              attn_weight, im2col_step);
}

void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
    const int im2col_step) {
  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
                       level_start_index, sampling_loc, attn_weight,
                       grad_output, grad_value, grad_sampling_loc,
                       grad_attn_weight, im2col_step);
}

Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight,
                              const int im2col_step) {
  at::DeviceGuard guard(value.device());
  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}

void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
@@ -56,26 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
                             const Tensor &grad_output, Tensor &grad_value,
                             Tensor &grad_sampling_loc,
                             Tensor &grad_attn_weight, const int im2col_step) {
  at::DeviceGuard guard(value.device());
  ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
                               sampling_loc, attn_weight, grad_output,
                               grad_value, grad_sampling_loc, grad_attn_weight,
                               im2col_step);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
}

Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
                    float iou_threshold, float sigma, float min_score,
                    int method, int offset) {
  return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
                              sigma, min_score, method, offset);
}

std::vector<std::vector<int> > nms_match_impl(Tensor dets,
                                              float iou_threshold) {
  return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
}

Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return nms_impl(boxes, scores, iou_threshold, offset);
}

Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset) {
  return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,
                      method, offset);
}

std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
  return nms_match_impl(dets, iou_threshold);
}
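
Per the commit message ("move cpu implement to pytorch/cpu"), the CPU nms, softnms, and nms_match implementations that previously lived in this file are relocated rather than deleted, and are expected to attach themselves to the dispatcher from the CPU backend sources. A hedged sketch of that hook-up; the REGISTER_DEVICE_IMPL macro and the nms_cpu symbol are assumptions about a file not shown in this diff:

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Assumed: the relocated CPU implementation keeps its original signature.
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset);

// Assumed registration macro: binds the CPU implementation to the nms_impl
// dispatch key used above.
REGISTER_DEVICE_IMPL(nms_impl, CPU, nms_cpu);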