Unverified Commit 230f9a3b authored by q.yao, committed by GitHub

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move CPU implementation to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
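
Context for the diff below: the "Add device registry for pytorch ops" and "Refactor ops with dispatch" changes listed above replace the per-op #ifdef MMCV_WITH_CUDA branches and *_cuda declarations with a device-agnostic *_impl entry point that forwards through DISPATCH_DEVICE_IMPL from pytorch_device_registry.hpp, while each backend registers its own implementation with that registry. The following is a minimal, self-contained sketch of the dispatch idea only; the registry container, function names, and string device keys are illustrative and are not the actual helpers from pytorch_device_registry.hpp.

// --- illustrative sketch (not part of the diff): a minimal device registry ---
#include <functional>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// One registry per op signature. Here the "op" is simply (n, threshold) -> int.
using OpFn = std::function<int(int, float)>;

static std::map<std::string, OpFn>& op_registry() {
  static std::map<std::string, OpFn> registry;
  return registry;
}

// Called once by each backend (cpu/cuda source file) to register itself.
static void register_op_impl(const std::string& device, OpFn fn) {
  op_registry()[device] = fn;
}

// The device-agnostic entry point that wrapper functions call; it replaces
// the old "#ifdef MMCV_WITH_CUDA ... #else AT_ERROR(...)" branching.
static int op_impl(const std::string& device, int n, float threshold) {
  auto it = op_registry().find(device);
  if (it == op_registry().end())
    throw std::runtime_error("op_impl has no implementation for " + device);
  return it->second(n, threshold);
}

int main() {
  register_op_impl("cpu", [](int n, float) { return n; });  // backend registration
  std::cout << op_impl("cpu", 5, 0.5f) << std::endl;        // dispatch by device key
  return 0;
}
// --- end sketch ---

In the ops below, DISPATCH_DEVICE_IMPL infers the device from its tensor arguments, which is why the wrapper functions no longer need a device().is_cuda() branch or a compile-time CUDA guard.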
@@ -100,25 +100,20 @@ THE POSSIBILITY OF SUCH DAMAGES.
*/
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale);
#endif
#include "pytorch_device_registry.hpp"
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale) {
return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
act, grad, alpha, scale);
}
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA(input);
CHECK_CUDA(bias);
return fused_bias_leakyrelu_op(input, bias, refer, act, grad, alpha, scale);
#else
AT_ERROR("Fused bias leakyrelu is not compiled with GPU support");
#endif
return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
scale);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
void gather_points_forward_impl(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx,
Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);
};
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
idx, out);
}
void gather_points_backward_cuda(int b, int c, int n, int npoints,
void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,
grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
idx, grad_points);
}
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n,
int npoints) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
}
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n,
int npoints) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
}
@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor points,
const Tensor idx, Tensor out);
void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor points, const Tensor idx,
Tensor out) {
GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,
out);
};
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points, idx, out);
}
void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,
idx, grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
grad_out, idx, grad_points);
}
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n, int npoints,
int nsample) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
group_points_forward_cuda(b, c, n, npoints, nsample, points_tensor,
idx_tensor, out_tensor);
#else
AT_ERROR("group_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("group_points is not implemented on CPU");
}
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points_tensor, idx_tensor, out_tensor);
}
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n,
int npoints, int nsample) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
group_points_backward_cuda(b, c, n, npoints, nsample, grad_out_tensor,
idx_tensor, grad_points_tensor);
#else
AT_ERROR("group_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("group_points is not implemented on CPU");
}
group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
idx_tensor, grad_points_tensor);
}
@@ -8,68 +8,35 @@ All Rights Reserved 2019-2020.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
#ifdef MMCV_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#define CHECK_ERROR(state) \
{ gpuAssert((state), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_overlap) {
IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
};
void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
num_b, boxes_b, ans_overlap);
}
void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou) {
IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_iou);
};
void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
boxes_b, ans_iou);
}
void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long *mask,
void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
};
void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask,
int boxes_num,
float nms_overlap_thresh);
DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long *mask,
void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
#endif
DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap) {
@@ -77,23 +44,11 @@ void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
if (boxes_a.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_a);
CHECK_CUDA_INPUT(boxes_b);
CHECK_CUDA_INPUT(ans_overlap);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_overlap_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
#else
AT_ERROR("iou3d_boxes_overlap_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_overlap_bev is not implemented on CPU");
}
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
}
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
@@ -101,77 +56,52 @@ void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
if (boxes_a.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_a);
CHECK_CUDA_INPUT(boxes_b);
CHECK_CUDA_INPUT(ans_iou);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_iou_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b, ans_iou);
#else
AT_ERROR("iou3d_boxes_iou_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_iou_bev is not implemented on CPU");
}
iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
}
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
// params keep: (N)
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_forward_cuda(boxes, mask_data, boxes_num, nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
*keep_num_data = num_to_keep;
#else
AT_ERROR("iou3d_nms is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms is not implemented on CPU");
}
}
@@ -180,53 +110,42 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
// params keep: (N)
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_cuda(boxes, mask_data, boxes_num,
nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
*keep_num_data = num_to_keep;
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
nms_overlap_thresh);
#else
AT_ERROR("iou3d_nms_normal is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms_normal is not implemented on CPU");
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
*keep_num_data = num_to_keep;
}
@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
Tensor idx, Tensor dist2);
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
dist2);
}
#endif
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor dist2_tensor, int b, int n, int m, int nsample) {
if (new_xyz_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(new_xyz_tensor);
CHECK_CUDA_INPUT(xyz_tensor);
knn_forward_cuda(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
dist2_tensor);
#else
AT_ERROR("knn is not compiled with GPU support");
#endif
} else {
AT_ERROR("knn is not implemented on CPU");
}
knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
dist2_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int height,
const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w);
DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
col, kernel_h, kernel_w, pad_h, pad_w);
}
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels);
DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
im, height, width, channels);
}
#endif
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
if (im.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(im);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
}
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
if (col.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(col);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void modulated_deformable_im2col_cuda(
void modulated_deformable_im2col_impl(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
void modulated_deformable_col2im_cuda(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
void modulated_deformable_col2im_coord_cuda(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
#endif
void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
const int dilation_w, const int deformable_group, Tensor data_col) {
DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, data_col);
}
void modulated_deformable_col2im_cpu(
void modulated_deformable_col2im_impl(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
const int dilation_w, const int deformable_group, Tensor grad_im) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, grad_im);
}
void modulated_deformable_col2im_coord_cpu(
void modulated_deformable_col2im_coord_impl(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
Tensor grad_offset, Tensor grad_mask) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
data_im, data_offset, data_mask, batch_size, channels,
height_im, width_im, height_col, width_col, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, deformable_group, grad_offset, grad_mask);
}
void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(columns);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
output.size(2), output.size(3)});
for (int b = 0; b < batch; b++) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
}
modulated_deformable_im2col_impl(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
// divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(columns);
CHECK_CPU_INPUT(grad_input);
CHECK_CPU_INPUT(grad_weight);
CHECK_CPU_INPUT(grad_bias);
CHECK_CPU_INPUT(grad_offset);
CHECK_CPU_INPUT(grad_mask);
CHECK_CPU_INPUT(grad_output);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cpu(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cpu(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
}
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_impl(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_impl(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_impl(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
@@ -10,43 +10,39 @@
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
Tensor ms_deform_attn_cuda_forward(const Tensor &value,
Tensor ms_deform_attn_impl_forward(const Tensor &value,
const Tensor &spatial_shapes,
const Tensor &level_start_index,
const Tensor &sampling_loc,
const Tensor &attn_weight,
const int im2col_step);
const int im2col_step) {
return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
spatial_shapes, level_start_index, sampling_loc,
attn_weight, im2col_step);
}
void ms_deform_attn_cuda_backward(
void ms_deform_attn_impl_backward(
const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index, const Tensor &sampling_loc,
const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);
#endif
Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
const int im2col_step) {
DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
level_start_index, sampling_loc, attn_weight,
grad_output, grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
}
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index,
const Tensor &sampling_loc,
const Tensor &attn_weight,
const int im2col_step) {
if (value.type().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(value)
CHECK_CUDA_INPUT(spatial_shapes)
CHECK_CUDA_INPUT(level_start_index)
CHECK_CUDA_INPUT(sampling_loc)
CHECK_CUDA_INPUT(attn_weight)
at::DeviceGuard guard(value.device());
return ms_deform_attn_cuda_forward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
at::DeviceGuard guard(value.device());
return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, im2col_step);
}
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
@@ -56,26 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc,
Tensor &grad_attn_weight, const int im2col_step) {
if (value.type().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(value)
CHECK_CUDA_INPUT(spatial_shapes)
CHECK_CUDA_INPUT(level_start_index)
CHECK_CUDA_INPUT(sampling_loc)
CHECK_CUDA_INPUT(attn_weight)
CHECK_CUDA_INPUT(grad_output)
CHECK_CUDA_INPUT(grad_value)
CHECK_CUDA_INPUT(grad_sampling_loc)
CHECK_CUDA_INPUT(grad_attn_weight)
at::DeviceGuard guard(value.device());
ms_deform_attn_cuda_backward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
AT_ERROR("Not implemented on the CPU");
}
at::DeviceGuard guard(value.device());
ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc, grad_attn_weight,
im2col_step);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset);
Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
}
#endif
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto nboxes = boxes.size(0);
Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto select = select_t.data_ptr<bool>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
return order_t.masked_select(select_t);
Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset) {
return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
sigma, min_score, method, offset);
}
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CUDA_INPUT(scores);
return nms_cuda(boxes, scores, iou_threshold, offset);
#else
AT_ERROR("nms is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(boxes);
CHECK_CPU_INPUT(scores);
return nms_cpu(boxes, scores, iou_threshold, offset);
}
std::vector<std::vector<int> > nms_match_impl(Tensor dets,
float iou_threshold) {
return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
}
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto nboxes = boxes.size(0);
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos] = x1[i];
y1[max_pos] = y1[i];
x2[max_pos] = x2[i];
y2[max_pos] = y2[i];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i] = ix1;
y1[i] = iy1;
x2[i] = ix2;
y2[i] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = std::min(iy2, y2[pos]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos] = x1[nboxes - 1];
y1[pos] = y1[nboxes - 1];
x2[pos] = x2[nboxes - 1];
y2[pos] = y2[nboxes - 1];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
return inds_t.slice(0, 0, nboxes);
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return nms_impl(boxes, scores, iou_threshold, offset);
}
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
float sigma, float min_score, int method, int offset) {
if (boxes.device().is_cuda()) {
AT_ERROR("softnms is not implemented on GPU");
} else {
return softnms_cpu(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset);
}
}
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (int i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset);
}
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
if (dets.device().is_cuda()) {
AT_ERROR("nms_match is not implemented on GPU");
} else {
return nms_match_cpu(dets, iou_threshold);
}
return nms_match_impl(dets, iou_threshold);
}
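
Note on the removed CPU bodies in this file (nms_cpu, softnms_cpu, nms_match_cpu): they are not dropped; per the "move CPU implementation to pytorch/cpu" commit above, they move into backend source files that register themselves against the new *_impl symbols. Below is a hedged sketch of what such a relocated registration unit might look like; the REGISTER_DEVICE_IMPL macro and the CPU device key are assumptions based on the registry header, the file path is illustrative, and none of these lines are taken from the diff itself.

// Hypothetical sketch: a pytorch/cpu/nms.cpp source file after the move.
// Declarations stand in for the *_cpu bodies removed from this file above;
// in the relocated file the bodies would be defined here unchanged.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset);
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
                   float iou_threshold, float sigma, float min_score,
                   int method, int offset);
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold);

// Bind the generic entry points to the CPU implementations so that
// DISPATCH_DEVICE_IMPL(nms_impl, ...) resolves here for CPU tensors
// (macro name assumed from pytorch_device_registry.hpp).
REGISTER_DEVICE_IMPL(nms_impl, CPU, nms_cpu);
REGISTER_DEVICE_IMPL(softnms_impl, CPU, softnms_cpu);
REGISTER_DEVICE_IMPL(nms_match_impl, CPU, nms_match_cpu);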
@@ -2,120 +2,14 @@
// It is modified from https://github.com/WenmuZhou/PAN.pytorch
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
std::vector<std::vector<float>> estimate_confidence(int32_t* label,
float* score, int label_num,
int height, int width) {
std::vector<std::vector<float>> point_vector;
for (int i = 0; i < label_num; i++) {
std::vector<float> point;
point.push_back(0);
point.push_back(0);
point_vector.push_back(point);
}
for (int y = 0; y < height; y++) {
auto label_tmp = label + y * width;
auto score_tmp = score + y * width;
for (int x = 0; x < width; x++) {
auto l = label_tmp[x];
if (l > 0) {
float confidence = score_tmp[x];
point_vector[l].push_back(x);
point_vector[l].push_back(y);
point_vector[l][0] += confidence;
point_vector[l][1] += 1;
}
}
}
for (int l = 0; l < point_vector.size(); l++)
if (point_vector[l][1] > 0) {
point_vector[l][0] /= point_vector[l][1];
}
return point_vector;
}
std::vector<std::vector<float>> pixel_group_cpu(
std::vector<std::vector<float>> pixel_group_impl(
Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
assert(score.dim() == 2);
assert(mask.dim() == 2);
assert(embedding.dim() == 3);
int height = score.size(0);
int width = score.size(1);
assert(height == mask.size(0) == embedding.size(1) == kernel_label.size(1));
assert(width == mask.size(1) == embedding.size(2) == kernel_label.size(2));
auto threshold_square = dis_threshold * dis_threshold;
auto ptr_score = score.data_ptr<float>();
auto ptr_mask = mask.data_ptr<bool>();
auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();
auto ptr_embedding = embedding.data_ptr<float>();
auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();
std::queue<std::tuple<int, int, int32_t>> contour_pixels;
auto embedding_dim = embedding.size(2);
std::vector<std::vector<float>> kernel_vector(
kernel_region_num, std::vector<float>(embedding_dim + 1, 0));
Tensor text_label;
text_label = kernel_label.clone();
auto ptr_text_label = text_label.data_ptr<int32_t>();
for (int i = 0; i < height; i++) {
auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;
auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;
auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;
for (int j = 0, k = 0; j < width && k < width * embedding_dim;
j++, k += embedding_dim) {
int32_t label = ptr_kernel_label_tmp[j];
if (label > 0) {
for (int d = 0; d < embedding_dim; d++)
kernel_vector[label][d] += ptr_embedding_tmp[k + d];
kernel_vector[label][embedding_dim] += 1;
// kernel pixel number
if (ptr_kernel_contour_tmp[j]) {
contour_pixels.push(std::make_tuple(i, j, label));
}
}
}
}
for (int i = 0; i < kernel_region_num; i++) {
for (int j = 0; j < embedding_dim; j++) {
kernel_vector[i][j] /= kernel_vector[i][embedding_dim];
}
}
int dx[4] = {-1, 1, 0, 0};
int dy[4] = {0, 0, -1, 1};
while (!contour_pixels.empty()) {
auto query_pixel = contour_pixels.front();
contour_pixels.pop();
int y = std::get<0>(query_pixel);
int x = std::get<1>(query_pixel);
int32_t l = std::get<2>(query_pixel);
auto kernel_cv = kernel_vector[l];
for (int idx = 0; idx < 4; idx++) {
int tmpy = y + dy[idx];
int tmpx = x + dx[idx];
auto ptr_text_label_tmp = ptr_text_label + tmpy * width;
if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;
if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0)
continue;
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
for (size_t i = 0; i < embedding_dim; i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
if (dis >= threshold_square) break;
}
if (dis >= threshold_square) continue;
contour_pixels.push(std::make_tuple(tmpy, tmpx, l));
ptr_text_label_tmp[tmpx] = l;
}
}
return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,
height, width);
return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,
kernel_label, kernel_contour, kernel_region_num,
dis_threshold);
}
std::vector<std::vector<float>> pixel_group(
@@ -127,11 +21,6 @@ std::vector<std::vector<float>> pixel_group(
kernel_label = kernel_label.contiguous();
kernel_contour = kernel_contour.contiguous();
CHECK_CPU_INPUT(score);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(embedding);
CHECK_CPU_INPUT(kernel_label);
CHECK_CPU_INPUT(kernel_contour);
return pixel_group_cpu(score, mask, embedding, kernel_label, kernel_contour,
kernel_region_num, distance_threshold);
return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour,
kernel_region_num, distance_threshold);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
void points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,
void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points) {
PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,
boxes, pts, box_idx_of_points);
};
void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,
pts_num, boxes, pts, box_idx_of_points);
}
void points_in_boxes_all_forward_cuda(int batch_size, int boxes_num,
void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points) {
PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num,
boxes, pts, box_idx_of_points);
};
#endif
DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,
pts_num, boxes, pts, box_idx_of_points);
}
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
Tensor box_idx_of_points_tensor) {
@@ -34,30 +23,12 @@ void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
// coordinate, z is the bottom center, each box params pts: (B, npoints, 3)
// [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),
// default -1
if (pts_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_tensor);
CHECK_CUDA_INPUT(pts_tensor);
CHECK_CUDA_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_part_forward_cuda(batch_size, boxes_num, pts_num,
boxes_tensor, pts_tensor,
box_idx_of_points_tensor);
#else
AT_ERROR("points_in_boxes_part is not compiled with GPU support");
#endif
} else {
AT_ERROR("points_in_boxes_part is not implemented on CPU");
}
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num,
boxes_tensor, pts_tensor,
box_idx_of_points_tensor);
}
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
@@ -65,28 +36,9 @@ void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
// coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z]
// in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
if (pts_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_tensor);
CHECK_CUDA_INPUT(pts_tensor);
CHECK_CUDA_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_all_forward_cuda(batch_size, boxes_num, pts_num,
boxes_tensor, pts_tensor,
box_idx_of_points_tensor);
#else
AT_ERROR("points_in_boxes_all is not compiled with GPU support");
#endif
} else {
AT_ERROR("points_in_boxes_all is not implemented on CPU");
}
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor,
pts_tensor, box_idx_of_points_tensor);
}
@@ -2,255 +2,40 @@
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output,
grad_input);
}
#ifdef MMCV_WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask);
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask);
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask);
DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,
h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask);
}
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask);
DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,
num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask);
}
#endif
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
const int num_, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(output);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
} else {
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward(Tensor grad_output, const Tensor grad_input,
const int psa_type, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (grad_input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_output);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
} else {
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
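// The psamask wrappers above no longer branch on device themselves; they call
// a generic *_impl entry that DISPATCH_DEVICE_IMPL routes to whichever backend
// registered itself for the tensors' device. A minimal, self-contained sketch
// of that idea (illustrative only; the real registry in
// pytorch_device_registry.hpp differs in detail, e.g. it keeps one table per
// op and dispatches on the first Tensor argument):
#include <map>
#include <ATen/ATen.h>
template <typename FuncPtr, FuncPtr Op>
class SimpleDeviceRegistry {
 public:
  // One singleton table per op (the FuncPtr value is part of the type).
  static SimpleDeviceRegistry& instance() {
    static SimpleDeviceRegistry inst;
    return inst;
  }
  // Called once per backend, e.g. Register(at::kCUDA, psamask_forward_cuda).
  void Register(at::DeviceType device, FuncPtr fn) { table_[device] = fn; }
  // Looked up by the generic entry; fails loudly if nothing was registered.
  FuncPtr Find(at::DeviceType device) const {
    auto it = table_.find(device);
    TORCH_CHECK(it != table_.end(),
                "no implementation registered for this device");
    return it->second;
  }
 private:
  std::map<at::DeviceType, FuncPtr> table_;
};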
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
argmax_x, aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,
argmax_x, grad_input, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
CHECK_CUDA_INPUT(grad_input);
roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(grad_output);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
CHECK_CPU_INPUT(grad_input);
roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ROIAlignRotatedForwardCUDAKernelLauncher(
const at::Tensor features, const at::Tensor rois, const float spatial_scale,
const int sample_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int aligned_height, const int aligned_width, at::Tensor output);
void ROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
const int sample_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int aligned_height, const int aligned_width, at::Tensor bottom_grad);
void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio,
bool aligned, bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
int num_channels = features.size(1);
int data_height = features.size(2);
int data_width = features.size(3);
ROIAlignRotatedForwardCUDAKernelLauncher(
features, rois, spatial_scale, sample_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, output);
DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output,
aligned_height, aligned_width, spatial_scale,
sample_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sample_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
int num_channels = bottom_grad.size(1);
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
ROIAlignRotatedBackwardCUDAKernelLauncher(
top_grad, rois, spatial_scale, sample_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, bottom_grad);
}
#endif
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void ROIAlignRotatedBackwardCPULauncher(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
ROIAlignRotatedBackwardCPULauncher(
top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,
bottom_grad, aligned_height, aligned_width,
spatial_scale, sample_ratio, aligned, clockwise);
}
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
roi_align_rotated_forward_cuda(input, rois, output, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
#else
AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
roi_align_rotated_forward_cpu(input, rois, output, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
roi_align_rotated_forward_impl(input, rois, output, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
@@ -124,25 +35,7 @@ void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
if (top_grad.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(top_grad);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(bottom_grad);
roi_align_rotated_backward_cuda(top_grad, rois, bottom_grad, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
#else
AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(top_grad);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(bottom_grad);
roi_align_rotated_backward_cpu(top_grad, rois, bottom_grad, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale);
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,
pooled_height, pooled_width, spatial_scale);
}
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale);
DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,
grad_input, pooled_height, pooled_width, spatial_scale);
}
#endif
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
int pooled_height, int pooled_width,
float spatial_scale) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax);
roi_pool_forward_cuda(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
#else
AT_ERROR("RoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIPool is not implemented on CPU");
}
roi_pool_forward_impl(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
}
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height, int pooled_width,
float spatial_scale) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(grad_input);
roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_height,
pooled_width, spatial_scale);
#else
AT_ERROR("RoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIPool is not implemented on CPU");
}
roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,
pooled_width, spatial_scale);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void RoiawarePool3dForwardCUDAKernelLauncher(
int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
int out_y, int out_z, const Tensor rois, const Tensor pts,
const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
Tensor pooled_features, int pool_method);
void roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels,
void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y,
int out_z, const Tensor rois,
const Tensor pts, const Tensor pts_feature,
Tensor argmax, Tensor pts_idx_of_voxels,
Tensor pooled_features, int pool_method) {
RoiawarePool3dForwardCUDAKernelLauncher(
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features,
pool_method);
};
void RoiawarePool3dBackwardCUDAKernelLauncher(
int boxes_num, int out_x, int out_y, int out_z, int channels,
int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
const Tensor grad_out, Tensor grad_in, int pool_method);
DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num,
channels, max_pts_each_voxel, out_x, out_y, out_z, rois,
pts, pts_feature, argmax, pts_idx_of_voxels,
pooled_features, pool_method);
}
void roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y,
void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
int out_z, int channels,
int max_pts_each_voxel,
const Tensor pts_idx_of_voxels,
const Tensor argmax, const Tensor grad_out,
Tensor grad_in, int pool_method) {
RoiawarePool3dBackwardCUDAKernelLauncher(
boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method);
};
#endif
DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y,
out_z, channels, max_pts_each_voxel, pts_idx_of_voxels,
argmax, grad_out, grad_in, pool_method);
}
void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
Tensor argmax, Tensor pts_idx_of_voxels,
@@ -47,36 +35,20 @@ void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool
if (pts.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(pts);
CHECK_CUDA_INPUT(pts_feature);
CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(pts_idx_of_voxels);
CHECK_CUDA_INPUT(pooled_features);
int boxes_num = rois.size(0);
int pts_num = pts.size(0);
int channels = pts_feature.size(1);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
assert((out_x < 256) && (out_y < 256) &&
(out_z < 256)); // we encode index with 8bit
int boxes_num = rois.size(0);
int pts_num = pts.size(0);
int channels = pts_feature.size(1);
int max_pts_each_voxel =
pts_idx_of_voxels.size(4); // index 0 is the counter
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
assert((out_x < 256) && (out_y < 256) &&
(out_z < 256)); // we encode index with 8bit
roiaware_pool3d_forward_cuda(boxes_num, pts_num, channels,
max_pts_each_voxel, out_x, out_y, out_z, rois,
pts, pts_feature, argmax, pts_idx_of_voxels,
pooled_features, pool_method);
#else
AT_ERROR("roiaware_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roiaware_pool3d is not implemented on CPU");
}
roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel,
out_x, out_y, out_z, rois, pts, pts_feature,
argmax, pts_idx_of_voxels, pooled_features,
pool_method);
}
void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
@@ -87,29 +59,14 @@ void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
// params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
if (grad_in.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(pts_idx_of_voxels);
CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(grad_out);
CHECK_CUDA_INPUT(grad_in);
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel =
pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
roiaware_pool3d_backward_cuda(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels, argmax,
grad_out, grad_in, pool_method);
#else
AT_ERROR("roiaware_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roiaware_pool3d is not implemented on CPU");
}
roiaware_pool3d_backward_impl(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels, argmax,
grad_out, grad_in, pool_method);
}
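// roiaware_pool3d_forward_impl takes its int dimensions before any Tensor, so
// a dispatcher keyed on "the device of the first Tensor argument" has to scan
// the argument list. A minimal sketch of such a helper (illustrative only;
// the actual lookup lives in pytorch_device_registry.hpp):
inline at::Device GetFirstTensorDeviceSketch() {
  TORCH_CHECK(false, "the dispatched op received no Tensor argument");
  return at::Device(at::kCPU);  // unreachable; keeps the compiler happy
}
template <typename... Rest>
at::Device GetFirstTensorDeviceSketch(const at::Tensor& t, const Rest&...) {
  return t.device();  // first Tensor found: dispatch on its device
}
template <typename First, typename... Rest>
at::Device GetFirstTensorDeviceSketch(const First&, const Rest&... rest) {
  return GetFirstTensorDeviceSketch(rest...);  // skip non-Tensor arguments
}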
@@ -7,24 +7,18 @@ All Rights Reserved 2018.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void RoIPointPool3dForwardCUDAKernelLauncher(
int batch_size, int pts_num, int boxes_num, int feature_in_len,
int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);
void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,
void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
int feature_in_len, int sampled_pts_num,
const Tensor xyz, const Tensor boxes3d,
const Tensor pts_feature,
Tensor pooled_features,
Tensor pooled_empty_flag) {
RoIPointPool3dForwardCUDAKernelLauncher(
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,
boxes3d, pts_feature, pooled_features, pooled_empty_flag);
};
#endif
DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,
boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,
pts_feature, pooled_features, pooled_empty_flag);
}
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
Tensor pooled_features, Tensor pooled_empty_flag) {
@@ -33,28 +27,13 @@ void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
if (xyz.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(xyz);
CHECK_CUDA_INPUT(boxes3d);
CHECK_CUDA_INPUT(pts_feature);
CHECK_CUDA_INPUT(pooled_features);
CHECK_CUDA_INPUT(pooled_empty_flag);
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
roipoint_pool3d_forward_cuda(batch_size, pts_num, boxes_num, feature_in_len,
sampled_pts_num, xyz, boxes3d, pts_feature,
pooled_features, pooled_empty_flag);
#else
AT_ERROR("roipoint_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roipoint_pool3d is not implemented on CPU");
}
roipoint_pool3d_forward_impl(batch_size, pts_num, boxes_num, feature_in_len,
sampled_pts_num, xyz, boxes3d, pts_feature,
pooled_features, pooled_empty_flag);
}
// Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
#ifdef MMCV_WITH_CUDA
std::vector<torch::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(
const torch::Tensor &feats, const torch::Tensor &coors,
const reduce_t reduce_type);
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_cuda(
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(
const torch::Tensor &feats, const torch::Tensor &coors,
const reduce_t reduce_type) {
return DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors,
reduce_type);
};
void DynamicPointToVoxelBackwardCUDAKernelLauncher(
torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats, const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
const reduce_t reduce_type);
return DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, feats, coors,
reduce_type);
}
void dynamic_point_to_voxel_backward_cuda(
void dynamic_point_to_voxel_backward_impl(
torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats, const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
const reduce_t reduce_type) {
DynamicPointToVoxelBackwardCUDAKernelLauncher(grad_feats, grad_reduced_feats,
feats, reduced_feats, coors_idx,
reduce_count, reduce_type);
};
#endif
std::vector<at::Tensor> dynamic_point_to_voxel_forward_cpu(
const at::Tensor &points, const at::Tensor &voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, grad_feats,
grad_reduced_feats, feats, reduced_feats, coors_idx,
reduce_count, reduce_type);
}
inline reduce_t convert_reduce_type(const std::string &reduce_type) {
if (reduce_type == "max")
@@ -51,19 +36,8 @@ inline reduce_t convert_reduce_type(const std::string &reduce_type) {
std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const torch::Tensor &feats, const torch::Tensor &coors,
const std::string &reduce_type) {
if (feats.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(feats);
CHECK_CUDA_INPUT(coors);
return dynamic_point_to_voxel_forward_cuda(
feats, coors, convert_reduce_type(reduce_type));
#else
AT_ERROR("dynamic_point_to_voxel is not compiled with GPU support");
#endif
} else {
AT_ERROR("dynamic_point_to_voxel is not implemented on CPU");
return std::vector<torch::Tensor>();
}
return dynamic_point_to_voxel_forward_impl(feats, coors,
convert_reduce_type(reduce_type));
}
void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
@@ -73,21 +47,7 @@ void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
const torch::Tensor &coors_idx,
const torch::Tensor &reduce_count,
const std::string &reduce_type) {
if (grad_feats.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_feats);
CHECK_CUDA_INPUT(grad_reduced_feats);
CHECK_CUDA_INPUT(feats);
CHECK_CUDA_INPUT(reduced_feats);
CHECK_CUDA_INPUT(coors_idx);
CHECK_CUDA_INPUT(reduce_count);
dynamic_point_to_voxel_backward_cuda(grad_feats, grad_reduced_feats, feats,
reduced_feats, coors_idx, reduce_count,
convert_reduce_type(reduce_type));
#else
AT_ERROR("dynamic_point_to_voxel is not compiled with GPU support");
#endif
} else {
AT_ERROR("dynamic_point_to_voxel is not implemented on CPU");
}
dynamic_point_to_voxel_backward_impl(grad_feats, grad_reduced_feats, feats,
reduced_feats, coors_idx, reduce_count,
convert_reduce_type(reduce_type));
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);
void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
Tensor var);
void SyncBNForwardOutputCUDAKernelLauncher(
const Tensor input, const Tensor mean, const Tensor var,
Tensor running_mean, Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
float momentum, int group_size);
void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
const Tensor norm,
Tensor grad_weight,
Tensor grad_bias);
void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input);
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
SyncBNForwardMeanCUDAKernelLauncher(input, mean);
void sync_bn_forward_mean_impl(const Tensor input, Tensor mean) {
DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean);
}
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
Tensor var) {
SyncBNForwardVarCUDAKernelLauncher(input, mean, var);
DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var);
}
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
const Tensor var, Tensor running_mean,
Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std,
Tensor output, float eps, float momentum,
int group_size) {
SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean,
running_var, weight, bias, norm, std,
output, eps, momentum, group_size);
DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var,
running_mean, running_var, weight, bias, norm, std,
output, eps, momentum, group_size);
}
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias) {
SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,
grad_bias);
DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm,
grad_weight, grad_bias);
}
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias, const Tensor norm,
const Tensor std, Tensor grad_input) {
SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,
grad_bias, norm, std, grad_input);
DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight,
grad_weight, grad_bias, norm, std, grad_input);
}
#endif
void sync_bn_forward_mean(const Tensor input, Tensor mean) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
sync_bn_forward_mean_cuda(input, mean);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
sync_bn_forward_mean_impl(input, mean);
}
void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
sync_bn_forward_var_cuda(input, mean, var);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
sync_bn_forward_var_impl(input, mean, var);
}
void sync_bn_forward_output(const Tensor input, const Tensor mean,
@@ -95,65 +50,20 @@ void sync_bn_forward_output(const Tensor input, const Tensor mean,
Tensor running_var, Tensor norm, Tensor std,
Tensor output, float eps, float momentum,
int group_size) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(running_mean);
CHECK_CUDA_INPUT(running_var);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(output);
sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
weight, bias, norm, std, output, eps, momentum,
group_size);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
sync_bn_forward_output_impl(input, mean, var, running_mean, running_var,
weight, bias, norm, std, output, eps, momentum,
group_size);
}
void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias);
}
void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight, const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(grad_input);
sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias,
norm, std, grad_input);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm,
std, grad_input);
}
@@ -2,60 +2,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
const Tensor points,
const Tensor idx,
const Tensor weight, Tensor out);
void three_interpolate_forward_cuda(int b, int c, int m, int n,
void three_interpolate_forward_impl(int b, int c, int m, int n,
const Tensor points, const Tensor idx,
const Tensor weight, Tensor out) {
ThreeInterpolateForwardCUDAKernelLauncher(b, c, m, n, points, idx, weight,
out);
};
void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
const Tensor grad_out,
const Tensor idx,
const Tensor weight,
Tensor grad_points);
DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx,
weight, out);
}
void three_interpolate_backward_cuda(int b, int c, int n, int m,
void three_interpolate_backward_impl(int b, int c, int n, int m,
const Tensor grad_out, const Tensor idx,
const Tensor weight, Tensor grad_points) {
ThreeInterpolateBackwardCUDAKernelLauncher(b, c, n, m, grad_out, idx, weight,
grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out,
idx, weight, grad_points);
}
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor out_tensor, int b,
int c, int m, int n) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
three_interpolate_forward_cuda(b, c, m, n, points_tensor, idx_tensor,
weight_tensor, out_tensor);
#else
AT_ERROR("three_interpolate is not compiled with GPU support");
#endif
} else {
AT_ERROR("three_interpolate is not implemented on CPU");
}
three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor,
weight_tensor, out_tensor);
}
void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor grad_points_tensor,
int b, int c, int n, int m) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
three_interpolate_backward_cuda(b, c, n, m, grad_out_tensor, idx_tensor,
weight_tensor, grad_points_tensor);
#else
AT_ERROR("three_interpolate is not compiled with GPU support");
#endif
} else {
AT_ERROR("three_interpolate is not implemented on CPU");
}
three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor,
weight_tensor, grad_points_tensor);
}