Unverified commit 230f9a3b authored by q.yao, committed by GitHub

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move cpu implementation to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update Chinese document

* update docs

* list instead of map
parent ef8ba752
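The pattern applied to every op in the diff below is the same: the public wrapper calls an `*_impl` function, and the `*_impl` function forwards its arguments through `DISPATCH_DEVICE_IMPL`, declared in the new `pytorch_device_registry.hpp` header added by this PR (the header itself is not part of this excerpt). As a rough mental model only, a device registry of this kind can be sketched as follows; all names here are assumptions for illustration, not the code added by the PR, and the real header may differ (the last commit-message entry suggests it stores implementations in a list indexed by device rather than a map).

// Illustrative sketch of a device registry, NOT the code added by this PR.
// Assumes c10/ATen headers; the real pytorch_device_registry.hpp may differ.
#include <map>

#include <ATen/ATen.h>
#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>

// One registry per op signature: maps a DeviceType to the registered kernel.
template <typename Ret, typename... Args>
class SketchDeviceRegistry {
 public:
  using FnPtr = Ret (*)(Args...);
  static SketchDeviceRegistry& instance() {
    static SketchDeviceRegistry inst;
    return inst;
  }
  void Register(c10::DeviceType device, FnPtr fn) { table_[device] = fn; }
  FnPtr Find(c10::DeviceType device) const {
    auto it = table_.find(device);
    return it == table_.end() ? nullptr : it->second;
  }

 private:
  std::map<c10::DeviceType, FnPtr> table_;
};

// What a DISPATCH_DEVICE_IMPL-style call conceptually does: take the device of
// the tensor arguments, look up the registered implementation, and forward.
template <typename Ret, typename... Args>
Ret SketchDispatch(c10::DeviceType device, const char* op_name, Args... args) {
  auto fn = SketchDeviceRegistry<Ret, Args...>::instance().Find(device);
  TORCH_CHECK(fn != nullptr, op_name, " is not implemented on ",
              c10::DeviceTypeName(device), " device");
  return fn(args...);
}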
@@ -100,25 +100,20 @@ THE POSSIBILITY OF SUCH DAMAGES.
 */

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}

torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}
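With this refactor the `*_impl` function above is the single dispatch point, so the CUDA code that used to be forward-declared behind `#ifdef MMCV_WITH_CUDA` must now register itself with the registry from its own translation unit. The snippet below is a hedged sketch of what that registration could look like; the `REGISTER_DEVICE_IMPL` macro and the `fused_bias_leakyrelu_op_cuda` wrapper are assumptions for illustration, not lines taken from this commit.

// Hypothetical registration in a CUDA translation unit; macro and function
// names are assumed for illustration and are not shown in this diff.
#include <torch/extension.h>

torch::Tensor fused_bias_leakyrelu_op_cuda(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale);

// Something along these lines would tie the CUDA kernel launcher to the CUDA
// slot of the fused_bias_leakyrelu_op_impl registry entry:
// REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA,
//                      fused_bias_leakyrelu_op_cuda);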
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void gather_points_forward_impl(int b, int c, int n, int npoints,
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx, const Tensor points, const Tensor idx,
Tensor out) { Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out); DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
}; idx, out);
}
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void gather_points_backward_cuda(int b, int c, int n, int npoints, void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx, const Tensor grad_out, const Tensor idx,
Tensor grad_points) { Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx, DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
grad_points); idx, grad_points);
}; }
#endif
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor, void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n, Tensor out_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (points_tensor.device().is_cuda()) { gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA out_tensor);
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n, Tensor grad_points_tensor, int b, int c, int n,
int npoints) { int npoints) {
if (grad_out_tensor.device().is_cuda()) { gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA grad_points_tensor);
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
} }
@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points, idx, out);
}

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
                       grad_out, idx, grad_points);
}

void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points_tensor, idx_tensor, out_tensor);
}

void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}
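The commit message also mentions adding a declaration of `CheckDeviceConsistency`. That function does not appear in this excerpt, but a dispatcher like this generally has to assert (with `TORCH_CHECK`, per the "assert with torch check" entry) that all tensor arguments live on the device it dispatched to. A minimal sketch of such a check, with an assumed name and signature:

// Assumed shape of a device-consistency check; the real CheckDeviceConsistency
// declared by this PR is not shown here and may differ.
#include <ATen/ATen.h>
#include <c10/util/Exception.h>

inline void CheckDeviceConsistencySketch(const at::Tensor& ref,
                                         const at::Tensor& other,
                                         const char* op_name) {
  TORCH_CHECK(ref.device() == other.device(), op_name,
              ": expected all tensors to be on the same device, but got ",
              ref.device(), " and ", other.device());
}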
@@ -8,68 +8,35 @@ All Rights Reserved 2019-2020.
 */

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
                       num_b, boxes_b, ans_overlap);
}

void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
                                      const int num_b, const Tensor boxes_b,
                                      Tensor ans_iou) {
  DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
                       boxes_b, ans_iou);
}

void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
                            int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
                                   int boxes_num, float nms_overlap_thresh) {
  DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
                       nms_overlap_thresh);
}

void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
@@ -77,23 +44,11 @@ void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
                                       ans_overlap);
}

void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
@@ -101,77 +56,52 @@ void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
  // params boxes_b: (M, 5)
  // params ans_overlap: (N, M)
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
}

void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                       float nms_overlap_thresh) {
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
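The host-side loop above decodes the pairwise suppression mask filled in by `iou3d_nms_forward_impl`: each box `i` owns `col_blocks` 64-bit words (`THREADS_PER_BLOCK_NMS` is 64), and bit `j % 64` of word `j / 64` being set means box `i` suppresses box `j`. The standalone snippet below runs the same decode on a tiny hand-built mask to make the bit arithmetic concrete; it is an illustration, not part of the commit.

// Standalone illustration of the keep-mask decode used above, on a tiny
// hand-written suppression mask (not MMCV code).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int kBits = 64;               // THREADS_PER_BLOCK_NMS in the diff
  const int boxes_num = 3;
  const int col_blocks = (boxes_num + kBits - 1) / kBits;  // DIVUP

  // mask[i * col_blocks + b]: bit k set => box i suppresses box b * 64 + k.
  // Here box 0 suppresses box 1; box 2 is untouched.
  std::vector<uint64_t> mask(boxes_num * col_blocks, 0);
  mask[0 * col_blocks + 0] = 1ULL << 1;

  std::vector<uint64_t> remv(col_blocks, 0);
  std::vector<int> keep;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / kBits, inblock = i % kBits;
    if (!(remv[nblock] & (1ULL << inblock))) {   // box i not yet suppressed
      keep.push_back(i);
      for (int j = nblock; j < col_blocks; j++)  // absorb its suppression row
        remv[j] |= mask[i * col_blocks + j];
    }
  }
  for (int i : keep) std::printf("keep %d\n", i);  // prints 0 and 2
  return 0;
}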
@@ -180,53 +110,42 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
                                nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
  int num_to_keep = 0;

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;
    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
                       dist2);
}

void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
                       col, kernel_h, kernel_w, pad_h, pad_w);
}

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}

void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}

void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, data_col);
}

void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, grad_im);
}

void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
                       data_im, data_offset, data_mask, batch_size, channels,
                       height_im, width_im, height_col, width_col, kernel_h,
                       kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
                       dilation_w, deformable_group, grad_offset, grad_mask);
}

void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
                         output.size(2), output.size(3)});

  for (int b = 0; b < batch; b++) {
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    // divide into group
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});

    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_impl(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_impl(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);

    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
...
@@ -10,43 +10,39 @@
 */

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step) {
  return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
                              spatial_shapes, level_start_index, sampling_loc,
                              attn_weight, im2col_step);
}

void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
    const int im2col_step) {
  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
                       level_start_index, sampling_loc, attn_weight,
                       grad_output, grad_value, grad_sampling_loc,
                       grad_attn_weight, im2col_step);
}

Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight,
                              const int im2col_step) {
  at::DeviceGuard guard(value.device());
  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}

void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
@@ -56,26 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
                             const Tensor &grad_output, Tensor &grad_value,
                             Tensor &grad_sampling_loc,
                             Tensor &grad_attn_weight, const int im2col_step) {
  at::DeviceGuard guard(value.device());
  ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
                               sampling_loc, attn_weight, grad_output,
                               grad_value, grad_sampling_loc, grad_attn_weight,
                               im2col_step);
}
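Note that the public wrappers keep `at::DeviceGuard guard(value.device());` even though the explicit CUDA checks moved into the dispatcher: on a multi-GPU host the guard makes the tensor's own device current before any kernel is launched and restores the previous device when the wrapper returns. A minimal, self-contained illustration (not MMCV code):

// Minimal DeviceGuard illustration: while `guard` is alive, the current device
// is the one `value` lives on, so kernels and allocations target that GPU.
#include <ATen/ATen.h>
#include <ATen/DeviceGuard.h>

at::Tensor scale_on_own_device(const at::Tensor& value) {
  at::DeviceGuard guard(value.device());  // switch to value's device
  return value * 2;                       // computed on that device
}                                         // previous device restored here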
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
}

Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
                    float iou_threshold, float sigma, float min_score,
                    int method, int offset) {
  return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
                              sigma, min_score, method, offset);
}

std::vector<std::vector<int> > nms_match_impl(Tensor dets,
                                              float iou_threshold) {
  return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
}

Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return nms_impl(boxes, scores, iou_threshold, offset);
}

Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset) {
  return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,
                      method, offset);
}

std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
  return nms_match_impl(dets, iou_threshold);
}
@@ -2,120 +2,14 @@
// It is modified from https://github.com/WenmuZhou/PAN.pytorch

#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

std::vector<std::vector<float>> pixel_group_impl(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
  return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,
                              kernel_label, kernel_contour, kernel_region_num,
                              dis_threshold);
}

std::vector<std::vector<float>> pixel_group(
@@ -127,11 +21,6 @@ std::vector<std::vector<float>> pixel_group(
  kernel_label = kernel_label.contiguous();
  kernel_contour = kernel_contour.contiguous();

  return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour,
                          kernel_region_num, distance_threshold);
}
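The `pixel_group` wrapper calls `.contiguous()` on its label inputs before dispatching because the CPU implementation (moved to `pytorch/cpu` by this PR) walks raw `data_ptr` buffers with `y * width + x` arithmetic, which is only valid for a densely packed row-major layout. A small illustration of the same idea (not MMCV code):

// Why .contiguous() matters before raw data_ptr indexing: a transposed view
// shares storage, but its logical (y, x) layout no longer matches y*width+x.
#include <ATen/ATen.h>

float read_yx(const at::Tensor& t2d, int64_t y, int64_t x) {
  at::Tensor c = t2d.contiguous();   // row-major copy only if needed
  const float* p = c.data_ptr<float>();  // assumes a 2-D float tensor
  return p[y * c.size(1) + x];       // safe only because c is contiguous
}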
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
void points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,
int pts_num, const Tensor boxes, int pts_num, const Tensor boxes,
const Tensor pts, const Tensor pts,
Tensor box_idx_of_points) { Tensor box_idx_of_points) {
PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,
boxes, pts, box_idx_of_points); pts_num, boxes, pts, box_idx_of_points);
}; }
void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
void points_in_boxes_all_forward_cuda(int batch_size, int boxes_num, void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
int pts_num, const Tensor boxes, int pts_num, const Tensor boxes,
const Tensor pts, const Tensor pts,
Tensor box_idx_of_points) { Tensor box_idx_of_points) {
PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,
boxes, pts, box_idx_of_points); pts_num, boxes, pts, box_idx_of_points);
}; }
#endif
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor, void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
Tensor box_idx_of_points_tensor) { Tensor box_idx_of_points_tensor) {
...@@ -34,30 +23,12 @@ void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor, ...@@ -34,30 +23,12 @@ void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
// coordinate, z is the bottom center, each box params pts: (B, npoints, 3) // coordinate, z is the bottom center, each box params pts: (B, npoints, 3)
// [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),
// default -1 // default -1
int batch_size = boxes_tensor.size(0);
if (pts_tensor.device().is_cuda()) { int boxes_num = boxes_tensor.size(1);
#ifdef MMCV_WITH_CUDA int pts_num = pts_tensor.size(1);
CHECK_CUDA_INPUT(boxes_tensor); points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num,
CHECK_CUDA_INPUT(pts_tensor); boxes_tensor, pts_tensor,
CHECK_CUDA_INPUT(box_idx_of_points_tensor); box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_part_forward_cuda(batch_size, boxes_num, pts_num,
boxes_tensor, pts_tensor,
box_idx_of_points_tensor);
#else
AT_ERROR("points_in_boxes_part is not compiled with GPU support");
#endif
} else {
AT_ERROR("points_in_boxes_part is not implemented on CPU");
}
} }
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor, void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
...@@ -65,28 +36,9 @@ void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor, ...@@ -65,28 +36,9 @@ void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
// coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z] // coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z]
// in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1 // in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
int batch_size = boxes_tensor.size(0);
if (pts_tensor.device().is_cuda()) { int boxes_num = boxes_tensor.size(1);
#ifdef MMCV_WITH_CUDA int pts_num = pts_tensor.size(1);
CHECK_CUDA_INPUT(boxes_tensor); points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor,
CHECK_CUDA_INPUT(pts_tensor); pts_tensor, box_idx_of_points_tensor);
CHECK_CUDA_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_all_forward_cuda(batch_size, boxes_num, pts_num,
boxes_tensor, pts_tensor,
box_idx_of_points_tensor);
#else
AT_ERROR("points_in_boxes_all is not compiled with GPU support");
#endif
} else {
AT_ERROR("points_in_boxes_all is not implemented on CPU");
}
} }
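Every file in this refactor follows the pattern visible above: the CUDA-only wrapper with its #ifdef MMCV_WITH_CUDA / AT_ERROR branches collapses into a single device-agnostic *_impl function whose body is one DISPATCH_DEVICE_IMPL call, and the shape bookkeeping moves into the generic entry point. The sketch below is a minimal illustration of the idea behind such a dispatcher, assuming a per-op registry keyed by device type; it is not the contents of pytorch_device_registry.hpp, and the names register_points_in_boxes and dispatch_points_in_boxes are hypothetical.

#include <ATen/ATen.h>
#include <functional>
#include <map>
#include <stdexcept>

// One registry per op, keyed by device type; backends fill it in at load time.
using PointsInBoxesFn = std::function<void(int, int, int, const at::Tensor&,
                                           const at::Tensor&, at::Tensor&)>;

static std::map<at::DeviceType, PointsInBoxesFn>& points_in_boxes_registry() {
  static std::map<at::DeviceType, PointsInBoxesFn> registry;
  return registry;
}

// A backend (CPU, CUDA, ...) registers its kernel launcher once.
void register_points_in_boxes(at::DeviceType device, PointsInBoxesFn fn) {
  points_in_boxes_registry()[device] = fn;
}

// The generic wrapper resolves the implementation from the tensors' device,
// so it needs no #ifdef and no per-device copy of the argument plumbing.
void dispatch_points_in_boxes(int batch_size, int boxes_num, int pts_num,
                              const at::Tensor& boxes, const at::Tensor& pts,
                              at::Tensor& box_idx_of_points) {
  auto& registry = points_in_boxes_registry();
  auto it = registry.find(boxes.device().type());
  if (it == registry.end())
    throw std::runtime_error(
        "points_in_boxes: no implementation registered for this device");
  it->second(batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points);
}

With this shape, registering a CUDA launcher becomes a one-liner in the CUDA backend, and an unsupported device fails at dispatch time instead of being hard-coded as an AT_ERROR branch in every wrapper.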
@@ -2,255 +2,40 @@
// Modified from // Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src // https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifndef min void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output,
grad_input);
}
#ifdef MMCV_WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask);
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask);
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) { const int half_w_mask) {
PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature, DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,
w_feature, h_mask, w_mask, half_h_mask, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask); half_w_mask);
} }
void psamask_backward_cuda(const int psa_type, const Tensor grad_output, void psamask_backward_impl(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_, Tensor grad_input, const int num_,
const int h_feature, const int w_feature, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) { const int half_h_mask, const int half_w_mask) {
PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_, DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,
h_feature, w_feature, h_mask, w_mask, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_h_mask, half_w_mask); half_w_mask);
} }
#endif
void psamask_forward(const Tensor input, Tensor output, const int psa_type, void psamask_forward(const Tensor input, Tensor output, const int psa_type,
const int num_, const int h_feature, const int w_feature, const int num_, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask, const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) { const int half_w_mask) {
if (input.device().is_cuda()) { psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,
#ifdef MMCV_WITH_CUDA h_mask, w_mask, half_h_mask, half_w_mask);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(output);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
} else {
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
} }
void psamask_backward(Tensor grad_output, const Tensor grad_input, void psamask_backward(Tensor grad_output, const Tensor grad_input,
const int psa_type, const int num_, const int h_feature, const int psa_type, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_mask, const int w_feature, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) { const int half_h_mask, const int half_w_mask) {
if (grad_input.device().is_cuda()) { psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,
#ifdef MMCV_WITH_CUDA w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_output);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
} else {
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
} }
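The CPU reference loops deleted above (psamask_collect_forward and friends) are not lost; under the registry scheme they would live in a CPU backend file and attach themselves to the same dispatch keys that psamask_forward and psamask_backward now call through. A hedged sketch of that registration is shown below; it assumes the registry header exposes a REGISTER_DEVICE_IMPL(key, device, function) macro, and that macro name and signature are an assumption, not quoted from this diff.

// Hypothetical registration in a CPU backend file.
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
                         const int num_, const int h_feature,
                         const int w_feature, const int h_mask,
                         const int w_mask, const int half_h_mask,
                         const int half_w_mask);
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
                          Tensor grad_input, const int num_,
                          const int h_feature, const int w_feature,
                          const int h_mask, const int w_mask,
                          const int half_h_mask, const int half_w_mask);

// Assumed macro: ties the concrete functions to the dispatch keys used above.
REGISTER_DEVICE_IMPL(psamask_forward_impl, CPU, psamask_forward_cpu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, CPU, psamask_backward_cpu);

With a CPU and a CUDA function both registered under psamask_forward_impl, the runtime branch on input.device().is_cuda() that this hunk removes is no longer needed.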
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width, int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) { int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, argmax_x, aligned_height, aligned_width, spatial_scale,
spatial_scale, sampling_ratio, pool_mode, aligned); sampling_ratio, pool_mode, aligned);
} }
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y, void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width, int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) { int pool_mode, bool aligned) {
ROIAlignBackwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height, argmax_x, grad_input, aligned_height, aligned_width,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
} }
void roi_align_forward(Tensor input, Tensor rois, Tensor output, void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height, Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) { int sampling_ratio, int pool_mode, bool aligned) {
if (input.device().is_cuda()) { roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
#ifdef MMCV_WITH_CUDA aligned_height, aligned_width, spatial_scale,
CHECK_CUDA_INPUT(input); sampling_ratio, pool_mode, aligned);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
} }
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y, void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height, Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) { int sampling_ratio, int pool_mode, bool aligned) {
if (grad_output.device().is_cuda()) { roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
#ifdef MMCV_WITH_CUDA aligned_height, aligned_width, spatial_scale,
CHECK_CUDA_INPUT(grad_output); sampling_ratio, pool_mode, aligned);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
CHECK_CUDA_INPUT(grad_input);
roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(grad_output);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
CHECK_CPU_INPUT(grad_input);
roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
} }
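RoIAlign already shipped both a CPU and a CUDA launcher; after this change both sit behind roi_align_forward_impl, and the device of the arguments decides which one runs. Below is a hypothetical call site, assuming the extension is linked with both backends registered; the tensor shapes, the (batch_idx, x1, y1, x2, y2) roi layout, and the parameter values are made up for illustration.

#include <torch/torch.h>

// Declaration as in the wrapper above (Tensor is at::Tensor in the extension).
void roi_align_forward(torch::Tensor input, torch::Tensor rois,
                       torch::Tensor output, torch::Tensor argmax_y,
                       torch::Tensor argmax_x, int aligned_height,
                       int aligned_width, float spatial_scale,
                       int sampling_ratio, int pool_mode, bool aligned);

void roi_align_example() {
  torch::Tensor input    = torch::rand({1, 1, 8, 8});  // CPU tensors
  torch::Tensor rois     = torch::tensor({{0.f, 0.f, 0.f, 4.f, 4.f}});
  torch::Tensor output   = torch::zeros({1, 1, 2, 2});
  torch::Tensor argmax_y = torch::zeros({1, 1, 2, 2});
  torch::Tensor argmax_x = torch::zeros({1, 1, 2, 2});

  // Every tensor lives on the CPU, so the dispatcher routes this call to the
  // registered CPU implementation.
  roi_align_forward(input, rois, output, argmax_y, argmax_x,
                    /*aligned_height=*/2, /*aligned_width=*/2,
                    /*spatial_scale=*/1.f, /*sampling_ratio=*/0,
                    /*pool_mode=*/0, /*aligned=*/true);

  // Moving the same tensors to CUDA routes the identical call to the CUDA
  // kernels, with no #ifdef at the call site:
  // roi_align_forward(input.cuda(), rois.cuda(), output.cuda(), ...);
}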
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
void ROIAlignRotatedForwardCUDAKernelLauncher(
const at::Tensor features, const at::Tensor rois, const float spatial_scale,
const int sample_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int aligned_height, const int aligned_width, at::Tensor output);
void ROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
const int sample_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int aligned_height, const int aligned_width, at::Tensor bottom_grad);
void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
int aligned_height, int aligned_width, int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio, float spatial_scale, int sample_ratio,
bool aligned, bool clockwise) { bool aligned, bool clockwise) {
// Number of ROIs DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output,
int num_rois = rois.size(0); aligned_height, aligned_width, spatial_scale,
int size_rois = rois.size(1); sample_ratio, aligned, clockwise);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
int num_channels = features.size(1);
int data_height = features.size(2);
int data_width = features.size(3);
ROIAlignRotatedForwardCUDAKernelLauncher(
features, rois, spatial_scale, sample_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, output);
} }
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois, void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height, Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale, int aligned_width, float spatial_scale,
int sample_ratio, bool aligned, int sample_ratio, bool aligned,
bool clockwise) { bool clockwise) {
// Number of ROIs DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,
int num_rois = rois.size(0); bottom_grad, aligned_height, aligned_width,
int size_rois = rois.size(1); spatial_scale, sample_ratio, aligned, clockwise);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
int num_channels = bottom_grad.size(1);
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
ROIAlignRotatedBackwardCUDAKernelLauncher(
top_grad, rois, spatial_scale, sample_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, bottom_grad);
}
#endif
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void ROIAlignRotatedBackwardCPULauncher(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
ROIAlignRotatedBackwardCPULauncher(
top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
} }
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output, void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width, int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) { bool aligned, bool clockwise) {
if (input.device().is_cuda()) { roi_align_rotated_forward_impl(input, rois, output, aligned_height,
#ifdef MMCV_WITH_CUDA aligned_width, spatial_scale, sampling_ratio,
CHECK_CUDA_INPUT(input); aligned, clockwise);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
roi_align_rotated_forward_cuda(input, rois, output, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
#else
AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
roi_align_rotated_forward_cpu(input, rois, output, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
} }
void roi_align_rotated_backward(Tensor top_grad, Tensor rois, void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
@@ -124,25 +35,7 @@ void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
int aligned_width, float spatial_scale, int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned, int sampling_ratio, bool aligned,
bool clockwise) { bool clockwise) {
if (top_grad.device().is_cuda()) { roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height,
#ifdef MMCV_WITH_CUDA aligned_width, spatial_scale, sampling_ratio,
CHECK_CUDA_INPUT(top_grad); aligned, clockwise);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(bottom_grad);
roi_align_rotated_backward_cuda(top_grad, rois, bottom_grad, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
#else
AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(top_grad);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(bottom_grad);
roi_align_rotated_backward_cpu(top_grad, rois, bottom_grad, aligned_height,
aligned_width, spatial_scale, sampling_ratio,
aligned, clockwise);
}
} }
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale);
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width, Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) { float spatial_scale) {
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height, DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,
pooled_width, spatial_scale); pooled_height, pooled_width, spatial_scale);
} }
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax, void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height, Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) { int pooled_width, float spatial_scale) {
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input, DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,
pooled_height, pooled_width, spatial_scale); grad_input, pooled_height, pooled_width, spatial_scale);
} }
#endif
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax, void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
int pooled_height, int pooled_width, int pooled_height, int pooled_width,
float spatial_scale) { float spatial_scale) {
if (input.device().is_cuda()) { roi_pool_forward_impl(input, rois, output, argmax, pooled_height,
#ifdef MMCV_WITH_CUDA pooled_width, spatial_scale);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax);
roi_pool_forward_cuda(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
#else
AT_ERROR("RoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIPool is not implemented on CPU");
}
} }
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax, void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height, int pooled_width, Tensor grad_input, int pooled_height, int pooled_width,
float spatial_scale) { float spatial_scale) {
if (grad_output.device().is_cuda()) { roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,
#ifdef MMCV_WITH_CUDA pooled_width, spatial_scale);
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(grad_input);
roi_pool_backward_cuda(grad_output, rois, argmax, grad_input, pooled_height,
pooled_width, spatial_scale);
#else
AT_ERROR("RoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("RoIPool is not implemented on CPU");
}
} }
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
void RoiawarePool3dForwardCUDAKernelLauncher(
int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
int out_y, int out_z, const Tensor rois, const Tensor pts,
const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
Tensor pooled_features, int pool_method);
void roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y, int max_pts_each_voxel, int out_x, int out_y,
int out_z, const Tensor rois, int out_z, const Tensor rois,
const Tensor pts, const Tensor pts_feature, const Tensor pts, const Tensor pts_feature,
Tensor argmax, Tensor pts_idx_of_voxels, Tensor argmax, Tensor pts_idx_of_voxels,
Tensor pooled_features, int pool_method) { Tensor pooled_features, int pool_method) {
RoiawarePool3dForwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num,
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, channels, max_pts_each_voxel, out_x, out_y, out_z, rois,
rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, pts, pts_feature, argmax, pts_idx_of_voxels,
pool_method); pooled_features, pool_method);
}; }
void RoiawarePool3dBackwardCUDAKernelLauncher(
int boxes_num, int out_x, int out_y, int out_z, int channels,
int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
const Tensor grad_out, Tensor grad_in, int pool_method);
void roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y, void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
int out_z, int channels, int out_z, int channels,
int max_pts_each_voxel, int max_pts_each_voxel,
const Tensor pts_idx_of_voxels, const Tensor pts_idx_of_voxels,
const Tensor argmax, const Tensor grad_out, const Tensor argmax, const Tensor grad_out,
Tensor grad_in, int pool_method) { Tensor grad_in, int pool_method) {
RoiawarePool3dBackwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y,
boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel, out_z, channels, max_pts_each_voxel, pts_idx_of_voxels,
pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method); argmax, grad_out, grad_in, pool_method);
}; }
#endif
void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature, void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
Tensor argmax, Tensor pts_idx_of_voxels, Tensor argmax, Tensor pts_idx_of_voxels,
@@ -47,36 +35,20 @@ void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
// params pooled_features: (N, out_x, out_y, out_z, C) // params pooled_features: (N, out_x, out_y, out_z, C)
// params pool_method: 0: max_pool 1: avg_pool // params pool_method: 0: max_pool 1: avg_pool
if (pts.device().is_cuda()) { int boxes_num = rois.size(0);
#ifdef MMCV_WITH_CUDA int pts_num = pts.size(0);
CHECK_CUDA_INPUT(rois); int channels = pts_feature.size(1);
CHECK_CUDA_INPUT(pts); int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
CHECK_CUDA_INPUT(pts_feature); int out_x = pts_idx_of_voxels.size(1);
CHECK_CUDA_INPUT(argmax); int out_y = pts_idx_of_voxels.size(2);
CHECK_CUDA_INPUT(pts_idx_of_voxels); int out_z = pts_idx_of_voxels.size(3);
CHECK_CUDA_INPUT(pooled_features); assert((out_x < 256) && (out_y < 256) &&
(out_z < 256)); // we encode index with 8bit
int boxes_num = rois.size(0); roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel,
int pts_num = pts.size(0); out_x, out_y, out_z, rois, pts, pts_feature,
int channels = pts_feature.size(1); argmax, pts_idx_of_voxels, pooled_features,
int max_pts_each_voxel = pool_method);
pts_idx_of_voxels.size(4); // index 0 is the counter
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
assert((out_x < 256) && (out_y < 256) &&
(out_z < 256)); // we encode index with 8bit
roiaware_pool3d_forward_cuda(boxes_num, pts_num, channels,
max_pts_each_voxel, out_x, out_y, out_z, rois,
pts, pts_feature, argmax, pts_idx_of_voxels,
pooled_features, pool_method);
#else
AT_ERROR("roiaware_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roiaware_pool3d is not implemented on CPU");
}
} }
void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax, void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
@@ -87,29 +59,14 @@ void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
// params grad_out: (N, out_x, out_y, out_z, C) // params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value // params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool // params pool_method: 0: max_pool 1: avg_pool
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
if (grad_in.device().is_cuda()) { roiaware_pool3d_backward_impl(boxes_num, out_x, out_y, out_z, channels,
#ifdef MMCV_WITH_CUDA max_pts_each_voxel, pts_idx_of_voxels, argmax,
CHECK_CUDA_INPUT(pts_idx_of_voxels); grad_out, grad_in, pool_method);
CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(grad_out);
CHECK_CUDA_INPUT(grad_in);
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel =
pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
roiaware_pool3d_backward_cuda(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels, argmax,
grad_out, grad_in, pool_method);
#else
AT_ERROR("roiaware_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roiaware_pool3d is not implemented on CPU");
}
} }
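To make the shape bookkeeping that now lives in these wrappers concrete, here is a worked example with hypothetical sizes; the layouts follow the comments above (rois (N, 7), pts_idx_of_voxels (N, out_x, out_y, out_z, max_pts_each_voxel), grad_out (N, out_x, out_y, out_z, C)).

#include <torch/torch.h>

void roiaware_pool3d_shape_example() {
  // Hypothetical sizes, chosen only for illustration.
  torch::Tensor rois              = torch::zeros({128, 7});
  torch::Tensor pts               = torch::zeros({16384, 3});
  torch::Tensor pts_feature       = torch::zeros({16384, 16});
  torch::Tensor pts_idx_of_voxels = torch::zeros({128, 12, 12, 12, 128},
                                                 torch::kInt);
  // roiaware_pool3d_forward would derive from these:
  //   boxes_num = 128, pts_num = 16384, channels = 16,
  //   out_x = out_y = out_z = 12, max_pts_each_voxel = 128,
  // and its assert holds because out_x, out_y and out_z are all below 256
  // (voxel indices are encoded with 8 bits, per the comment in the wrapper).
}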
@@ -7,24 +7,18 @@ All Rights Reserved 2018.
*/ */
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
void RoIPointPool3dForwardCUDAKernelLauncher(
int batch_size, int pts_num, int boxes_num, int feature_in_len,
int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);
void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,
int feature_in_len, int sampled_pts_num, int feature_in_len, int sampled_pts_num,
const Tensor xyz, const Tensor boxes3d, const Tensor xyz, const Tensor boxes3d,
const Tensor pts_feature, const Tensor pts_feature,
Tensor pooled_features, Tensor pooled_features,
Tensor pooled_empty_flag) { Tensor pooled_empty_flag) {
RoIPointPool3dForwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz, boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,
boxes3d, pts_feature, pooled_features, pooled_empty_flag); pts_feature, pooled_features, pooled_empty_flag);
}; }
#endif
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature, void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
Tensor pooled_features, Tensor pooled_empty_flag) { Tensor pooled_features, Tensor pooled_empty_flag) {
@@ -33,28 +27,13 @@ void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
// params pts_feature: (B, N, C) // params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C) // params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M) // params pooled_empty_flag: (B, M)
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
if (xyz.device().is_cuda()) { roipoint_pool3d_forward_impl(batch_size, pts_num, boxes_num, feature_in_len,
#ifdef MMCV_WITH_CUDA sampled_pts_num, xyz, boxes3d, pts_feature,
CHECK_CUDA_INPUT(xyz); pooled_features, pooled_empty_flag);
CHECK_CUDA_INPUT(boxes3d);
CHECK_CUDA_INPUT(pts_feature);
CHECK_CUDA_INPUT(pooled_features);
CHECK_CUDA_INPUT(pooled_empty_flag);
int batch_size = xyz.size(0);
int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2);
roipoint_pool3d_forward_cuda(batch_size, pts_num, boxes_num, feature_in_len,
sampled_pts_num, xyz, boxes3d, pts_feature,
pooled_features, pooled_empty_flag);
#else
AT_ERROR("roipoint_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roipoint_pool3d is not implemented on CPU");
}
} }
// Copyright (c) OpenMMLab. All rights reserved. // Copyright (c) OpenMMLab. All rights reserved.
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
#ifdef MMCV_WITH_CUDA std::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(
std::vector<torch::Tensor> DynamicPointToVoxelForwardCUDAKernelLauncher(
const torch::Tensor &feats, const torch::Tensor &coors,
const reduce_t reduce_type);
std::vector<torch::Tensor> dynamic_point_to_voxel_forward_cuda(
const torch::Tensor &feats, const torch::Tensor &coors, const torch::Tensor &feats, const torch::Tensor &coors,
const reduce_t reduce_type) { const reduce_t reduce_type) {
return DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors, return DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, feats, coors,
reduce_type); reduce_type);
}; }
void DynamicPointToVoxelBackwardCUDAKernelLauncher(
torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats, const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
const reduce_t reduce_type);
void dynamic_point_to_voxel_backward_cuda( void dynamic_point_to_voxel_backward_impl(
torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats, torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats,
const torch::Tensor &feats, const torch::Tensor &reduced_feats, const torch::Tensor &feats, const torch::Tensor &reduced_feats,
const torch::Tensor &coors_idx, const torch::Tensor &reduce_count, const torch::Tensor &coors_idx, const torch::Tensor &reduce_count,
const reduce_t reduce_type) { const reduce_t reduce_type) {
DynamicPointToVoxelBackwardCUDAKernelLauncher(grad_feats, grad_reduced_feats, DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, grad_feats,
feats, reduced_feats, coors_idx, grad_reduced_feats, feats, reduced_feats, coors_idx,
reduce_count, reduce_type); reduce_count, reduce_type);
}; }
#endif
std::vector<at::Tensor> dynamic_point_to_voxel_forward_cpu(
const at::Tensor &points, const at::Tensor &voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range);
inline reduce_t convert_reduce_type(const std::string &reduce_type) { inline reduce_t convert_reduce_type(const std::string &reduce_type) {
if (reduce_type == "max") if (reduce_type == "max")
@@ -51,19 +36,8 @@ inline reduce_t convert_reduce_type(const std::string &reduce_type) {
std::vector<torch::Tensor> dynamic_point_to_voxel_forward( std::vector<torch::Tensor> dynamic_point_to_voxel_forward(
const torch::Tensor &feats, const torch::Tensor &coors, const torch::Tensor &feats, const torch::Tensor &coors,
const std::string &reduce_type) { const std::string &reduce_type) {
if (feats.device().is_cuda()) { return dynamic_point_to_voxel_forward_impl(feats, coors,
#ifdef MMCV_WITH_CUDA convert_reduce_type(reduce_type));
CHECK_CUDA_INPUT(feats);
CHECK_CUDA_INPUT(coors);
return dynamic_point_to_voxel_forward_cuda(
feats, coors, convert_reduce_type(reduce_type));
#else
AT_ERROR("dynamic_point_to_voxel is not compiled with GPU support");
#endif
} else {
AT_ERROR("dynamic_point_to_voxel is not implemented on CPU");
return std::vector<torch::Tensor>();
}
} }
void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats, void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
@@ -73,21 +47,7 @@ void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
const torch::Tensor &coors_idx, const torch::Tensor &coors_idx,
const torch::Tensor &reduce_count, const torch::Tensor &reduce_count,
const std::string &reduce_type) { const std::string &reduce_type) {
if (grad_feats.device().is_cuda()) { dynamic_point_to_voxel_backward_impl(grad_feats, grad_reduced_feats, feats,
#ifdef MMCV_WITH_CUDA reduced_feats, coors_idx, reduce_count,
CHECK_CUDA_INPUT(grad_feats); convert_reduce_type(reduce_type));
CHECK_CUDA_INPUT(grad_reduced_feats);
CHECK_CUDA_INPUT(feats);
CHECK_CUDA_INPUT(reduced_feats);
CHECK_CUDA_INPUT(coors_idx);
CHECK_CUDA_INPUT(reduce_count);
dynamic_point_to_voxel_backward_cuda(grad_feats, grad_reduced_feats, feats,
reduced_feats, coors_idx, reduce_count,
convert_reduce_type(reduce_type));
#else
AT_ERROR("dynamic_point_to_voxel is not compiled with GPU support");
#endif
} else {
AT_ERROR("dynamic_point_to_voxel is not implemented on CPU");
}
} }
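Unlike the void ops earlier in this diff, dynamic_point_to_voxel_forward returns its result through the dispatcher (return DISPATCH_DEVICE_IMPL(...)), so the registry has to forward return values as well. The snippet below is a hypothetical, simplified illustration of that shape, not the real macro; reduce_type is passed as a plain int here only to keep the sketch self-contained.

#include <ATen/ATen.h>
#include <functional>
#include <map>
#include <vector>

using DynamicP2VForwardFn = std::function<std::vector<at::Tensor>(
    const at::Tensor&, const at::Tensor&, int /*reduce_type*/)>;

static std::map<at::DeviceType, DynamicP2VForwardFn> p2v_forward_registry;

std::vector<at::Tensor> dispatch_dynamic_point_to_voxel_forward(
    const at::Tensor& feats, const at::Tensor& coors, int reduce_type) {
  // map::at throws if no backend registered itself for this device, which
  // plays the role of the old AT_ERROR branch.
  return p2v_forward_registry.at(feats.device().type())(feats, coors,
                                                        reduce_type);
}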
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void sync_bn_forward_mean_impl(const Tensor input, Tensor mean) {
void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean); DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean);
void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
Tensor var);
void SyncBNForwardOutputCUDAKernelLauncher(
const Tensor input, const Tensor mean, const Tensor var,
Tensor running_mean, Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
float momentum, int group_size);
void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
const Tensor norm,
Tensor grad_weight,
Tensor grad_bias);
void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
const Tensor weight,
const Tensor grad_weight,
const Tensor grad_bias,
const Tensor norm, const Tensor std,
Tensor grad_input);
void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
SyncBNForwardMeanCUDAKernelLauncher(input, mean);
} }
void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean, void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
Tensor var) { Tensor var) {
SyncBNForwardVarCUDAKernelLauncher(input, mean, var); DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var);
} }
void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean, void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
const Tensor var, Tensor running_mean, const Tensor var, Tensor running_mean,
Tensor running_var, const Tensor weight, Tensor running_var, const Tensor weight,
const Tensor bias, Tensor norm, Tensor std, const Tensor bias, Tensor norm, Tensor std,
Tensor output, float eps, float momentum, Tensor output, float eps, float momentum,
int group_size) { int group_size) {
SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean, DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var,
running_var, weight, bias, norm, std, running_mean, running_var, weight, bias, norm, std,
output, eps, momentum, group_size); output, eps, momentum, group_size);
} }
void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm, void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias) { Tensor grad_weight, Tensor grad_bias) {
SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight, DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm,
grad_bias); grad_weight, grad_bias);
} }
void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight, void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight, const Tensor grad_weight,
const Tensor grad_bias, const Tensor norm, const Tensor grad_bias, const Tensor norm,
const Tensor std, Tensor grad_input) { const Tensor std, Tensor grad_input) {
SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight, DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight,
grad_bias, norm, std, grad_input); grad_weight, grad_bias, norm, std, grad_input);
} }
#endif
void sync_bn_forward_mean(const Tensor input, Tensor mean) { void sync_bn_forward_mean(const Tensor input, Tensor mean) {
if (input.device().is_cuda()) { sync_bn_forward_mean_impl(input, mean);
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
sync_bn_forward_mean_cuda(input, mean);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
} }
void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) { void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
if (input.device().is_cuda()) { sync_bn_forward_var_impl(input, mean, var);
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
sync_bn_forward_var_cuda(input, mean, var);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
} }
void sync_bn_forward_output(const Tensor input, const Tensor mean, void sync_bn_forward_output(const Tensor input, const Tensor mean,
@@ -95,65 +50,20 @@ void sync_bn_forward_output(const Tensor input, const Tensor mean,
Tensor running_var, Tensor norm, Tensor std, Tensor running_var, Tensor norm, Tensor std,
Tensor output, float eps, float momentum, Tensor output, float eps, float momentum,
int group_size) { int group_size) {
if (input.device().is_cuda()) { sync_bn_forward_output_impl(input, mean, var, running_mean, running_var,
#ifdef MMCV_WITH_CUDA weight, bias, norm, std, output, eps, momentum,
CHECK_CUDA_INPUT(input); group_size);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(running_mean);
CHECK_CUDA_INPUT(running_var);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(output);
sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
weight, bias, norm, std, output, eps, momentum,
group_size);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
} }
void sync_bn_backward_param(const Tensor grad_output, const Tensor norm, void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
Tensor grad_weight, Tensor grad_bias) { Tensor grad_weight, Tensor grad_bias) {
if (grad_output.device().is_cuda()) { sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias);
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
} }
void sync_bn_backward_data(const Tensor grad_output, const Tensor weight, void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
const Tensor grad_weight, const Tensor grad_bias, const Tensor grad_weight, const Tensor grad_bias,
const Tensor norm, const Tensor std, const Tensor norm, const Tensor std,
Tensor grad_input) { Tensor grad_input) {
if (grad_output.device().is_cuda()) { sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm,
#ifdef MMCV_WITH_CUDA std, grad_input);
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(norm);
CHECK_CUDA_INPUT(std);
CHECK_CUDA_INPUT(grad_input);
sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias,
norm, std, grad_input);
#else
AT_ERROR("SyncBatchNorm is not compiled with GPU support");
#endif
} else {
AT_ERROR("SyncBatchNorm is not implemented on CPU");
}
} }
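The five SyncBN entry points above are meant to be driven as a pipeline by the Python layer, with cross-process reduction between the statistics kernels. The outline below is an assumption about that typical calling sequence, not something this diff shows; the all_reduce placement in particular is hedged.

// Assumed calling sequence (comments only; synchronization happens outside
// these kernels, e.g. via torch.distributed in the Python wrapper):
//   sync_bn_forward_mean(input, mean);      // per-rank mean
//   ... all_reduce mean across the group ...
//   sync_bn_forward_var(input, mean, var);  // per-rank variance
//   ... all_reduce var across the group ...
//   sync_bn_forward_output(input, mean, var, running_mean, running_var,
//                          weight, bias, norm, std, output, eps, momentum,
//                          group_size);     // normalize with synced stats
//
// The backward pass is assumed to mirror this: sync_bn_backward_param computes
// the weight and bias gradients, which are reduced across the group before
// sync_bn_backward_data produces grad_input.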
@@ -2,60 +2,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void three_interpolate_forward_impl(int b, int c, int m, int n,
void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
const Tensor points,
const Tensor idx,
const Tensor weight, Tensor out);
void three_interpolate_forward_cuda(int b, int c, int m, int n,
const Tensor points, const Tensor idx, const Tensor points, const Tensor idx,
const Tensor weight, Tensor out) { const Tensor weight, Tensor out) {
ThreeInterpolateForwardCUDAKernelLauncher(b, c, m, n, points, idx, weight, DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx,
out); weight, out);
}; }
void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
const Tensor grad_out,
const Tensor idx,
const Tensor weight,
Tensor grad_points);
void three_interpolate_backward_cuda(int b, int c, int n, int m, void three_interpolate_backward_impl(int b, int c, int n, int m,
const Tensor grad_out, const Tensor idx, const Tensor grad_out, const Tensor idx,
const Tensor weight, Tensor grad_points) { const Tensor weight, Tensor grad_points) {
ThreeInterpolateBackwardCUDAKernelLauncher(b, c, n, m, grad_out, idx, weight, DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out,
grad_points); idx, weight, grad_points);
}; }
#endif
void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor, void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor out_tensor, int b, Tensor weight_tensor, Tensor out_tensor, int b,
int c, int m, int n) { int c, int m, int n) {
if (points_tensor.device().is_cuda()) { three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA weight_tensor, out_tensor);
three_interpolate_forward_cuda(b, c, m, n, points_tensor, idx_tensor,
weight_tensor, out_tensor);
#else
AT_ERROR("three_interpolate is not compiled with GPU support");
#endif
} else {
AT_ERROR("three_interpolate is not implemented on CPU");
}
} }
void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor, void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor weight_tensor, Tensor grad_points_tensor, Tensor weight_tensor, Tensor grad_points_tensor,
int b, int c, int n, int m) { int b, int c, int n, int m) {
if (grad_out_tensor.device().is_cuda()) { three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA weight_tensor, grad_points_tensor);
three_interpolate_backward_cuda(b, c, n, m, grad_out_tensor, idx_tensor,
weight_tensor, grad_points_tensor);
#else
AT_ERROR("three_interpolate is not compiled with GPU support");
#endif
} else {
AT_ERROR("three_interpolate is not implemented on CPU");
}
} }