Unverified Commit 230f9a3b authored by q.yao's avatar q.yao Committed by GitHub
Browse files

Refactor csrc with device dispatcher (#1463)

* Add device registry for pytorch ops

* add declaration of CheckDeviceConsistency

* fix for torch130

* assert with torch check

* Refactor ops with dispatch

* update rest ops

* faster install

* update compatibility

* update compatibility, rename parameter

* move CPU implementation to pytorch/cpu

* update ops/csrc/README.md

* fix rocm support

* update cn document

* update docs

* list instead of map
parent ef8ba752
......@@ -100,25 +100,20 @@ THE POSSIBILITY OF SUCH DAMAGES.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

// Dispatcher entry for fused bias + LeakyReLU. Forwards to whichever backend
// registered an implementation for the device of the input tensors.
// act/grad select the activation/gradient mode; alpha is the negative slope,
// scale a post-activation multiplier (exact semantics defined by the
// registered kernel — confirm against the backend implementation).
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
                                           const torch::Tensor& bias,
                                           const torch::Tensor& refer, int act,
                                           int grad, float alpha, float scale) {
  return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
                              act, grad, alpha, scale);
}
// Public binding for fused bias + LeakyReLU. Device availability checks now
// live in the per-device implementations registered with the dispatcher, so
// this wrapper simply forwards all arguments.
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
void gather_points_forward_impl(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx,
Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);
};
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
idx, out);
}
void gather_points_backward_cuda(int b, int c, int n, int npoints,
void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,
grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
idx, grad_points);
}
// Python-facing binding for gather_points forward; device selection is
// handled by the dispatcher, so no explicit CUDA/CPU branching here.
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
                           Tensor out_tensor, int b, int c, int n,
                           int npoints) {
  gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
                             out_tensor);
}
// Python-facing binding for gather_points backward; forwards directly to the
// device dispatcher.
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                            Tensor grad_points_tensor, int b, int c, int n,
                            int npoints) {
  gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
                              grad_points_tensor);
}
......@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor points,
const Tensor idx, Tensor out);
void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor points, const Tensor idx,
Tensor out) {
GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,
out);
};
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points, idx, out);
}
void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,
idx, grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
grad_out, idx, grad_points);
}
// Python-facing binding for group_points forward; device selection is handled
// by the dispatcher.
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points_tensor, idx_tensor, out_tensor);
}
// Python-facing binding for group_points backward; forwards to the device
// dispatcher through group_points_backward_impl.
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}
......@@ -8,68 +8,35 @@ All Rights Reserved 2019-2020.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
#ifdef MMCV_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#define CHECK_ERROR(state) \
{ gpuAssert((state), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_overlap) {
IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
};
void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
num_b, boxes_b, ans_overlap);
}
void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou) {
IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_iou);
};
void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
boxes_b, ans_iou);
}
void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long *mask,
void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
};
void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask,
int boxes_num,
float nms_overlap_thresh);
DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long *mask,
void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
#endif
DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
// Public binding: BEV overlap between two box sets.
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap) {
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
                                       ans_overlap);
}
// Public binding: BEV IoU between two box sets.
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_iou: (N, M)
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                 Tensor ans_iou) {
  int num_a = boxes_a.size(0);
  int num_b = boxes_b.size(0);

  iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
}
// Public binding: rotated 3D NMS.
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
// params keep: (N) output indices of kept boxes (int64)
// params keep_num: scalar output, number of kept boxes
// The device kernel fills a pairwise suppression bitmask; the greedy sweep
// that selects survivors runs on the host over a CPU copy of that mask.
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                       float nms_overlap_thresh) {
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  // One bit per box pair, packed into 64-bit words per row.
  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;

  // Greedy sweep: keep box i unless an earlier kept box suppresses it, then
  // OR its suppression row into the running mask.
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
......@@ -180,53 +110,42 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
// params keep: (N)
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_cuda(boxes, mask_data, boxes_num,
nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
*keep_num_data = num_to_keep;
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
nms_overlap_thresh);
#else
AT_ERROR("iou3d_nms_normal is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms_normal is not implemented on CPU");
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
}
*keep_num_data = num_to_keep;
}
......@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
Tensor idx, Tensor dist2);
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
dist2);
}
#endif
// Python-facing binding for KNN; input validation lives in the per-device
// implementation, so this simply forwards to the dispatcher.
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int height,
const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w);
DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
col, kernel_h, kernel_w, pad_h, pad_w);
}
// Dispatch masked col2im (inverse of masked im2col) to the backend registered
// for the device of the input tensors.
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}
// Python-facing binding for masked im2col; device/input checks live in the
// registered per-device implementations.
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}
// Python-facing binding for masked col2im; forwards directly to the device
// dispatcher.
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void modulated_deformable_im2col_cuda(
void modulated_deformable_im2col_impl(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
void modulated_deformable_col2im_cuda(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
void modulated_deformable_col2im_coord_cuda(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
#endif
void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
const int dilation_w, const int deformable_group, Tensor data_col) {
DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, data_col);
}
void modulated_deformable_col2im_cpu(
void modulated_deformable_col2im_impl(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im);
const int dilation_w, const int deformable_group, Tensor grad_im) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, grad_im);
}
void modulated_deformable_col2im_coord_cpu(
void modulated_deformable_col2im_coord_impl(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
Tensor grad_offset, Tensor grad_mask) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
data_im, data_offset, data_mask, batch_size, channels,
height_im, width_im, height_col, width_col, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, deformable_group, grad_offset, grad_mask);
}
void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
......@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(columns);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
......@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
output.size(2), output.size(3)});
for (int b = 0; b < batch; b++) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
}
modulated_deformable_im2col_impl(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
// divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1),
......@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(columns);
CHECK_CPU_INPUT(grad_input);
CHECK_CPU_INPUT(grad_weight);
CHECK_CPU_INPUT(grad_bias);
CHECK_CPU_INPUT(grad_offset);
CHECK_CPU_INPUT(grad_mask);
CHECK_CPU_INPUT(grad_output);
}
at::DeviceGuard guard(input.device());
const int batch = input.size(0);
......@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cpu(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cpu(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
}
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_impl(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_impl(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_impl(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
......
......@@ -10,43 +10,39 @@
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
Tensor ms_deform_attn_cuda_forward(const Tensor &value,
Tensor ms_deform_attn_impl_forward(const Tensor &value,
const Tensor &spatial_shapes,
const Tensor &level_start_index,
const Tensor &sampling_loc,
const Tensor &attn_weight,
const int im2col_step);
const int im2col_step) {
return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
spatial_shapes, level_start_index, sampling_loc,
attn_weight, im2col_step);
}
void ms_deform_attn_cuda_backward(
void ms_deform_attn_impl_backward(
const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index, const Tensor &sampling_loc,
const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step);
#endif
Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
const int im2col_step) {
DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
level_start_index, sampling_loc, attn_weight,
grad_output, grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
}
// Public binding for multi-scale deformable attention forward. Pins the
// current device to that of `value` before dispatching; input validation is
// handled by the registered per-device implementations.
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight,
                              const int im2col_step) {
  at::DeviceGuard guard(value.device());
  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
......@@ -56,26 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc,
Tensor &grad_attn_weight, const int im2col_step) {
if (value.type().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(value)
CHECK_CUDA_INPUT(spatial_shapes)
CHECK_CUDA_INPUT(level_start_index)
CHECK_CUDA_INPUT(sampling_loc)
CHECK_CUDA_INPUT(attn_weight)
CHECK_CUDA_INPUT(grad_output)
CHECK_CUDA_INPUT(grad_value)
CHECK_CUDA_INPUT(grad_sampling_loc)
CHECK_CUDA_INPUT(grad_attn_weight)
at::DeviceGuard guard(value.device());
ms_deform_attn_cuda_backward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
AT_ERROR("Not implemented on the CPU");
}
at::DeviceGuard guard(value.device());
ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc, grad_attn_weight,
im2col_step);
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale);
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,
pooled_height, pooled_width, spatial_scale);
}
void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax,
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale);
DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,
grad_input, pooled_height, pooled_width, spatial_scale);
}
#endif
// Public RoIPool forward op: thin wrapper over the device-dispatched
// implementation (the old per-device CHECK_CUDA_INPUT branch now lives in
// the backend registration).
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width,
                      float spatial_scale) {
  roi_pool_forward_impl(input, rois, output, argmax, pooled_height,
                        pooled_width, spatial_scale);
}
// Public RoIPool backward op: thin wrapper over the device-dispatched
// implementation; grad_input receives the computed gradient in place.
void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale) {
  roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,
                         pooled_width, spatial_scale);
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment