Unverified Commit a4dc2a72 authored by pc's avatar pc Committed by GitHub
Browse files

support device dispatch in parrots (#1588)

parent 0bcbeadb
...@@ -2,31 +2,16 @@ ...@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap // https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
Tensor idx, Tensor dist2);
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) { const Tensor new_xyz, Tensor idx, Tensor dist2) {
KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2); DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
dist2);
} }
#endif
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor, void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor dist2_tensor, int b, int n, int m, int nsample) { Tensor dist2_tensor, int b, int n, int m, int nsample) {
if (new_xyz_tensor.device().is_cuda()) { knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(new_xyz_tensor);
CHECK_CUDA_INPUT(xyz_tensor);
knn_forward_cuda(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
dist2_tensor); dist2_tensor);
#else
AT_ERROR("knn is not compiled with GPU support");
#endif
} else {
AT_ERROR("knn is not implemented on CPU");
}
} }
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int height,
const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col, const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) { const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw) DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh) col, kernel_h, kernel_w, pad_h, pad_w);
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w);
} }
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx, void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height, const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) { int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw) DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh) im, height, width, channels);
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels);
} }
#endif
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx, void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col, const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) { const int pad_h, const int pad_w) {
if (im.device().is_cuda()) { masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(im);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w); kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
} }
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx, void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height, const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) { int width, int channels) {
if (col.device().is_cuda()) { masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(col);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels); channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
} }
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void modulated_deformable_im2col_impl(
void modulated_deformable_im2col_cuda(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask, const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col, const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
void modulated_deformable_col2im_cuda(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h, const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im); const int dilation_w, const int deformable_group, Tensor data_col) {
DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
void modulated_deformable_col2im_coord_cuda( data_mask, batch_size, channels, height_im, width_im,
const Tensor data_col, const Tensor data_im, const Tensor data_offset, height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
const Tensor data_mask, const int batch_size, const int channels, stride_h, stride_w, dilation_h, dilation_w,
const int height_im, const int width_im, const int height_col, deformable_group, data_col);
const int width_col, const int kernel_h, const int kernel_w, }
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask);
#endif
void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col);
void modulated_deformable_col2im_cpu( void modulated_deformable_col2im_impl(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask, const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col, const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h, const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im); const int dilation_w, const int deformable_group, Tensor grad_im) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
data_mask, batch_size, channels, height_im, width_im,
height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
deformable_group, grad_im);
}
void modulated_deformable_col2im_coord_cpu( void modulated_deformable_col2im_coord_impl(
const Tensor data_col, const Tensor data_im, const Tensor data_offset, const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels, const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col, const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w, const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group, const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask); Tensor grad_offset, Tensor grad_mask) {
DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
data_im, data_offset, data_mask, batch_size, channels,
height_im, width_im, height_col, width_col, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, deformable_group, grad_offset, grad_mask);
}
void modulated_deform_conv_forward( void modulated_deform_conv_forward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset, Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
...@@ -61,31 +51,6 @@ void modulated_deform_conv_forward( ...@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group, const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias) { const int deformable_group, const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(columns);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(columns);
}
at::DeviceGuard guard(input.device()); at::DeviceGuard guard(input.device());
const int batch = input.size(0); const int batch = input.size(0);
...@@ -127,19 +92,10 @@ void modulated_deform_conv_forward( ...@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
output.size(2), output.size(3)}); output.size(2), output.size(3)});
for (int b = 0; b < batch; b++) { for (int b = 0; b < batch; b++) {
if (input.device().is_cuda()) { modulated_deformable_im2col_impl(
#ifdef MMCV_WITH_CUDA
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out, input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns); dilation_h, dilation_w, deformable_group, columns);
}
// divide into group // divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1), weight = weight.view({group, weight.size(0) / group, weight.size(1),
...@@ -174,41 +130,6 @@ void modulated_deform_conv_backward( ...@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) { const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(weight);
CHECK_CPU_INPUT(bias);
CHECK_CPU_INPUT(ones);
CHECK_CPU_INPUT(offset);
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(columns);
CHECK_CPU_INPUT(grad_input);
CHECK_CPU_INPUT(grad_weight);
CHECK_CPU_INPUT(grad_bias);
CHECK_CPU_INPUT(grad_offset);
CHECK_CPU_INPUT(grad_mask);
CHECK_CPU_INPUT(grad_output);
}
at::DeviceGuard guard(input.device()); at::DeviceGuard guard(input.device());
const int batch = input.size(0); const int batch = input.size(0);
...@@ -261,46 +182,24 @@ void modulated_deform_conv_backward( ...@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)}); weight.size(3), weight.size(4)});
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
// gradient w.r.t. input coordinate data // gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda( modulated_deformable_col2im_coord_impl(
columns, input[b], offset[b], mask[b], 1, channels, height, width, columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]); grad_mask[b]);
// gradient w.r.t. input data // gradient w.r.t. input data
modulated_deformable_col2im_cuda( modulated_deformable_col2im_impl(
columns, offset[b], mask[b], 1, channels, height, width, height_out, columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]); dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and // gradient w.r.t. weight, dWeight should accumulate across the batch and
// group // group
modulated_deformable_im2col_cuda( modulated_deformable_im2col_impl(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
#endif
} else {
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cpu(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cpu(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cpu(
input[b], offset[b], mask[b], 1, channels, height, width, height_out, input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns); dilation_h, dilation_w, deformable_group, columns);
}
columns = columns.view({group, columns.size(0) / group, columns.size(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group, grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
// Bilinear sample of `input` at the fractional position (h, w).
//
// `input` is addressed as a row-major plane whose row stride is `data_width`.
// The four integer neighbours of (h, w) are blended with the usual bilinear
// weights. A neighbour is only read when its index passes the guard below
// (low index >= 0, high index <= height - 1 / width - 1); a neighbour that
// fails its guard contributes 0 to the result.
template <typename T>
T dmcn_im2col_bilinear_cpu(const T *input, const int data_width,
                           const int height, const int width, T h, T w) {
  // Integer corners surrounding the sampling position.
  const int top = floorf(h);
  const int left = floorf(w);
  const int bottom = top + 1;
  const int right = left + 1;

  // Fractional distance from the top-left corner, and its complement.
  const T frac_h = h - top;
  const T frac_w = w - left;
  const T rest_h = 1 - frac_h;
  const T rest_w = 1 - frac_w;

  // Per-side validity; each corner needs one row guard and one column guard.
  const bool top_ok = top >= 0;
  const bool left_ok = left >= 0;
  const bool bottom_ok = bottom <= height - 1;
  const bool right_ok = right <= width - 1;

  const T top_left =
      (top_ok && left_ok) ? input[top * data_width + left] : static_cast<T>(0);
  const T top_right = (top_ok && right_ok) ? input[top * data_width + right]
                                           : static_cast<T>(0);
  const T bottom_left = (bottom_ok && left_ok)
                            ? input[bottom * data_width + left]
                            : static_cast<T>(0);
  const T bottom_right = (bottom_ok && right_ok)
                             ? input[bottom * data_width + right]
                             : static_cast<T>(0);

  // Bilinear weights of the four corners.
  const T weight_tl = rest_h * rest_w;
  const T weight_tr = rest_h * frac_w;
  const T weight_bl = frac_h * rest_w;
  const T weight_br = frac_h * frac_w;

  return (weight_tl * top_left + weight_tr * top_right +
          weight_bl * bottom_left + weight_br * bottom_right);
}
// Bilinear weight with which the integer pixel (h, w) takes part in a sample
// taken at the fractional position (argmax_h, argmax_w).
//
// Returns 0 when the sampling position is outside (-1, height) x (-1, width)
// or when (h, w) is not one of the four integer neighbours of that position.
template <typename T>
T dmcn_get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
                               const int height, const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    // Sampling position is out of range: nothing is contributed.
    return 0;
  }

  const int low_h = floorf(argmax_h);
  const int low_w = floorf(argmax_w);
  const int high_h = low_h + 1;
  const int high_w = low_w + 1;

  // low and high differ by one, so at most one row case and one column case
  // can match; pick the corresponding product directly.
  if (h == low_h) {
    if (w == low_w) return (h + 1 - argmax_h) * (w + 1 - argmax_w);
    if (w == high_w) return (h + 1 - argmax_h) * (argmax_w + 1 - w);
  } else if (h == high_h) {
    if (w == low_w) return (argmax_h + 1 - h) * (w + 1 - argmax_w);
    if (w == high_w) return (argmax_h + 1 - h) * (argmax_w + 1 - w);
  }
  return 0;
}
// Signed, image-weighted sum over the four integer neighbours of the
// fractional position (argmax_h, argmax_w) in the plane `im_data` (row stride
// `data_width`).
//
// `bp_dir` selects the variant: 0 weights the neighbours by their column
// distances and subtracts the upper row from the lower row; 1 weights them by
// their row distances and subtracts the left column from the right column.
// Any other `bp_dir`, or a position outside (-1, height) x (-1, width),
// yields 0. Neighbours failing their index guard are skipped.
template <typename T>
T dmcn_get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
                                 const int width, const T *im_data,
                                 const int data_width, const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    // Sampling position is out of range.
    return 0;
  }

  const int low_h = floorf(argmax_h);
  const int low_w = floorf(argmax_w);
  const int high_h = low_h + 1;
  const int high_w = low_w + 1;

  // Which sides of the 2x2 neighbourhood may be read.
  const bool has_top = low_h >= 0;
  const bool has_left = low_w >= 0;
  const bool has_bottom = high_h <= height - 1;
  const bool has_right = high_w <= width - 1;

  // Row start offsets of the upper and lower neighbour rows.
  const int top_row = low_h * data_width;
  const int bottom_row = high_h * data_width;

  T weight = 0;
  switch (bp_dir) {
    case 0:
      if (has_top && has_left)
        weight += -1 * (low_w + 1 - argmax_w) * im_data[top_row + low_w];
      if (has_top && has_right)
        weight += -1 * (argmax_w - low_w) * im_data[top_row + high_w];
      if (has_bottom && has_left)
        weight += (low_w + 1 - argmax_w) * im_data[bottom_row + low_w];
      if (has_bottom && has_right)
        weight += (argmax_w - low_w) * im_data[bottom_row + high_w];
      break;
    case 1:
      if (has_top && has_left)
        weight += -1 * (low_h + 1 - argmax_h) * im_data[top_row + low_w];
      if (has_top && has_right)
        weight += (low_h + 1 - argmax_h) * im_data[top_row + high_w];
      if (has_bottom && has_left)
        weight += -1 * (argmax_h - low_h) * im_data[bottom_row + low_w];
      if (has_bottom && has_right)
        weight += (argmax_h - low_h) * im_data[bottom_row + high_w];
      break;
    default:
      // Unknown direction: weight stays 0.
      break;
  }
  return weight;
}
// Serial im2col for modulated deformable convolution.
//
// Each of the `n` flat indices is decomposed into (c_im, b_col, h_col, w_col).
// For every kernel tap (i, j) the input plane of channel `c_im` is sampled
// bilinearly at the tap position shifted by the learned offset, multiplied by
// the tap's mask value, and stored in `data_col`. Samples whose position is
// outside (-1, height) x (-1, width) are written as 0 * mask.
template <typename T>
void modulated_deformable_im2col_cpu_kernel(
    const int n, const T *data_im, const T *data_offset, const T *data_mask,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  // Distance in data_col between two consecutive kernel taps of one channel.
  const int tap_stride = batch_size * height_col * width_col;

  for (int index = 0; index < n; index++) {
    // Split the flat output index, fastest-varying dimension first.
    const int w_col = index % width_col;
    const int row = index / width_col;
    const int h_col = row % height_col;
    const int plane = row / height_col;
    const int b_col = plane % batch_size;
    const int c_im = plane / batch_size;

    // First column-buffer channel belonging to this input channel.
    const int c_col = c_im * kernel_h * kernel_w;
    // Offsets and masks are shared per (batch, deformable group).
    const int group_slot =
        b_col * deformable_group + c_im / channel_per_deformable_group;

    // Top-left corner of the receptive field in input coordinates.
    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    const T *im_plane =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T *offsets = data_offset + group_slot * 2 * kernel_h * kernel_w *
                                         height_col * width_col;
    const T *masks =
        data_mask + group_slot * kernel_h * kernel_w * height_col * width_col;
    T *out =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;

    int tap = 0;  // always equals i * kernel_w + j
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j, ++tap) {
        // Offsets interleave (h, w) per tap: channel 2*tap is h, 2*tap+1 is w.
        const T offset_h =
            offsets[((2 * tap) * height_col + h_col) * width_col + w_col];
        const T offset_w =
            offsets[((2 * tap + 1) * height_col + h_col) * width_col + w_col];
        const T mask = masks[(tap * height_col + h_col) * width_col + w_col];

        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;

        T sampled = static_cast<T>(0);
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
          sampled = dmcn_im2col_bilinear_cpu(im_plane, width, height, width,
                                             h_im, w_im);
        }
        *out = sampled * mask;
        out += tap_stride;
      }
    }
  }
}
// Serial col2im for modulated deformable convolution (gradient w.r.t. the
// input image).
//
// Each of the `n` flat indices addresses one column-buffer element
// (c, i, j, b, h_out, w_out). Its value, scaled by the matching mask entry, is
// scattered into `grad_im` at the integer pixels lying within distance < 1 of
// the deformed sampling position, weighted by dmcn_get_gradient_weight_cpu.
// `grad_im` is accumulated into (+=), never overwritten.
template <typename T>
void modulated_deformable_col2im_cpu_kernel(
    const int n, const T *data_col, const T *data_offset, const T *data_mask,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  for (int index = 0; index < n; index++) {
    // Peel the flat index apart, fastest-varying dimension first.
    int rest = index;
    const int w_out = rest % width_col;
    rest /= width_col;
    const int h_out = rest % height_col;
    rest /= height_col;
    const int b = rest % batch_size;
    rest /= batch_size;
    const int j = rest % kernel_w;
    rest /= kernel_w;
    const int i = rest % kernel_h;
    const int c = rest / kernel_h;

    // Offsets and masks are shared per (batch, deformable group).
    const int group_slot =
        b * deformable_group + c / channel_per_deformable_group;
    const T *offsets = data_offset + group_slot * 2 * kernel_h * kernel_w *
                                         height_col * width_col;
    const T *masks =
        data_mask + group_slot * kernel_h * kernel_w * height_col * width_col;

    // Offsets interleave (h, w) per tap: channel 2*tap is h, 2*tap+1 is w.
    const int tap = i * kernel_w + j;
    const T offset_h =
        offsets[((2 * tap) * height_col + h_out) * width_col + w_out];
    const T offset_w =
        offsets[((2 * tap + 1) * height_col + h_out) * width_col + w_out];
    const T mask = masks[(tap * height_col + h_out) * width_col + w_out];

    // Deformed sampling position in input coordinates.
    const int w_in = w_out * stride_w - pad_w;
    const int h_in = h_out * stride_h - pad_h;
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[index] * mask;
    const int cur_h = static_cast<int>(cur_inv_h_data);
    const int cur_w = static_cast<int>(cur_inv_w_data);

    // Visit the 5x5 window around the truncated position and keep only the
    // in-bounds pixels that are closer than 1 to the sampling position.
    for (int dy = -2; dy <= 2; dy++) {
      const int y = cur_h + dy;
      if (y < 0 || y >= height) continue;
      if (!(abs(cur_inv_h_data - y) < 1)) continue;
      for (int dx = -2; dx <= 2; dx++) {
        const int x = cur_w + dx;
        if (x < 0 || x >= width) continue;
        if (!(abs(cur_inv_w_data - x) < 1)) continue;

        const int grad_pos = ((b * channels + c) * height + y) * width + x;
        const T weight = dmcn_get_gradient_weight_cpu(
            cur_inv_h_data, cur_inv_w_data, y, x, height, width);
        grad_im[grad_pos] += weight * cur_top_grad;
      }
    }
  }
}
// Serial kernel producing the gradients w.r.t. the sampling offsets and the
// modulation mask.
//
// Each of the `n` flat indices is decomposed into (b, c, h, w), where `c` runs
// over `offset_channels`. For that element the loop below accumulates
//   val  - sum of coordinate_weight * data_col * mask, written to
//          grad_offset[index];
//   mval - sum of data_col * bilinear sample of data_im, written to grad_mask
//          only for even `offset_c`.
// Both outputs are assigned (=), not accumulated.
template <typename T>
void modulated_deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const T *data_mask, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
for (int index = 0; index < n; index++) {
T val = 0, mval = 0;
// Split the flat index, fastest-varying dimension first: (b, c, h, w).
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
// Each deformable group owns 2 * kernel_h * kernel_w offset channels.
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
// Step of col_c in the accumulation loop below.
const int col_step = kernel_h * kernel_w;
// Iteration counter of the accumulation loop; selects the image plane
// (data_im_ptr + cnt * height * width) read in that iteration.
int cnt = 0;
// Base pointers of this (batch, deformable group) slice in each buffer.
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
// Offset channel relative to the start of its deformable group.
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
// Starts at column channel offset_c / 2 and advances by col_step.
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
// Parity of offset_c; passed as bp_dir to dmcn_get_coordinate_weight_cpu.
const int bp_dir = offset_c % 2;
// Recover the kernel tap (i, j) and output location from col_pos.
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
// Offsets interleave two channels per tap: 2*tap is read as offset_h,
// 2*tap+1 as offset_w.
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
// Deformed sampling position in input coordinates.
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
// Out-of-range positions are replaced by -2, which makes
// dmcn_get_coordinate_weight_cpu return 0 (it rejects values <= -1);
// they also add nothing to mval.
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
else
mval += data_col_ptr[col_pos] *
dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width,
width, height, width, inv_h, inv_w);
const T weight = dmcn_get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
// The mask has one channel per kernel tap while offsets have two, so the
// mask gradient is stored only once per tap (for even offset_c).
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
// CPU entry point for the modulated deformable im2col step.
//
// Dispatches on the scalar type of `data_im` (floating types and half) and
// runs modulated_deformable_im2col_cpu_kernel over
// channels * batch_size * height_col * width_col output elements, writing the
// sampled and mask-scaled values into `data_col`.
//
// All tensors are accessed through raw data_ptr<scalar_t>() pointers, so
// `data_offset`, `data_mask` and `data_col` are read/written with the scalar
// type of `data_im`.
// NOTE(review): raw-pointer indexing presumably requires contiguous tensors;
// confirm that callers guarantee this.
void modulated_deformable_im2col_cpu(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  // Input channels sharing one set of offsets/masks.
  const int channel_per_deformable_group = channels / deformable_group;
  // One kernel iteration per (channel, batch, output row, output column).
  const int num_kernels = channels * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "modulated_deformable_im2col_cpu", ([&] {
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();

        modulated_deformable_im2col_cpu_kernel(
            num_kernels, data_im_, data_offset_, data_mask_, height_im,
            width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group, batch_size,
            channels, deformable_group, height_col, width_col, data_col_);
      }));
}
void modulated_deformable_col2im_cpu(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_cpu_kernel(
num_kernels, data_col_, data_offset_, data_mask_, channels,
height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, deformable_group, height_col, width_col, grad_im_);
}));
}
void modulated_deformable_col2im_coord_cpu(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_cpu_kernel(
num_kernels, data_col_, data_im_, data_offset_, data_mask_,
channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size,
2 * kernel_h * kernel_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_, grad_mask_);
}));
}
...@@ -10,42 +10,39 @@ ...@@ -10,42 +10,39 @@
*/ */
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA Tensor ms_deform_attn_impl_forward(const Tensor &value,
Tensor ms_deform_attn_cuda_forward(const Tensor &value,
const Tensor &spatial_shapes, const Tensor &spatial_shapes,
const Tensor &level_start_index, const Tensor &level_start_index,
const Tensor &sampling_loc, const Tensor &sampling_loc,
const Tensor &attn_weight, const Tensor &attn_weight,
const int im2col_step); const int im2col_step) {
return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
spatial_shapes, level_start_index, sampling_loc,
attn_weight, im2col_step);
}
void ms_deform_attn_cuda_backward( void ms_deform_attn_impl_backward(
const Tensor &value, const Tensor &spatial_shapes, const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index, const Tensor &sampling_loc, const Tensor &level_start_index, const Tensor &sampling_loc,
const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value, const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step); Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
const int im2col_step) {
#endif DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
level_start_index, sampling_loc, attn_weight,
grad_output, grad_value, grad_sampling_loc,
grad_attn_weight, im2col_step);
}
Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes, Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &level_start_index, const Tensor &level_start_index,
const Tensor &sampling_loc, const Tensor &sampling_loc,
const Tensor &attn_weight, const Tensor &attn_weight,
const int im2col_step) { const int im2col_step) {
if (value.type().is_cuda()) { at::DeviceGuard guard(value.device());
#ifdef MMCV_WITH_CUDA return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
CHECK_CUDA_INPUT(value)
CHECK_CUDA_INPUT(spatial_shapes)
CHECK_CUDA_INPUT(level_start_index)
CHECK_CUDA_INPUT(sampling_loc)
CHECK_CUDA_INPUT(attn_weight)
return ms_deform_attn_cuda_forward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, im2col_step); sampling_loc, attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
} }
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes, void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
...@@ -55,25 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes, ...@@ -55,25 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
const Tensor &grad_output, Tensor &grad_value, const Tensor &grad_output, Tensor &grad_value,
Tensor &grad_sampling_loc, Tensor &grad_sampling_loc,
Tensor &grad_attn_weight, const int im2col_step) { Tensor &grad_attn_weight, const int im2col_step) {
if (value.type().is_cuda()) { at::DeviceGuard guard(value.device());
#ifdef MMCV_WITH_CUDA ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
CHECK_CUDA_INPUT(value)
CHECK_CUDA_INPUT(spatial_shapes)
CHECK_CUDA_INPUT(level_start_index)
CHECK_CUDA_INPUT(sampling_loc)
CHECK_CUDA_INPUT(attn_weight)
CHECK_CUDA_INPUT(grad_output)
CHECK_CUDA_INPUT(grad_value)
CHECK_CUDA_INPUT(grad_sampling_loc)
CHECK_CUDA_INPUT(grad_attn_weight)
ms_deform_attn_cuda_backward(value, spatial_shapes, level_start_index,
sampling_loc, attn_weight, grad_output, sampling_loc, attn_weight, grad_output,
grad_value, grad_sampling_loc, grad_value, grad_sampling_loc, grad_attn_weight,
grad_attn_weight, im2col_step); im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
} else {
AT_ERROR("Not implemented on the CPU");
}
} }
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
int offset);
Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);
} }
#endif
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto nboxes = boxes.size(0);
Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto select = select_t.data_ptr<bool>(); Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
return order_t.masked_select(select_t);
}
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CUDA_INPUT(scores);
return nms_cuda(boxes, scores, iou_threshold, offset);
#else
AT_ERROR("nms is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(boxes);
CHECK_CPU_INPUT(scores);
return nms_cpu(boxes, scores, iou_threshold, offset);
}
}
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
float iou_threshold, float sigma, float min_score, float iou_threshold, float sigma, float min_score,
int method, int offset) { int method, int offset) {
if (boxes.numel() == 0) { return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
return at::empty({0}, boxes.options().dtype(at::kLong)); sigma, min_score, method, offset);
} }
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto nboxes = boxes.size(0);
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos] = x1[i];
y1[max_pos] = y1[i];
x2[max_pos] = x2[i];
y2[max_pos] = y2[i];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i] = ix1;
y1[i] = iy1;
x2[i] = ix2;
y2[i] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = std::min(iy2, y2[pos]);
auto w = std::max(0.f, xx2 - xx1 + offset); std::vector<std::vector<int> > nms_match_impl(Tensor dets,
auto h = std::max(0.f, yy2 - yy1 + offset); float iou_threshold) {
auto inter = w * h; return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
auto ovr = inter / (iarea + areas[pos] - inter); }
float weight = 1.; Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (method == 0) { return nms_impl(boxes, scores, iou_threshold, offset);
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos] = x1[nboxes - 1];
y1[pos] = y1[nboxes - 1];
x2[pos] = x2[nboxes - 1];
y2[pos] = y2[nboxes - 1];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
return inds_t.slice(0, 0, nboxes);
} }
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold, Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
float sigma, float min_score, int method, int offset) { float sigma, float min_score, int method, int offset) {
if (boxes.device().is_cuda()) { return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,
AT_ERROR("softnms is not implemented on GPU");
} else {
return softnms_cpu(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset); method, offset);
}
}
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (int i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
} }
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) { std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
if (dets.device().is_cuda()) { return nms_match_impl(dets, iou_threshold);
AT_ERROR("nms_match is not implemented on GPU");
} else {
return nms_match_cpu(dets, iou_threshold);
}
} }
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
// Greedy non-maximum suppression for rotated boxes on the CPU.
//
// Boxes are visited in descending `scores` order. Each box that has not been
// suppressed is kept, and every later box whose rotated IoU with it (computed
// by single_box_iou_rotated from box_iou_rotated_utils) is >= `iou_threshold`
// is marked as suppressed.
//
// @param dets          CPU tensor indexed by box along dim 0; each row is
//                      passed as a raw pointer to single_box_iou_rotated
//                      (row layout is whatever that helper expects —
//                      TODO confirm against box_iou_rotated_utils.hpp).
// @param scores        CPU tensor of the same tensor type as `dets`, sorted
//                      along dim 0 to obtain the visiting order.
// @param iou_threshold Overlap at or above which a box is suppressed.
// @return int64 tensor holding the indices (into `dets`) of the kept boxes,
//         in descending score order; empty when `dets` has no elements.
template <typename scalar_t>
Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
                              const float iou_threshold) {
  // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
  // however, the code in this function is much shorter because
  // we delegate the IoU computation for rotated boxes to
  // the single_box_iou_rotated function in box_iou_rotated_utils.h
  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
  AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
  AT_ASSERTM(dets.type() == scores.type(),
             "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

  // single_box_iou_rotated reads each box through a raw pointer, so the
  // elements of a row must be densely packed. contiguous() is a no-op for
  // already-contiguous input and makes strided views (e.g. column slices)
  // safe to index by pointer arithmetic.
  const Tensor dets_c = dets.contiguous();
  auto dets_data = dets_c.data_ptr<scalar_t>();
  const int64_t row_stride = dets_c.stride(0);

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();

  int64_t num_to_keep = 0;

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1) {
      continue;
    }

    keep[num_to_keep++] = i;

    // The reference box is invariant across the inner loop; resolve its row
    // pointer once instead of building a temporary row tensor per pair.
    auto box_i = dets_data + i * row_stride;

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1) {
        continue;
      }

      auto ovr = single_box_iou_rotated<scalar_t>(
          box_i, dets_data + j * row_stride, 0);
      if (ovr >= iou_threshold) {
        suppressed[j] = 1;
      }
    }
  }
  // Only the first num_to_keep slots of keep_t were written.
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
// Type-dispatching front end for rotated NMS on the CPU.
//
// Selects the nms_rotated_cpu_kernel instantiation that matches the floating
// point scalar type of `dets` and returns the tensor of kept box indices it
// produces.
//
// @param dets          Boxes tensor; its scalar type selects the kernel.
// @param scores        Per-box scores forwarded to the kernel.
// @param iou_threshold IoU threshold forwarded to the kernel.
// @return The kernel's result (indices of kept boxes).
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
                       const float iou_threshold) {
  // Placeholder that is overwritten inside the dispatch lambda.
  auto result = at::empty({0}, dets.options());
  // Dispatch on scalar_type() rather than the deprecated type(): this matches
  // the other AT_DISPATCH call sites and keeps the code building with PyTorch
  // versions that no longer accept a Type argument here.
  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
    result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
  });
  return result;
}
...@@ -2,120 +2,14 @@ ...@@ -2,120 +2,14 @@
// It is modified from https://github.com/WenmuZhou/PAN.pytorch // It is modified from https://github.com/WenmuZhou/PAN.pytorch
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
std::vector<std::vector<float>> estimate_confidence(int32_t* label, std::vector<std::vector<float>> pixel_group_impl(
float* score, int label_num,
int height, int width) {
std::vector<std::vector<float>> point_vector;
for (int i = 0; i < label_num; i++) {
std::vector<float> point;
point.push_back(0);
point.push_back(0);
point_vector.push_back(point);
}
for (int y = 0; y < height; y++) {
auto label_tmp = label + y * width;
auto score_tmp = score + y * width;
for (int x = 0; x < width; x++) {
auto l = label_tmp[x];
if (l > 0) {
float confidence = score_tmp[x];
point_vector[l].push_back(x);
point_vector[l].push_back(y);
point_vector[l][0] += confidence;
point_vector[l][1] += 1;
}
}
}
for (int l = 0; l < point_vector.size(); l++)
if (point_vector[l][1] > 0) {
point_vector[l][0] /= point_vector[l][1];
}
return point_vector;
}
std::vector<std::vector<float>> pixel_group_cpu(
Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
Tensor kernel_contour, int kernel_region_num, float dis_threshold) { Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
assert(score.dim() == 2); return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,
assert(mask.dim() == 2); kernel_label, kernel_contour, kernel_region_num,
assert(embedding_dim.dim() == 3); dis_threshold);
int height = score.size(0);
int width = score.size(1);
assert(height == mask.size(0) == embedding.size(1) == kernel_label.size(1));
assert(width == mask.size(1) == embedding.size(2) == kernel_label.size(2));
auto threshold_square = dis_threshold * dis_threshold;
auto ptr_score = score.data_ptr<float>();
auto ptr_mask = mask.data_ptr<bool>();
auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();
auto ptr_embedding = embedding.data_ptr<float>();
auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();
std::queue<std::tuple<int, int, int32_t>> contour_pixels;
auto embedding_dim = embedding.size(2);
std::vector<std::vector<float>> kernel_vector(
kernel_region_num, std::vector<float>(embedding_dim + 1, 0));
Tensor text_label;
text_label = kernel_label.clone();
auto ptr_text_label = text_label.data_ptr<int32_t>();
for (int i = 0; i < height; i++) {
auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;
auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;
auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;
for (int j = 0, k = 0; j < width && k < width * embedding_dim;
j++, k += embedding_dim) {
int32_t label = ptr_kernel_label_tmp[j];
if (label > 0) {
for (int d = 0; d < embedding_dim; d++)
kernel_vector[label][d] += ptr_embedding_tmp[k + d];
kernel_vector[label][embedding_dim] += 1;
// kernel pixel number
if (ptr_kernel_contour_tmp[j]) {
contour_pixels.push(std::make_tuple(i, j, label));
}
}
}
}
for (int i = 0; i < kernel_region_num; i++) {
for (int j = 0; j < embedding_dim; j++) {
kernel_vector[i][j] /= kernel_vector[i][embedding_dim];
}
}
int dx[4] = {-1, 1, 0, 0};
int dy[4] = {0, 0, -1, 1};
while (!contour_pixels.empty()) {
auto query_pixel = contour_pixels.front();
contour_pixels.pop();
int y = std::get<0>(query_pixel);
int x = std::get<1>(query_pixel);
int32_t l = std::get<2>(query_pixel);
auto kernel_cv = kernel_vector[l];
for (int idx = 0; idx < 4; idx++) {
int tmpy = y + dy[idx];
int tmpx = x + dx[idx];
auto ptr_text_label_tmp = ptr_text_label + tmpy * width;
if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;
if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0)
continue;
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
for (size_t i = 0; i < embedding_dim; i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
if (dis >= threshold_square) break;
}
if (dis >= threshold_square) continue;
contour_pixels.push(std::make_tuple(tmpy, tmpx, l));
ptr_text_label_tmp[tmpx] = l;
}
}
return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,
height, width);
} }
std::vector<std::vector<float>> pixel_group( std::vector<std::vector<float>> pixel_group(
...@@ -127,11 +21,6 @@ std::vector<std::vector<float>> pixel_group( ...@@ -127,11 +21,6 @@ std::vector<std::vector<float>> pixel_group(
kernel_label = kernel_label.contiguous(); kernel_label = kernel_label.contiguous();
kernel_contour = kernel_contour.contiguous(); kernel_contour = kernel_contour.contiguous();
CHECK_CPU_INPUT(score); return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour,
CHECK_CPU_INPUT(mask);
CHECK_CPU_INPUT(embedding);
CHECK_CPU_INPUT(kernel_label);
CHECK_CPU_INPUT(kernel_contour);
return pixel_group_cpu(score, mask, embedding, kernel_label, kernel_contour,
kernel_region_num, distance_threshold); kernel_region_num, distance_threshold);
} }
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
void points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,
int pts_num, const Tensor boxes, int pts_num, const Tensor boxes,
const Tensor pts, const Tensor pts,
Tensor box_idx_of_points) { Tensor box_idx_of_points) {
PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,
boxes, pts, box_idx_of_points); pts_num, boxes, pts, box_idx_of_points);
}; }
void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
void points_in_boxes_all_forward_cuda(int batch_size, int boxes_num, void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
int pts_num, const Tensor boxes, int pts_num, const Tensor boxes,
const Tensor pts, const Tensor pts,
Tensor box_idx_of_points) { Tensor box_idx_of_points) {
PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,
boxes, pts, box_idx_of_points); pts_num, boxes, pts, box_idx_of_points);
}; }
#endif
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor, void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
Tensor box_idx_of_points_tensor) { Tensor box_idx_of_points_tensor) {
...@@ -34,30 +23,12 @@ void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor, ...@@ -34,30 +23,12 @@ void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
// coordinate, z is the bottom center, each box params pts: (B, npoints, 3) // coordinate, z is the bottom center, each box params pts: (B, npoints, 3)
// [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),
// default -1 // default -1
if (pts_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_tensor);
CHECK_CUDA_INPUT(pts_tensor);
CHECK_CUDA_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0); int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1); int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1); int pts_num = pts_tensor.size(1);
points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num,
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_part_forward_cuda(batch_size, boxes_num, pts_num,
boxes_tensor, pts_tensor, boxes_tensor, pts_tensor,
box_idx_of_points_tensor); box_idx_of_points_tensor);
#else
AT_ERROR("points_in_boxes_part is not compiled with GPU support");
#endif
} else {
AT_ERROR("points_in_boxes_part is not implemented on CPU");
}
} }
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor, void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
...@@ -65,28 +36,9 @@ void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor, ...@@ -65,28 +36,9 @@ void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
// params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
// coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z] // coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z]
// in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1 // in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
if (pts_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_tensor);
CHECK_CUDA_INPUT(pts_tensor);
CHECK_CUDA_INPUT(box_idx_of_points_tensor);
int batch_size = boxes_tensor.size(0); int batch_size = boxes_tensor.size(0);
int boxes_num = boxes_tensor.size(1); int boxes_num = boxes_tensor.size(1);
int pts_num = pts_tensor.size(1); int pts_num = pts_tensor.size(1);
points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor,
const float *boxes = boxes_tensor.data_ptr<float>(); pts_tensor, box_idx_of_points_tensor);
const float *pts = pts_tensor.data_ptr<float>();
int *box_idx_of_points = box_idx_of_points_tensor.data_ptr<int>();
points_in_boxes_all_forward_cuda(batch_size, boxes_num, pts_num,
boxes_tensor, pts_tensor,
box_idx_of_points_tensor);
#else
AT_ERROR("points_in_boxes_all is not compiled with GPU support");
#endif
} else {
AT_ERROR("points_in_boxes_all is not implemented on CPU");
}
} }
#include "pytorch_cpp_helper.hpp"
// Rotates the planar offset (shift_x, shift_y) by the angle -rz and stores
// the rotated components in local_x / local_y.
inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
                                      float &local_x, float &local_y) {
  const float angle = -rz;
  const float cos_a = cos(angle);
  const float sin_a = sin(angle);

  local_x = shift_x * cos_a + shift_y * (-sin_a);
  local_y = shift_x * sin_a + shift_y * cos_a;
}
// Tests whether a point lies inside a rotated 3D box.
//
// @param pt      (x, y, z)
// @param box3d   (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR
//                coordinate, with cz at the bottom center of the box
// @param local_x, local_y  Receive the point's planar offset from the box
//                center expressed in the box frame. They are written only
//                when the point passes the height test.
// @return 1 when the point is strictly inside the box footprint and within
//         its height range, 0 otherwise.
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
                                 float &local_x, float &local_y) {
  const float px = pt[0], py = pt[1], pz = pt[2];
  const float cx = box3d[0], cy = box3d[1];
  const float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5];
  const float rz = box3d[6];

  // box3d[2] is the bottom center; shift it up by half the height to get the
  // geometric center along z.
  const float center_z = box3d[2] + z_size / 2.0;

  // Reject early when the point is above or below the box.
  if (fabsf(pz - center_z) > z_size / 2.0) return 0;

  lidar_to_local_coords_cpu(px - cx, py - cy, rz, local_x, local_y);

  const bool inside_x = (local_x > -x_size / 2.0) && (local_x < x_size / 2.0);
  const bool inside_y = (local_y > -y_size / 2.0) && (local_y < y_size / 2.0);
  return (inside_x && inside_y) ? 1 : 0;
}
// For every (box, point) pair, records whether the point lies inside the box.
//
// @param boxes_tensor        (N, 7) [x, y, z, x_size, y_size, z_size, rz] in
//                            LiDAR coordinate, z is the bottom center
// @param pts_tensor          (npoints, 3) [x, y, z] in LiDAR coordinate
// @param pts_indices_tensor  (N, npoints) output; entry (i, j) is set to the
//                            result of check_pt_in_box3d_cpu for point j and
//                            box i
// All three tensors must be contiguous, since they are indexed through raw
// pointers.
void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                 Tensor pts_indices_tensor) {
  CHECK_CONTIGUOUS(boxes_tensor);
  CHECK_CONTIGUOUS(pts_tensor);
  CHECK_CONTIGUOUS(pts_indices_tensor);

  const int boxes_num = boxes_tensor.size(0);
  const int pts_num = pts_tensor.size(0);

  const float *boxes = boxes_tensor.data_ptr<float>();
  const float *pts = pts_tensor.data_ptr<float>();
  int *pts_indices = pts_indices_tensor.data_ptr<int>();

  // Scratch outputs required by check_pt_in_box3d_cpu; not used afterwards.
  float scratch_x = 0, scratch_y = 0;

  for (int box_id = 0; box_id < boxes_num; box_id++) {
    const float *box = boxes + box_id * 7;
    int *flags_row = pts_indices + box_id * pts_num;
    for (int pt_id = 0; pt_id < pts_num; pt_id++) {
      flags_row[pt_id] =
          check_pt_in_box3d_cpu(pts + pt_id * 3, box, scratch_x, scratch_y);
    }
  }
}
This diff is collapsed.
// Copyright (c) OpenMMLab. All rights reserved // Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width, int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) { int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, argmax_x, aligned_height, aligned_width, spatial_scale,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned); sampling_ratio, pool_mode, aligned);
} }
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y, void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width, int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio, float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) { int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input, DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,
aligned_height, aligned_width, spatial_scale, argmax_x, grad_input, aligned_height, aligned_width,
sampling_ratio, pool_mode, aligned); spatial_scale, sampling_ratio, pool_mode, aligned);
} }
void roi_align_forward(Tensor input, Tensor rois, Tensor output, void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height, Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) { int sampling_ratio, int pool_mode, bool aligned) {
if (input.device().is_cuda()) { roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale, aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned); sampling_ratio, pool_mode, aligned);
}
} }
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y, void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height, Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) { int sampling_ratio, int pool_mode, bool aligned) {
if (grad_output.device().is_cuda()) { roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
CHECK_CUDA_INPUT(grad_input);
roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(grad_output);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
CHECK_CPU_INPUT(grad_input);
roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale, aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned); sampling_ratio, pool_mode, aligned);
}
} }
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -7,24 +7,18 @@ All Rights Reserved 2018. ...@@ -7,24 +7,18 @@ All Rights Reserved 2018.
*/ */
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
void RoIPointPool3dForwardCUDAKernelLauncher(
int batch_size, int pts_num, int boxes_num, int feature_in_len,
int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);
void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,
int feature_in_len, int sampled_pts_num, int feature_in_len, int sampled_pts_num,
const Tensor xyz, const Tensor boxes3d, const Tensor xyz, const Tensor boxes3d,
const Tensor pts_feature, const Tensor pts_feature,
Tensor pooled_features, Tensor pooled_features,
Tensor pooled_empty_flag) { Tensor pooled_empty_flag) {
RoIPointPool3dForwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,
batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz, boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,
boxes3d, pts_feature, pooled_features, pooled_empty_flag); pts_feature, pooled_features, pooled_empty_flag);
}; }
#endif
void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature, void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
Tensor pooled_features, Tensor pooled_empty_flag) { Tensor pooled_features, Tensor pooled_empty_flag) {
...@@ -33,28 +27,13 @@ void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature, ...@@ -33,28 +27,13 @@ void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
// params pts_feature: (B, N, C) // params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C) // params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M) // params pooled_empty_flag: (B, M)
if (xyz.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(xyz);
CHECK_CUDA_INPUT(boxes3d);
CHECK_CUDA_INPUT(pts_feature);
CHECK_CUDA_INPUT(pooled_features);
CHECK_CUDA_INPUT(pooled_empty_flag);
int batch_size = xyz.size(0); int batch_size = xyz.size(0);
int pts_num = xyz.size(1); int pts_num = xyz.size(1);
int boxes_num = boxes3d.size(1); int boxes_num = boxes3d.size(1);
int feature_in_len = pts_feature.size(2); int feature_in_len = pts_feature.size(2);
int sampled_pts_num = pooled_features.size(2); int sampled_pts_num = pooled_features.size(2);
roipoint_pool3d_forward_cuda(batch_size, pts_num, boxes_num, feature_in_len, roipoint_pool3d_forward_impl(batch_size, pts_num, boxes_num, feature_in_len,
sampled_pts_num, xyz, boxes3d, pts_feature, sampled_pts_num, xyz, boxes3d, pts_feature,
pooled_features, pooled_empty_flag); pooled_features, pooled_empty_flag);
#else
AT_ERROR("roipoint_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roipoint_pool3d is not implemented on CPU");
}
} }
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment