Unverified Commit a4dc2a72 authored by pc, committed by GitHub

support device dispatch in parrots (#1588)

parent 0bcbeadb
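The diff below converts a number of op entry points from hard-coded `#ifdef MMCV_WITH_CUDA` branches to a generic dispatcher: each `*_impl` function forwards its arguments through `DISPATCH_DEVICE_IMPL` from `pytorch_device_registry.hpp`, and the backend implementation is looked up at runtime from the device of the tensor arguments, presumably so that the parrots build can plug in its own backends through the same mechanism. As a rough illustration of the idea only (the names and details below are assumptions for this sketch, not mmcv's actual registry):

// Minimal sketch of a device-dispatch registry. This is NOT the real
// pytorch_device_registry.hpp from mmcv; all names here are illustrative.
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// One registry per function-pointer type, keyed by device name.
template <typename F>
class DeviceRegistry {
 public:
  static DeviceRegistry& instance() {
    static DeviceRegistry reg;
    return reg;
  }
  void Register(const std::string& device, F func) { funcs_[device] = func; }
  F Find(const std::string& device) const {
    auto it = funcs_.find(device);
    if (it == funcs_.end())
      throw std::runtime_error("no implementation registered for " + device);
    return it->second;
  }

 private:
  std::map<std::string, F> funcs_;
};

// Registration happens via a static initializer, so simply linking a backend
// object file makes its implementation available at runtime.
#define REGISTER_DEVICE_IMPL_SKETCH(key, device, func)                        \
  static bool key##_##device##_registered =                                   \
      (DeviceRegistry<decltype(&func)>::instance().Register(#device, &func),  \
       true)

int add_impl_cpu(int a, int b) { return a + b; }
REGISTER_DEVICE_IMPL_SKETCH(add_impl, CPU, add_impl_cpu);

// The generic entry point: look up the implementation for the given device.
// In mmcv the device is deduced from the Tensor arguments instead.
int add_impl(const std::string& device, int a, int b) {
  return DeviceRegistry<int (*)(int, int)>::instance().Find(device)(a, b);
}

int main() {
  std::cout << add_impl("CPU", 2, 3) << std::endl;  // prints 5
  return 0;
}

The important property is that the generic .cpp files no longer need compile-time knowledge of which backends exist; a backend registers itself when its translation unit is linked in.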
@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
                       dist2);
}

void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}
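Under the new scheme, the CUDA wrapper that used to live in this file presumably moves next to the CUDA code and registers itself for the CUDA device. A hedged sketch (the REGISTER_DEVICE_IMPL macro and its argument order are assumptions about pytorch_device_registry.hpp and are not shown in this diff; the launcher declaration matches the one removed above):

// Hedged sketch of a CUDA-side binding for knn_forward_impl.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2);

void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2);

void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
}

// Bind the CUDA wrapper to the knn_forward_impl dispatcher (assumed macro).
REGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda);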
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
                       col, kernel_h, kernel_w, pad_h, pad_w);
}

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}

void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}

void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}
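These device-agnostic wrappers are what the Python extension ultimately calls. A hedged pybind11 sketch of that binding step (the module body, include set, and argument names here are illustrative assumptions; mmcv's real pybind.cpp is not part of this diff, and only the masked_im2col_forward signature comes from the file above):

// Hedged sketch of exposing the generic wrapper to Python with pybind11.
#include <torch/extension.h>

#include "pytorch_cpp_helper.hpp"

namespace py = pybind11;

void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // The wrapper itself is device-agnostic; dispatch happens inside it.
  m.def("masked_im2col_forward", &masked_im2col_forward,
        "masked_im2col_forward", py::arg("im"), py::arg("mask_h_idx"),
        py::arg("mask_w_idx"), py::arg("col"), py::arg("kernel_h"),
        py::arg("kernel_w"), py::arg("pad_h"), py::arg("pad_w"));
}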
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void modulated_deformable_im2col_impl(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, data_col);
}

void modulated_deformable_col2im_impl(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset,
                       data_mask, batch_size, channels, height_im, width_im,
                       height_col, width_col, kernel_h, kernel_w, pad_h, pad_w,
                       stride_h, stride_w, dilation_h, dilation_w,
                       deformable_group, grad_im);
}

void modulated_deformable_col2im_coord_impl(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col,
                       data_im, data_offset, data_mask, batch_size, channels,
                       height_im, width_im, height_col, width_col, kernel_h,
                       kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
                       dilation_w, deformable_group, grad_offset, grad_mask);
}

void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
@@ -61,31 +51,6 @@ void modulated_deform_conv_forward(
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
@@ -127,19 +92,10 @@ void modulated_deform_conv_forward(
                        output.size(2), output.size(3)});

  for (int b = 0; b < batch; b++) {
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    // divide into group
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -174,41 +130,6 @@ void modulated_deform_conv_backward(
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
@@ -261,46 +182,24 @@ void modulated_deform_conv_backward(
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});

    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_impl(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_impl(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);

    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_impl(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
template <typename T>
T dmcn_im2col_bilinear_cpu(const T *input, const int data_width,
const int height, const int width, T h, T w) {
int h_low = floorf(h);
int w_low = floorf(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
T dmcn_get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
const int height, const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floorf(argmax_h);
int argmax_w_low = floorf(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
T dmcn_get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floorf(argmax_h);
int argmax_w_low = floorf(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename T>
void modulated_deformable_im2col_cpu_kernel(
const int n, const T *data_im, const T *data_offset, const T *data_mask,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
for (int index = 0; index < n; index++) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b_col * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width,
h_im, w_im);
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
void modulated_deformable_col2im_cpu_kernel(
const int n, const T *data_col, const T *data_offset, const T *data_mask,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
for (int index = 0; index < n; index++) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data,
cur_inv_w_data, cur_h + dy,
cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;
}
}
}
}
}
template <typename T>
void modulated_deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const T *data_mask, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
for (int index = 0; index < n; index++) {
T val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
else
mval += data_col_ptr[col_pos] *
dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width,
width, height, width, inv_h, inv_w);
const T weight = dmcn_get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
void modulated_deformable_im2col_cpu(
const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor data_col) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "modulated_deformable_im2col_cpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_cpu_kernel(
num_kernels, data_im_, data_offset_, data_mask_, height_im,
width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, batch_size,
channels, deformable_group, height_col, width_col, data_col_);
}));
}
void modulated_deformable_col2im_cpu(
const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, Tensor grad_im) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_cpu_kernel(
num_kernels, data_col_, data_offset_, data_mask_, channels,
height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, deformable_group, height_col, width_col, grad_im_);
}));
}
void modulated_deformable_col2im_coord_cpu(
const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
Tensor grad_offset, Tensor grad_mask) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_cpu_kernel(
num_kernels, data_col_, data_im_, data_offset_, data_mask_,
channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size,
2 * kernel_h * kernel_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_, grad_mask_);
}));
}
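In the refactored layout, these CPU kernels are presumably connected to the `*_impl` dispatchers through the same registry. A hedged sketch of that registration step, as it might be appended to a CPU-only translation unit (the REGISTER_DEVICE_IMPL macro and the CPU device tag are assumptions and are not shown in this diff; the `*_impl` declarations are assumed to be visible via a shared header):

// Hedged sketch: register the CPU kernels defined above for the CPU device.
REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CPU,
                     modulated_deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CPU,
                     modulated_deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CPU,
                     modulated_deformable_col2im_coord_cpu);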
@@ -10,42 +10,39 @@
 */
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step) {
  return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value,
                              spatial_shapes, level_start_index, sampling_loc,
                              attn_weight, im2col_step);
}

void ms_deform_attn_impl_backward(
    const Tensor &value, const Tensor &spatial_shapes,
    const Tensor &level_start_index, const Tensor &sampling_loc,
    const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value,
    Tensor &grad_sampling_loc, Tensor &grad_attn_weight,
    const int im2col_step) {
  DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes,
                       level_start_index, sampling_loc, attn_weight,
                       grad_output, grad_value, grad_sampling_loc,
                       grad_attn_weight, im2col_step);
}

Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes,
                              const Tensor &level_start_index,
                              const Tensor &sampling_loc,
                              const Tensor &attn_weight,
                              const int im2col_step) {
  at::DeviceGuard guard(value.device());
  return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index,
                                     sampling_loc, attn_weight, im2col_step);
}
void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
@@ -55,25 +52,9 @@ void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes,
                             const Tensor &grad_output, Tensor &grad_value,
                             Tensor &grad_sampling_loc,
                             Tensor &grad_attn_weight, const int im2col_step) {
  at::DeviceGuard guard(value.device());
  ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index,
                               sampling_loc, attn_weight, grad_output,
                               grad_value, grad_sampling_loc, grad_attn_weight,
                               im2col_step);
}
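Because ms_deform_attn_impl_forward returns a Tensor, whichever backend implementation gets registered must share that exact signature. A hedged sketch of the CUDA-side registration (the macro and its placement are assumptions; only the ms_deform_attn_cuda_forward declaration also appears, as removed code, in the pre-refactor file):

// Hedged sketch: CUDA binding for the value-returning forward dispatcher.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor ms_deform_attn_impl_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step);

Tensor ms_deform_attn_cuda_forward(const Tensor &value,
                                   const Tensor &spatial_shapes,
                                   const Tensor &level_start_index,
                                   const Tensor &sampling_loc,
                                   const Tensor &attn_weight,
                                   const int im2col_step);

REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, CUDA,
                     ms_deform_attn_cuda_forward);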
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset);
}
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto nboxes = boxes.size(0);
Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto select = select_t.data_ptr<bool>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
      auto w = std::max(0.f, xx2 - xx1 + offset);
      auto h = std::max(0.f, yy2 - yy1 + offset);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[j] - inter);
      if (ovr > iou_threshold) select[_j] = false;
    }
  }
  return order_t.masked_select(select_t);
}

Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets,
                    float iou_threshold, float sigma, float min_score,
                    int method, int offset) {
  return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold,
                              sigma, min_score, method, offset);
}
std::vector<std::vector<int> > nms_match_impl(Tensor dets,
                                              float iou_threshold) {
  return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold);
}

Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return nms_impl(boxes, scores, iou_threshold, offset);
}

Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
                   float iou_threshold, float sigma, float min_score,
int method, int offset) {
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
auto nboxes = boxes.size(0);
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos] = x1[i];
y1[max_pos] = y1[i];
x2[max_pos] = x2[i];
y2[max_pos] = y2[i];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i] = ix1;
y1[i] = iy1;
x2[i] = ix2;
y2[i] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = std::min(iy2, y2[pos]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos] = x1[nboxes - 1];
y1[pos] = y1[nboxes - 1];
x2[pos] = x2[nboxes - 1];
y2[pos] = y2[nboxes - 1];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
return inds_t.slice(0, 0, nboxes);
}

Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset) {
  return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score,
                      method, offset);
}
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (int i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
} }
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
  return nms_match_impl(dets, iou_threshold);
}
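With a CPU implementation such as nms_cpu registered for the CPU device, callers no longer branch on the device themselves; the dispatcher picks the backend from the input tensors. A hedged, self-contained usage sketch (the main() harness and the sample values are illustrative assumptions, not part of the diff):

// Hedged usage sketch: nms() is device-agnostic; the registry routes the call
// to whichever backend matches boxes.device(). Values below are made up.
#include <torch/torch.h>

#include <iostream>

at::Tensor nms(at::Tensor boxes, at::Tensor scores, float iou_threshold,
               int offset);

int main() {
  // Three boxes in (x1, y1, x2, y2) format; boxes 0 and 1 overlap heavily.
  torch::Tensor boxes = torch::tensor({0.f, 0.f, 10.f, 10.f,    //
                                       1.f, 1.f, 11.f, 11.f,    //
                                       50.f, 50.f, 60.f, 60.f})
                            .reshape({3, 4});
  torch::Tensor scores = torch::tensor({0.9f, 0.8f, 0.7f});
  torch::Tensor keep = nms(boxes, scores, /*iou_threshold=*/0.5f, /*offset=*/0);
  std::cout << keep << std::endl;  // expected to keep boxes 0 and 2
  return 0;
}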
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename scalar_t>
Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
const float iou_threshold) {
// nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.h
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) {
continue;
}
keep[num_to_keep++] = i;
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) {
continue;
}
auto ovr = single_box_iou_rotated<scalar_t>(
dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>(), 0);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
}
}
}
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold) {
auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
}
@@ -2,120 +2,14 @@
// It is modified from https://github.com/WenmuZhou/PAN.pytorch
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

std::vector<std::vector<float>> estimate_confidence(int32_t* label,
                                                    float* score, int label_num,
                                                    int height, int width) {
std::vector<std::vector<float>> point_vector;
for (int i = 0; i < label_num; i++) {
std::vector<float> point;
point.push_back(0);
point.push_back(0);
point_vector.push_back(point);
}
for (int y = 0; y < height; y++) {
auto label_tmp = label + y * width;
auto score_tmp = score + y * width;
for (int x = 0; x < width; x++) {
auto l = label_tmp[x];
if (l > 0) {
float confidence = score_tmp[x];
point_vector[l].push_back(x);
point_vector[l].push_back(y);
point_vector[l][0] += confidence;
point_vector[l][1] += 1;
}
}
}
for (int l = 0; l < point_vector.size(); l++)
if (point_vector[l][1] > 0) {
point_vector[l][0] /= point_vector[l][1];
}
return point_vector;
}
std::vector<std::vector<float>> pixel_group_impl(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
  return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding,
                              kernel_label, kernel_contour, kernel_region_num,
                              dis_threshold);
}

std::vector<std::vector<float>> pixel_group_cpu(
    Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label,
    Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
  assert(score.dim() == 2);
  assert(mask.dim() == 2);
  assert(embedding.dim() == 3);
  int height = score.size(0);
int width = score.size(1);
  assert(height == mask.size(0) && height == embedding.size(1) &&
         height == kernel_label.size(1));
  assert(width == mask.size(1) && width == embedding.size(2) &&
         width == kernel_label.size(2));
auto threshold_square = dis_threshold * dis_threshold;
auto ptr_score = score.data_ptr<float>();
auto ptr_mask = mask.data_ptr<bool>();
auto ptr_kernel_contour = kernel_contour.data_ptr<uint8_t>();
auto ptr_embedding = embedding.data_ptr<float>();
auto ptr_kernel_label = kernel_label.data_ptr<int32_t>();
std::queue<std::tuple<int, int, int32_t>> contour_pixels;
auto embedding_dim = embedding.size(2);
std::vector<std::vector<float>> kernel_vector(
kernel_region_num, std::vector<float>(embedding_dim + 1, 0));
Tensor text_label;
text_label = kernel_label.clone();
auto ptr_text_label = text_label.data_ptr<int32_t>();
for (int i = 0; i < height; i++) {
auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim;
auto ptr_kernel_label_tmp = ptr_kernel_label + i * width;
auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width;
for (int j = 0, k = 0; j < width && k < width * embedding_dim;
j++, k += embedding_dim) {
int32_t label = ptr_kernel_label_tmp[j];
if (label > 0) {
for (int d = 0; d < embedding_dim; d++)
kernel_vector[label][d] += ptr_embedding_tmp[k + d];
kernel_vector[label][embedding_dim] += 1;
// kernel pixel number
if (ptr_kernel_contour_tmp[j]) {
contour_pixels.push(std::make_tuple(i, j, label));
}
}
}
}
for (int i = 0; i < kernel_region_num; i++) {
for (int j = 0; j < embedding_dim; j++) {
kernel_vector[i][j] /= kernel_vector[i][embedding_dim];
}
}
int dx[4] = {-1, 1, 0, 0};
int dy[4] = {0, 0, -1, 1};
while (!contour_pixels.empty()) {
auto query_pixel = contour_pixels.front();
contour_pixels.pop();
int y = std::get<0>(query_pixel);
int x = std::get<1>(query_pixel);
int32_t l = std::get<2>(query_pixel);
auto kernel_cv = kernel_vector[l];
for (int idx = 0; idx < 4; idx++) {
int tmpy = y + dy[idx];
int tmpx = x + dx[idx];
auto ptr_text_label_tmp = ptr_text_label + tmpy * width;
if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue;
if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0)
continue;
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
for (size_t i = 0; i < embedding_dim; i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
if (dis >= threshold_square) break;
}
if (dis >= threshold_square) continue;
contour_pixels.push(std::make_tuple(tmpy, tmpx, l));
ptr_text_label_tmp[tmpx] = l;
}
}
return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num,
height, width);
}

std::vector<std::vector<float>> pixel_group(
@@ -127,11 +21,6 @@ std::vector<std::vector<float>> pixel_group(
  kernel_label = kernel_label.contiguous();
  kernel_contour = kernel_contour.contiguous();
  return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour,
                          kernel_region_num, distance_threshold);
}
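For the CPU path retained above, pixel_group_impl presumably resolves to pixel_group_cpu through a registration like the following (hedged; the macro is assumed from pytorch_device_registry.hpp and does not appear in this diff):

// Hedged sketch: bind the CPU implementation above to the dispatcher.
REGISTER_DEVICE_IMPL(pixel_group_impl, CPU, pixel_group_cpu);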
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void points_in_boxes_part_forward_impl(int batch_size, int boxes_num,
void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
void points_in_boxes_part_forward_cuda(int batch_size, int boxes_num,
int pts_num, const Tensor boxes, int pts_num, const Tensor boxes,
const Tensor pts, const Tensor pts,
Tensor box_idx_of_points) { Tensor box_idx_of_points) {
PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num,
boxes, pts, box_idx_of_points); pts_num, boxes, pts, box_idx_of_points);
}; }
void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num,
int pts_num, const Tensor boxes,
const Tensor pts,
Tensor box_idx_of_points);
void points_in_boxes_all_forward_cuda(int batch_size, int boxes_num, void points_in_boxes_all_forward_impl(int batch_size, int boxes_num,
int pts_num, const Tensor boxes, int pts_num, const Tensor boxes,
const Tensor pts, const Tensor pts,
Tensor box_idx_of_points) { Tensor box_idx_of_points) {
PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num,
boxes, pts, box_idx_of_points); pts_num, boxes, pts, box_idx_of_points);
}; }
#endif
void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
                                  Tensor box_idx_of_points_tensor) {
@@ -34,30 +23,12 @@ void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor,
  // coordinate, z is the bottom center, each box params pts: (B, npoints, 3)
  // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints),
  // default -1
  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);
  points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num,
                                    boxes_tensor, pts_tensor,
                                    box_idx_of_points_tensor);
}
void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
@@ -65,28 +36,9 @@ void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor,
  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
  // coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z]
  // in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1
  int batch_size = boxes_tensor.size(0);
  int boxes_num = boxes_tensor.size(1);
  int pts_num = pts_tensor.size(1);
  points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor,
                                   pts_tensor, box_idx_of_points_tensor);
}
#include "pytorch_cpp_helper.hpp"
inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz,
float &local_x, float &local_y) {
float cosa = cos(-rz), sina = sin(-rz);
local_x = shift_x * cosa + shift_y * (-sina);
local_y = shift_x * sina + shift_y * cosa;
}
inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d,
float &local_x, float &local_y) {
// param pt: (x, y, z)
// param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
// cz in the bottom center
float x = pt[0], y = pt[1], z = pt[2];
float cx = box3d[0], cy = box3d[1], cz = box3d[2];
float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
cz += z_size /
2.0; // shift to the center since cz in box3d is the bottom center
if (fabsf(z - cz) > z_size / 2.0) return 0;
lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y);
float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
(local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
return in_flag;
}
void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor,
Tensor pts_indices_tensor) {
// params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
// coordinate, z is the bottom center, each box DO NOT overlaps params pts:
// (npoints, 3) [x, y, z] in LiDAR coordinate params pts_indices: (N, npoints)
CHECK_CONTIGUOUS(boxes_tensor);
CHECK_CONTIGUOUS(pts_tensor);
CHECK_CONTIGUOUS(pts_indices_tensor);
int boxes_num = boxes_tensor.size(0);
int pts_num = pts_tensor.size(0);
const float *boxes = boxes_tensor.data_ptr<float>();
const float *pts = pts_tensor.data_ptr<float>();
int *pts_indices = pts_indices_tensor.data_ptr<int>();
float local_x = 0, local_y = 0;
for (int i = 0; i < boxes_num; i++) {
for (int j = 0; j < pts_num; j++) {
int cur_in_flag =
check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y);
pts_indices[i * pts_num + j] = cur_in_flag;
}
}
}
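
// Hypothetical usage sketch (not part of the diff) exercising the CPU
// point-in-box test above with libtorch. Shapes follow the comments in
// points_in_boxes_cpu_forward: boxes (N, 7), pts (npoints, 3),
// pts_indices (N, npoints) of int32.
#include <torch/torch.h>

void points_in_boxes_cpu_forward(at::Tensor boxes_tensor, at::Tensor pts_tensor,
                                 at::Tensor pts_indices_tensor);

int main() {
  // one 4 x 2 x 1 box whose bottom center sits at the origin, rotated by
  // roughly 90 degrees around z
  auto boxes =
      torch::tensor({0.f, 0.f, 0.f, 4.f, 2.f, 1.f, 1.5708f}).reshape({1, 7});
  // first point lies inside the rotated box, second one outside
  auto pts = torch::tensor({0.f, 1.5f, 0.5f, 1.5f, 0.f, 0.5f}).reshape({2, 3});
  auto idx = torch::zeros({1, 2}, torch::kInt32);
  points_in_boxes_cpu_forward(boxes, pts, idx);
  // expected: idx == [[1, 0]], since after the 90 degree rotation the box
  // spans roughly +-1 in x and +-2 in y around the origin
  return 0;
}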
...@@ -2,255 +2,40 @@
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, grad_output,
grad_input);
}
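
// Small worked example (illustrative only) of the window clipping used in the
// collect/distribute loops above: for a 3x3 mask (half_h_mask = 1) on a 5x5
// feature map, only the mask rows that stay inside the feature map survive.
#include <algorithm>
#include <cstdio>

int main() {
  const int h_feature = 5, h_mask = 3, half_h_mask = 1;
  const int hs[] = {0, 2, 4};
  for (int h : hs) {
    int hstart = std::max(0, half_h_mask - h);
    int hend = std::min(h_mask, h_feature + half_h_mask - h);
    // prints: h=0 -> [1, 3), h=2 -> [0, 3), h=4 -> [0, 2)
    std::printf("h=%d -> mask rows [%d, %d)\n", h, hstart, hend);
  }
  return 0;
}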
#ifdef MMCV_WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask);
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask);
#endif

void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_,
                       h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}

void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input,
                       num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                       half_w_mask);
}
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
                     const int num_, const int h_feature, const int w_feature,
                     const int h_mask, const int w_mask, const int half_h_mask,
                     const int half_w_mask) {
  psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature,
                       h_mask, w_mask, half_h_mask, half_w_mask);
}

void psamask_backward(Tensor grad_output, const Tensor grad_input,
                      const int psa_type, const int num_, const int h_feature,
                      const int w_feature, const int h_mask, const int w_mask,
                      const int half_h_mask, const int half_w_mask) {
  psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature,
                        w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

#ifdef MMCV_WITH_CUDA
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
#endif

void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y,
                       argmax_x, aligned_height, aligned_width, spatial_scale,
                       sampling_ratio, pool_mode, aligned);
}

void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y,
                       argmax_x, grad_input, aligned_height, aligned_width,
                       spatial_scale, sampling_ratio, pool_mode, aligned);
}
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
} }
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                       Tensor argmax_y, Tensor argmax_x, int aligned_height,
                       int aligned_width, float spatial_scale,
                       int sampling_ratio, int pool_mode, bool aligned) {
  roi_align_forward_impl(input, rois, output, argmax_y, argmax_x,
                         aligned_height, aligned_width, spatial_scale,
                         sampling_ratio, pool_mode, aligned);
}

void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                        Tensor argmax_x, Tensor grad_input, int aligned_height,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned) {
  roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input,
                          aligned_height, aligned_width, spatial_scale,
                          sampling_ratio, pool_mode, aligned);
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T x = xx;
T y = yy;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
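
// Quick numerical check (illustrative only) of the bilinear weights computed
// above for an interior sample at (y, x) = (1.25, 2.5): the four weights are
// products of the complementary fractional offsets and sum to 1.
#include <cstdio>

int main() {
  float y = 1.25f, x = 2.5f;
  int y_low = (int)y, x_low = (int)x;    // 1, 2
  float ly = y - y_low, lx = x - x_low;  // 0.25, 0.5
  float hy = 1.f - ly, hx = 1.f - lx;    // 0.75, 0.5
  float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
  // prints: w1=0.375 w2=0.375 w3=0.125 w4=0.125
  std::printf("w1=%g w2=%g w3=%g w4=%g\n", w1, w2, w3, w4);
  return 0;
}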
template <typename T>
void ROIAlignForward(const int nthreads, const T* input, const T* rois,
T* output, T* argmax_y, T* argmax_x,
const int pooled_height, const int pooled_width,
const T spatial_scale, const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels, const int height,
const int width) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign cannot have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// When the grid is empty, output zeros == 0/1, instead of NaN.
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
T maxval = -10000;
T maxidx_y = -1.f, maxidx_x = -1.f;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
PreCalc<T> pc = pre_calc[pre_calc_index];
T val = pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
if (val > maxval) {
maxval = val;
maxidx_y = y;
maxidx_x = x;
}
output_val += val;
pre_calc_index += 1;
}
}
if (pool_mode == 0) {
// We do max pooling inside a bin
output[index] = maxval;
argmax_y[index] = maxidx_y;
argmax_x[index] = maxidx_x;
} else if (pool_mode == 1) {
// We do average (integral) pooling inside a bin
output[index] = output_val / count;
} // if
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
T& w1, T& w2, T& w3, T& w4, int& x_low,
int& x_high, int& y_low, int& y_high,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
template <typename T>
void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
const T* argmax_y, const T* argmax_x, T* grad_input,
const int pooled_height, const int pooled_width,
const T spatial_scale, const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels, const int height,
const int width, const int n_stride, const int c_stride,
const int h_stride, const int w_stride) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_rois[1] * spatial_scale - offset;
T roi_start_h = offset_rois[2] * spatial_scale - offset;
T roi_end_w = offset_rois[3] * spatial_scale - offset;
T roi_end_h = offset_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
if (pool_mode == 0) {
// We do max pooling inside a bin
T y = argmax_y[index], x = argmax_x[index];
if (y != -1.f) {
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
T g1 = grad_output_this_bin * w1;
T g2 = grad_output_this_bin * w2;
T g3 = grad_output_this_bin * w3;
T g4 = grad_output_this_bin * w4;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // mode
} else if (pool_mode == 1) {
// We do average (integral) pooling inside a bin
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high,
static_cast<T>(g4));
} // if
} // ix
} // iy
} // mode
} // for
} // ROIAlignBackward
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward<scalar_t>(
output_size, input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width);
});
}
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad_output.stride(0);
int c_stride = grad_output.stride(1);
int h_stride = grad_output.stride(2);
int w_stride = grad_output.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "ROIAlign_backward", [&] {
ROIAlignBackward<scalar_t>(
output_size, grad_output.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
aligned_height, aligned_width, static_cast<scalar_t>(spatial_scale),
sampling_ratio, pool_mode, aligned, channels, height, width,
n_stride, c_stride, h_stride, w_stride);
});
}
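
// Hypothetical usage sketch (not part of the diff) for the CPU launcher
// defined above. Shapes: input (N, C, H, W), rois (K, 5) with rows
// [batch_idx, x1, y1, x2, y2] in input coordinates; argmax_y/argmax_x are
// only meaningful for max pooling (pool_mode = 0) but must still be provided.
#include <torch/torch.h>

void ROIAlignForwardCPULauncher(at::Tensor input, at::Tensor rois,
                                at::Tensor output, at::Tensor argmax_y,
                                at::Tensor argmax_x, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, int pool_mode, bool aligned);

int main() {
  auto input = torch::arange(16, torch::kFloat).reshape({1, 1, 4, 4});
  auto rois = torch::tensor({0.f, 0.f, 0.f, 3.f, 3.f}).reshape({1, 5});
  auto output = torch::zeros({1, 1, 2, 2});
  auto argmax_y = torch::zeros({1, 1, 2, 2});
  auto argmax_x = torch::zeros({1, 1, 2, 2});
  // 2x2 output bins, spatial_scale = 1, 2 samples per bin side, average
  // pooling (pool_mode = 1), aligned = true
  ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x, 2, 2,
                             1.f, 2, 1, true);
  return 0;
}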
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

#ifdef MMCV_WITH_CUDA
void ROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int sample_num, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output);

void ROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sample_num, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad);
#endif

void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sample_ratio,
                                    bool aligned, bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output,
                       aligned_height, aligned_width, spatial_scale,
                       sample_ratio, aligned, clockwise);
}

void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sample_ratio, bool aligned,
                                     bool clockwise) {
  DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois,
                       bottom_grad, aligned_height, aligned_width,
                       spatial_scale, sample_ratio, aligned, clockwise);
}
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
void roi_align_rotated_forward_cpu(Tensor features, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardCPULauncher(features, rois, output, pooled_height,
pooled_width, spatial_scale, sample_num,
aligned, clockwise);
}
void roi_align_rotated_backward_cpu(Tensor features, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
bool aligned, bool clockwise) {
ROIAlignRotatedBackwardCPULauncher(features, rois, output, pooled_height,
pooled_width, spatial_scale, sample_num,
aligned, clockwise);
} }
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned, bool clockwise) {
  roi_align_rotated_forward_impl(input, rois, output, aligned_height,
                                 aligned_width, spatial_scale, sampling_ratio,
                                 aligned, clockwise);
}

void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                Tensor bottom_grad, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned,
                                bool clockwise) {
  roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height,
                                  aligned_width, spatial_scale, sampling_ratio,
                                  aligned, clockwise);
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w,
T cos_theta, T sin_theta, std::vector<PreCalc<T>>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
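
// Illustrative check of the rotation used above: a sampling offset (yy, xx)
// expressed in the RoI frame is rotated by theta and then shifted to the RoI
// center, i.e. y = yy*cos(theta) - xx*sin(theta) + roi_center_h.
#include <cmath>
#include <cstdio>

int main() {
  const double theta = 1.5707963;   // ~90 degrees
  const double yy = 0.0, xx = 1.0;  // one step along the RoI's own x axis
  const double roi_center_h = 10.0, roi_center_w = 20.0;
  double y = yy * std::cos(theta) - xx * std::sin(theta) + roi_center_h;
  double x = yy * std::sin(theta) + xx * std::cos(theta) + roi_center_w;
  // prints approximately (9.0, 20.0): the RoI's x axis now points along -y
  std::printf("(%.1f, %.1f)\n", y, x);
  return 0;
}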
template <typename T>
void ROIAlignRotatedForward(const int nthreads, const T* input,
const T& spatial_scale, const bool aligned,
const bool clockwise, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int sampling_ratio, const T* rois,
T* output) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5];
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
T cos_theta = cos(theta);
T sin_theta = sin(theta);
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
sin_theta, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
T& w1, T& w2, T& w3, T& w4, int& x_low,
int& x_high, int& y_low, int& y_high) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <class T>
inline void add(T* address, const T& val) {
*address += val;
}
template <typename T>
void ROIAlignRotatedBackward(
const int nthreads,
// may not be contiguous. should index using n_stride, etc
const T* grad_output, const T& spatial_scale, const bool aligned,
const bool clockwise, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int sampling_ratio,
T* grad_input, const T* rois, const int n_stride, const int c_stride,
const int h_stride, const int w_stride) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_center_w = current_roi[1] * spatial_scale - offset;
T roi_center_h = current_roi[2] * spatial_scale - offset;
T roi_width = current_roi[3] * spatial_scale;
T roi_height = current_roi[4] * spatial_scale;
T theta = current_roi[5];
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
T cos_theta = cos(theta);
T sin_theta = sin(theta);
if (aligned) {
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlignRotated do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
int output_offset = n * n_stride + c * c_stride;
const T* offset_grad_output = grad_output + output_offset;
const T grad_output_this_bin =
offset_grad_output[ph * h_stride + pw * w_stride];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T roi_start_h = -roi_height / 2.0;
T roi_start_w = -roi_width / 2.0;
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
// Rotate by theta around the center and translate
T y = yy * cos_theta - xx * sin_theta + roi_center_h;
T x = yy * sin_theta + xx * cos_theta + roi_center_w;
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high);
T g1 = grad_output_this_bin * w1 / count;
T g2 = grad_output_this_bin * w2 / count;
T g3 = grad_output_this_bin * w3 / count;
T g4 = grad_output_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
} // if
} // ix
} // iy
} // for
} // ROIAlignRotatedBackward
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlignRotated_forward", [&] {
ROIAlignRotatedForward<scalar_t>(
output_size, input.data_ptr<scalar_t>(),
static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,
height, width, aligned_height, aligned_width, sampling_ratio,
rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
});
}
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad_output.stride(0);
int c_stride = grad_output.stride(1);
int h_stride = grad_output.stride(2);
int w_stride = grad_output.stride(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "ROIAlignRotated_backward", [&] {
ROIAlignRotatedBackward<scalar_t>(
grad_output.numel(), grad_output.data_ptr<scalar_t>(),
static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,
height, width, aligned_height, aligned_width, sampling_ratio,
grad_input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
n_stride, c_stride, h_stride, w_stride);
});
}
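
// Hypothetical usage sketch (not part of the diff) for the rotated CPU
// launcher above. Unlike plain RoIAlign, rois has shape (K, 6) with rows
// [batch_idx, center_x, center_y, width, height, theta], theta in radians.
#include <torch/torch.h>

void ROIAlignRotatedForwardCPULauncher(at::Tensor input, at::Tensor rois,
                                       at::Tensor output, int aligned_height,
                                       int aligned_width, float spatial_scale,
                                       int sampling_ratio, bool aligned,
                                       bool clockwise);

int main() {
  auto input = torch::rand({1, 1, 8, 8});
  // one 4x2 box centered at (4, 4), rotated by 0.5 rad
  auto rois = torch::tensor({0.f, 4.f, 4.f, 4.f, 2.f, 0.5f}).reshape({1, 6});
  auto output = torch::zeros({1, 1, 2, 2});
  ROIAlignRotatedForwardCPULauncher(input, rois, output, 2, 2, 1.f, 2, true,
                                    false);
  return 0;
}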
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

#ifdef MMCV_WITH_CUDA
void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale);
void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale);
#endif

void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale) {
  DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax,
                       pooled_height, pooled_width, spatial_scale);
}

void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale) {
  DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax,
                       grad_input, pooled_height, pooled_width, spatial_scale);
}
void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax,
                      int pooled_height, int pooled_width,
                      float spatial_scale) {
  roi_pool_forward_impl(input, rois, output, argmax, pooled_height,
                        pooled_width, spatial_scale);
}

void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax,
                       Tensor grad_input, int pooled_height, int pooled_width,
                       float spatial_scale) {
  roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height,
                         pooled_width, spatial_scale);
}
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels,
void RoiawarePool3dForwardCUDAKernelLauncher(
int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
int out_y, int out_z, const Tensor rois, const Tensor pts,
const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels,
Tensor pooled_features, int pool_method);
void roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels,
int max_pts_each_voxel, int out_x, int out_y, int max_pts_each_voxel, int out_x, int out_y,
int out_z, const Tensor rois, int out_z, const Tensor rois,
const Tensor pts, const Tensor pts_feature, const Tensor pts, const Tensor pts_feature,
Tensor argmax, Tensor pts_idx_of_voxels, Tensor argmax, Tensor pts_idx_of_voxels,
Tensor pooled_features, int pool_method) { Tensor pooled_features, int pool_method) {
RoiawarePool3dForwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num,
boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, channels, max_pts_each_voxel, out_x, out_y, out_z, rois,
rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, pts, pts_feature, argmax, pts_idx_of_voxels,
pool_method); pooled_features, pool_method);
}; }
void RoiawarePool3dBackwardCUDAKernelLauncher(
int boxes_num, int out_x, int out_y, int out_z, int channels,
int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax,
const Tensor grad_out, Tensor grad_in, int pool_method);
void roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y, void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y,
int out_z, int channels, int out_z, int channels,
int max_pts_each_voxel, int max_pts_each_voxel,
const Tensor pts_idx_of_voxels, const Tensor pts_idx_of_voxels,
const Tensor argmax, const Tensor grad_out, const Tensor argmax, const Tensor grad_out,
Tensor grad_in, int pool_method) { Tensor grad_in, int pool_method) {
RoiawarePool3dBackwardCUDAKernelLauncher( DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y,
boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel, out_z, channels, max_pts_each_voxel, pts_idx_of_voxels,
pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method); argmax, grad_out, grad_in, pool_method);
}; }
#endif
void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
                             Tensor argmax, Tensor pts_idx_of_voxels,
...@@ -47,36 +35,20 @@ void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature,
                             Tensor pooled_features, int pool_method) {
  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
  // params pooled_features: (N, out_x, out_y, out_z, C)
  // params pool_method: 0: max_pool 1: avg_pool
  int boxes_num = rois.size(0);
  int pts_num = pts.size(0);
  int channels = pts_feature.size(1);
  int max_pts_each_voxel = pts_idx_of_voxels.size(4);  // index 0 is the counter
  int out_x = pts_idx_of_voxels.size(1);
  int out_y = pts_idx_of_voxels.size(2);
  int out_z = pts_idx_of_voxels.size(3);
  assert((out_x < 256) && (out_y < 256) &&
         (out_z < 256));  // we encode index with 8bit
  roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel,
                               out_x, out_y, out_z, rois, pts, pts_feature,
                               argmax, pts_idx_of_voxels, pooled_features,
                               pool_method);
}
void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax, void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
...@@ -87,29 +59,14 @@ void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax, ...@@ -87,29 +59,14 @@ void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax,
// params grad_out: (N, out_x, out_y, out_z, C) // params grad_out: (N, out_x, out_y, out_z, C)
// params grad_in: (npoints, C), return value // params grad_in: (npoints, C), return value
// params pool_method: 0: max_pool 1: avg_pool // params pool_method: 0: max_pool 1: avg_pool
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
if (grad_in.device().is_cuda()) { roiaware_pool3d_backward_impl(boxes_num, out_x, out_y, out_z, channels,
#ifdef MMCV_WITH_CUDA max_pts_each_voxel, pts_idx_of_voxels, argmax,
CHECK_CUDA_INPUT(pts_idx_of_voxels); grad_out, grad_in, pool_method);
CHECK_CUDA_INPUT(argmax);
CHECK_CUDA_INPUT(grad_out);
CHECK_CUDA_INPUT(grad_in);
int boxes_num = pts_idx_of_voxels.size(0);
int out_x = pts_idx_of_voxels.size(1);
int out_y = pts_idx_of_voxels.size(2);
int out_z = pts_idx_of_voxels.size(3);
int max_pts_each_voxel =
pts_idx_of_voxels.size(4); // index 0 is the counter
int channels = grad_out.size(4);
roiaware_pool3d_backward_cuda(boxes_num, out_x, out_y, out_z, channels,
max_pts_each_voxel, pts_idx_of_voxels, argmax,
grad_out, grad_in, pool_method);
#else
AT_ERROR("roiaware_pool3d is not compiled with GPU support");
#endif
} else {
AT_ERROR("roiaware_pool3d is not implemented on CPU");
}
} }
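Aside (an assumption, for illustration only): the assert in roiaware_pool3d_forward limits out_x, out_y, and out_z to 255 because, per the "8bit" comment, the kernels pack a voxel coordinate into a single integer using 8 bits per axis. One encoding consistent with that comment is sketched below; the helper names are hypothetical and the actual kernel code is not part of this diff.

// Hypothetical 8-bit-per-axis voxel index encoding, shown only to explain the
// out_x/out_y/out_z < 256 assertion above.
#include <cassert>

inline unsigned int encode_voxel_index(unsigned int x_idx, unsigned int y_idx,
                                       unsigned int z_idx) {
  assert(x_idx < 256 && y_idx < 256 && z_idx < 256);
  return (x_idx << 16) | (y_idx << 8) | z_idx;
}

inline void decode_voxel_index(unsigned int encoded, unsigned int* x_idx,
                               unsigned int* y_idx, unsigned int* z_idx) {
  *x_idx = (encoded >> 16) & 0xFF;
  *y_idx = (encoded >> 8) & 0xFF;
  *z_idx = encoded & 0xFF;
}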
@@ -7,24 +7,18 @@ All Rights Reserved 2018.
 */
 #include "pytorch_cpp_helper.hpp"
+#include "pytorch_device_registry.hpp"

-#ifdef MMCV_WITH_CUDA
-void RoIPointPool3dForwardCUDAKernelLauncher(
-    int batch_size, int pts_num, int boxes_num, int feature_in_len,
-    int sampled_pts_num, const Tensor xyz, const Tensor boxes3d,
-    const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag);
-void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num,
+void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num,
                                   int feature_in_len, int sampled_pts_num,
                                   const Tensor xyz, const Tensor boxes3d,
                                   const Tensor pts_feature,
                                   Tensor pooled_features,
                                   Tensor pooled_empty_flag) {
-  RoIPointPool3dForwardCUDAKernelLauncher(
-      batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz,
-      boxes3d, pts_feature, pooled_features, pooled_empty_flag);
-};
-#endif
+  DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num,
+                       boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d,
+                       pts_feature, pooled_features, pooled_empty_flag);
+}

 void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
                              Tensor pooled_features, Tensor pooled_empty_flag) {
@@ -33,28 +27,13 @@ void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
   // params pts_feature: (B, N, C)
   // params pooled_features: (B, M, 512, 3+C)
   // params pooled_empty_flag: (B, M)
-  if (xyz.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(xyz);
-    CHECK_CUDA_INPUT(boxes3d);
-    CHECK_CUDA_INPUT(pts_feature);
-    CHECK_CUDA_INPUT(pooled_features);
-    CHECK_CUDA_INPUT(pooled_empty_flag);
-    int batch_size = xyz.size(0);
-    int pts_num = xyz.size(1);
-    int boxes_num = boxes3d.size(1);
-    int feature_in_len = pts_feature.size(2);
-    int sampled_pts_num = pooled_features.size(2);
-    roipoint_pool3d_forward_cuda(batch_size, pts_num, boxes_num, feature_in_len,
-                                 sampled_pts_num, xyz, boxes3d, pts_feature,
-                                 pooled_features, pooled_empty_flag);
-#else
-    AT_ERROR("roipoint_pool3d is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("roipoint_pool3d is not implemented on CPU");
-  }
+  int batch_size = xyz.size(0);
+  int pts_num = xyz.size(1);
+  int boxes_num = boxes3d.size(1);
+  int feature_in_len = pts_feature.size(2);
+  int sampled_pts_num = pooled_features.size(2);
+  roipoint_pool3d_forward_impl(batch_size, pts_num, boxes_num, feature_in_len,
+                               sampled_pts_num, xyz, boxes3d, pts_feature,
+                               pooled_features, pooled_empty_flag);
 }
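For context (hypothetical caller, not part of this commit): every dimension passed to roipoint_pool3d_forward_impl is recovered from the tensor shapes documented in the comments above. A sketch of a caller that allocates tensors with those shapes follows; the dtypes (float for coordinates and features, int for the empty flag) and the helper name run_roipoint_pool3d_example are assumptions.

// Sketch only: allocate tensors with the documented shapes and call the
// generic wrapper, which derives batch_size/pts_num/boxes_num/feature_in_len/
// sampled_pts_num from these shapes before dispatching.
#include "pytorch_cpp_helper.hpp"

void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature,
                             Tensor pooled_features, Tensor pooled_empty_flag);

void run_roipoint_pool3d_example(const at::Device& device) {
  const int B = 2, N = 1024, M = 8, C = 16, sampled_pts_num = 512;
  auto fopts = torch::dtype(torch::kFloat32).device(device);
  auto iopts = torch::dtype(torch::kInt32).device(device);
  Tensor xyz = torch::rand({B, N, 3}, fopts);               // (B, N, 3)
  Tensor boxes3d = torch::rand({B, M, 7}, fopts);           // (B, M, 7)
  Tensor pts_feature = torch::rand({B, N, C}, fopts);       // (B, N, C)
  Tensor pooled_features =
      torch::zeros({B, M, sampled_pts_num, 3 + C}, fopts);  // (B, M, 512, 3+C)
  Tensor pooled_empty_flag = torch::zeros({B, M}, iopts);   // (B, M)
  roipoint_pool3d_forward(xyz, boxes3d, pts_feature, pooled_features,
                          pooled_empty_flag);
}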
 // Copyright (c) OpenMMLab. All rights reserved
 #include "pytorch_cpp_helper.hpp"
+#include "pytorch_device_registry.hpp"

-#ifdef MMCV_WITH_CUDA
-void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean);
-void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean,
-                                        Tensor var);
-void SyncBNForwardOutputCUDAKernelLauncher(
-    const Tensor input, const Tensor mean, const Tensor var,
-    Tensor running_mean, Tensor running_var, const Tensor weight,
-    const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps,
-    float momentum, int group_size);
-void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output,
-                                           const Tensor norm,
-                                           Tensor grad_weight,
-                                           Tensor grad_bias);
-void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output,
-                                          const Tensor weight,
-                                          const Tensor grad_weight,
-                                          const Tensor grad_bias,
-                                          const Tensor norm, const Tensor std,
-                                          Tensor grad_input);
-void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) {
-  SyncBNForwardMeanCUDAKernelLauncher(input, mean);
+void sync_bn_forward_mean_impl(const Tensor input, Tensor mean) {
+  DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean);
 }

-void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean,
+void sync_bn_forward_var_impl(const Tensor input, const Tensor mean,
                               Tensor var) {
-  SyncBNForwardVarCUDAKernelLauncher(input, mean, var);
+  DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var);
 }

-void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean,
+void sync_bn_forward_output_impl(const Tensor input, const Tensor mean,
                                  const Tensor var, Tensor running_mean,
                                  Tensor running_var, const Tensor weight,
                                  const Tensor bias, Tensor norm, Tensor std,
                                  Tensor output, float eps, float momentum,
                                  int group_size) {
-  SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean,
-                                        running_var, weight, bias, norm, std,
-                                        output, eps, momentum, group_size);
+  DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var,
+                       running_mean, running_var, weight, bias, norm, std,
+                       output, eps, momentum, group_size);
 }

-void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm,
+void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm,
                                  Tensor grad_weight, Tensor grad_bias) {
-  SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,
-                                        grad_bias);
+  DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm,
+                       grad_weight, grad_bias);
 }

-void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight,
+void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight,
                                 const Tensor grad_weight,
                                 const Tensor grad_bias, const Tensor norm,
                                 const Tensor std, Tensor grad_input) {
-  SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,
-                                       grad_bias, norm, std, grad_input);
+  DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight,
+                       grad_weight, grad_bias, norm, std, grad_input);
 }
-#endif

 void sync_bn_forward_mean(const Tensor input, Tensor mean) {
-  if (input.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(input);
-    CHECK_CUDA_INPUT(mean);
-    sync_bn_forward_mean_cuda(input, mean);
-#else
-    AT_ERROR("SyncBatchNorm is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("SyncBatchNorm is not implemented on CPU");
-  }
+  sync_bn_forward_mean_impl(input, mean);
 }

 void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) {
-  if (input.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(input);
-    CHECK_CUDA_INPUT(mean);
-    CHECK_CUDA_INPUT(var);
-    sync_bn_forward_var_cuda(input, mean, var);
-#else
-    AT_ERROR("SyncBatchNorm is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("SyncBatchNorm is not implemented on CPU");
-  }
+  sync_bn_forward_var_impl(input, mean, var);
 }

 void sync_bn_forward_output(const Tensor input, const Tensor mean,
@@ -95,65 +50,20 @@ void sync_bn_forward_output(const Tensor input, const Tensor mean,
                             Tensor running_var, Tensor norm, Tensor std,
                             Tensor output, float eps, float momentum,
                             int group_size) {
-  if (input.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(input);
-    CHECK_CUDA_INPUT(mean);
-    CHECK_CUDA_INPUT(var);
-    CHECK_CUDA_INPUT(weight);
-    CHECK_CUDA_INPUT(bias);
-    CHECK_CUDA_INPUT(running_mean);
-    CHECK_CUDA_INPUT(running_var);
-    CHECK_CUDA_INPUT(norm);
-    CHECK_CUDA_INPUT(std);
-    CHECK_CUDA_INPUT(output);
-    sync_bn_forward_output_cuda(input, mean, var, running_mean, running_var,
-                                weight, bias, norm, std, output, eps, momentum,
-                                group_size);
-#else
-    AT_ERROR("SyncBatchNorm is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("SyncBatchNorm is not implemented on CPU");
-  }
+  sync_bn_forward_output_impl(input, mean, var, running_mean, running_var,
+                              weight, bias, norm, std, output, eps, momentum,
+                              group_size);
 }

 void sync_bn_backward_param(const Tensor grad_output, const Tensor norm,
                             Tensor grad_weight, Tensor grad_bias) {
-  if (grad_output.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(grad_output);
-    CHECK_CUDA_INPUT(norm);
-    CHECK_CUDA_INPUT(grad_weight);
-    CHECK_CUDA_INPUT(grad_bias);
-    sync_bn_backward_param_cuda(grad_output, norm, grad_weight, grad_bias);
-#else
-    AT_ERROR("SyncBatchNorm is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("SyncBatchNorm is not implemented on CPU");
-  }
+  sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias);
 }

 void sync_bn_backward_data(const Tensor grad_output, const Tensor weight,
                            const Tensor grad_weight, const Tensor grad_bias,
                            const Tensor norm, const Tensor std,
                            Tensor grad_input) {
-  if (grad_output.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    CHECK_CUDA_INPUT(grad_output);
-    CHECK_CUDA_INPUT(weight);
-    CHECK_CUDA_INPUT(grad_weight);
-    CHECK_CUDA_INPUT(grad_bias);
-    CHECK_CUDA_INPUT(norm);
-    CHECK_CUDA_INPUT(std);
-    CHECK_CUDA_INPUT(grad_input);
-    sync_bn_backward_data_cuda(grad_output, weight, grad_weight, grad_bias,
-                               norm, std, grad_input);
-#else
-    AT_ERROR("SyncBatchNorm is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("SyncBatchNorm is not implemented on CPU");
-  }
+  sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm,
+                             std, grad_input);
 }
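To make the dispatch mechanism itself concrete (a simplified model, not the actual contents of pytorch_device_registry.hpp): conceptually, each *_impl key owns a table mapping a device type to a backend function, REGISTER_DEVICE_IMPL fills the table, and DISPATCH_DEVICE_IMPL looks up the entry for the device of the incoming tensors. The self-contained sketch below models that for a single operator with plain function pointers; all names and the toy CPU backend are illustrative.

// Simplified, standalone model of a per-operator device registry.
#include <iostream>
#include <map>
#include <stdexcept>

enum class DeviceType { CPU, CUDA };

// Registry for one operator: device type -> implementation.
using SyncBNMeanImpl = void (*)(const float* input, float* mean, int n, int c);

std::map<DeviceType, SyncBNMeanImpl>& sync_bn_mean_registry() {
  static std::map<DeviceType, SyncBNMeanImpl> registry;
  return registry;
}

// Plays the role of REGISTER_DEVICE_IMPL for this one operator.
void register_sync_bn_mean(DeviceType device, SyncBNMeanImpl fn) {
  sync_bn_mean_registry()[device] = fn;
}

// Plays the role of DISPATCH_DEVICE_IMPL: resolve by device, then call.
void dispatch_sync_bn_mean(DeviceType device, const float* input, float* mean,
                           int n, int c) {
  auto it = sync_bn_mean_registry().find(device);
  if (it == sync_bn_mean_registry().end()) {
    throw std::runtime_error("sync_bn_forward_mean: no impl for this device");
  }
  it->second(input, mean, n, c);
}

// Toy CPU "backend": per-channel mean over n samples laid out as (n, c).
void sync_bn_mean_cpu(const float* input, float* mean, int n, int c) {
  for (int j = 0; j < c; ++j) {
    float sum = 0.f;
    for (int i = 0; i < n; ++i) sum += input[i * c + j];
    mean[j] = sum / n;
  }
}

int main() {
  register_sync_bn_mean(DeviceType::CPU, sync_bn_mean_cpu);
  const float input[6] = {1, 2, 3, 4, 5, 6};  // 3 samples, 2 channels
  float mean[2] = {0, 0};
  dispatch_sync_bn_mean(DeviceType::CPU, input, mean, 3, 2);
  std::cout << mean[0] << " " << mean[1] << std::endl;  // prints: 3 4
  return 0;
}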
@@ -2,60 +2,32 @@
 // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
 #include "pytorch_cpp_helper.hpp"
+#include "pytorch_device_registry.hpp"

-#ifdef MMCV_WITH_CUDA
-void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
-                                               const Tensor points,
-                                               const Tensor idx,
-                                               const Tensor weight, Tensor out);
-void three_interpolate_forward_cuda(int b, int c, int m, int n,
+void three_interpolate_forward_impl(int b, int c, int m, int n,
                                     const Tensor points, const Tensor idx,
                                     const Tensor weight, Tensor out) {
-  ThreeInterpolateForwardCUDAKernelLauncher(b, c, m, n, points, idx, weight,
-                                            out);
-};
+  DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx,
+                       weight, out);
+}

-void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
-                                                const Tensor grad_out,
-                                                const Tensor idx,
-                                                const Tensor weight,
-                                                Tensor grad_points);
-void three_interpolate_backward_cuda(int b, int c, int n, int m,
+void three_interpolate_backward_impl(int b, int c, int n, int m,
                                      const Tensor grad_out, const Tensor idx,
                                      const Tensor weight, Tensor grad_points) {
-  ThreeInterpolateBackwardCUDAKernelLauncher(b, c, n, m, grad_out, idx, weight,
-                                             grad_points);
-};
-#endif
+  DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out,
+                       idx, weight, grad_points);
+}

 void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor,
                                Tensor weight_tensor, Tensor out_tensor, int b,
                                int c, int m, int n) {
-  if (points_tensor.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    three_interpolate_forward_cuda(b, c, m, n, points_tensor, idx_tensor,
-                                   weight_tensor, out_tensor);
-#else
-    AT_ERROR("three_interpolate is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("three_interpolate is not implemented on CPU");
-  }
+  three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor,
+                                 weight_tensor, out_tensor);
 }

 void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                                 Tensor weight_tensor, Tensor grad_points_tensor,
                                 int b, int c, int n, int m) {
-  if (grad_out_tensor.device().is_cuda()) {
-#ifdef MMCV_WITH_CUDA
-    three_interpolate_backward_cuda(b, c, n, m, grad_out_tensor, idx_tensor,
-                                    weight_tensor, grad_points_tensor);
-#else
-    AT_ERROR("three_interpolate is not compiled with GPU support");
-#endif
-  } else {
-    AT_ERROR("three_interpolate is not implemented on CPU");
-  }
+  three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor,
+                                  weight_tensor, grad_points_tensor);
 }
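For reference (assumed semantics, not part of this diff): three_interpolate follows the standard PointNet++ definition, out[bi][ci][i] = sum over k < 3 of weight[bi][i][k] * points[bi][ci][idx[bi][i][k]], with points of shape (b, c, m), idx and weight of shape (b, n, 3), and out of shape (b, c, n). A naive CPU reference of that computation, for illustration only:

// Naive reference of the interpolation the dispatched kernel is expected to
// perform, using row-major flattened buffers with the shapes noted above.
#include <vector>

void three_interpolate_reference(int b, int c, int m, int n,
                                 const std::vector<float>& points,  // (b, c, m)
                                 const std::vector<int>& idx,       // (b, n, 3)
                                 const std::vector<float>& weight,  // (b, n, 3)
                                 std::vector<float>& out) {         // (b, c, n)
  for (int bi = 0; bi < b; ++bi) {
    for (int ci = 0; ci < c; ++ci) {
      for (int i = 0; i < n; ++i) {
        float acc = 0.f;
        for (int k = 0; k < 3; ++k) {
          int j = idx[(bi * n + i) * 3 + k];
          acc += weight[(bi * n + i) * 3 + k] * points[(bi * c + ci) * m + j];
        }
        out[(bi * c + ci) * n + i] = acc;
      }
    }
  }
}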