Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor data_col) {
  DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, data_col);
}

void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, const int deformable_group,
                            Tensor grad_im) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
                       height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
                       stride_w, dilation_h, dilation_w, parallel_imgs,
                       deformable_group, grad_im);
}

void deformable_col2im_coord_impl(
    Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int parallel_imgs,
    const int deformable_group, Tensor grad_offset) {
  DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
                       data_offset, channels, height, width, ksize_h, ksize_w,
                       pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                       parallel_imgs, deformable_group, grad_offset);
}
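A note on the pattern, as I read this refactor: each *_impl function above is a thin dispatcher, and DISPATCH_DEVICE_IMPL from pytorch_device_registry.hpp forwards the call to whichever kernel was registered for the device of the first tensor argument, which is why the #ifdef MMCV_WITH_CUDA branches disappear from this file. A backend opts in roughly as sketched below; the REGISTER_DEVICE_IMPL(key, device, func) macro is assumed to be the registry's counterpart, and deformable_im2col_cuda is a hypothetical kernel name used only for illustration.

// Sketch only: how a CUDA backend could attach its kernel to the dispatcher.
// Assumes REGISTER_DEVICE_IMPL from pytorch_device_registry.hpp;
// deformable_im2col_cuda is a placeholder name, not taken from this commit.
REGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda);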
void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
                             at::Tensor *gradOutput, at::Tensor weight, int kH,

@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
      output_buffer.size(2), output_buffer.size(3)});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
      {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
       gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});

    deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
                                 inputHeight, inputWidth, kH, kW, padH, padW,
                                 dH, dW, dilationH, dilationW, im2col_step,
                                 deformable_group, gradOffset[elt]);

    deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group,
                           gradInput[elt]);

    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
       deformable_group * 2 * kH * kW, outputHeight, outputWidth});

  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
    deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
                           inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
                           dilationW, im2col_step, deformable_group, columns);

    // divide into group
    gradOutputBuffer = gradOutputBuffer.view(
...
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
const int height, const int width, T h, T w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
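A quick worked example, illustrative only: sampling the centre of a 2x2 patch weights each corner by 0.25.

// Illustrative sketch, not part of the commit: with the patch below,
// deformable_im2col_bilinear_cpu at (h, w) = (0.5, 0.5) returns
// (1 + 2 + 3 + 4) / 4 = 2.5.
static float bilinear_cpu_example() {
  const float patch[4] = {1.f, 2.f, 3.f, 4.f};
  return deformable_im2col_bilinear_cpu<float>(patch, /*data_width=*/2,
                                               /*height=*/2, /*width=*/2, 0.5f,
                                               0.5f);
}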
template <typename T>
T get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
const int height, const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
T get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
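// Note: with lh = h - floor(h) and lw = w - floor(w), the bilinear sample is
//   val = (1-lh)*(1-lw)*v1 + (1-lh)*lw*v2 + lh*(1-lw)*v3 + lh*lw*v4,
// so get_coordinate_weight_cpu returns d(val)/dh when bp_dir == 0, i.e.
//   -(1-lw)*v1 - lw*v2 + (1-lw)*v3 + lw*v4,
// and the analogous d(val)/dw when bp_dir == 1; neighbours that fall outside
// the image contribute zero.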
template <typename T>
void deformable_im2col_cpu_kernel(
const int n, const T *data_im, const T *data_offset, const int height,
const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
for (int index = 0; index < n; index++) {
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = deformable_im2col_bilinear_cpu(data_im_ptr, width, height,
width, h_im, w_im);
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
void deformable_col2im_cpu_kernel(
const int n, const T *data_col, const T *data_offset, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
for (int index = 0; index < n; index++) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
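// Scatter cur_top_grad onto the integer pixels whose bilinear weight w.r.t.
// (cur_inv_h_data, cur_inv_w_data) is non-zero; the 5x5 window plus the
// |distance| < 1 checks below cover all such neighbours.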
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;
}
}
}
}
}
template <typename T>
void deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T *grad_offset) {
for (int index = 0; index < n; index++) {
T val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
const T weight = get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
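// e.g. height = width = 5, ksize = 3, pad = 1, stride = 1, dilation = 1
// gives height_col = width_col = (5 + 2 * 1 - 3) / 1 + 1 = 5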
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_cpu", [&] {
deformable_im2col_cpu_kernel<scalar_t>(
num_kernels, data_im.data_ptr<scalar_t>(),
data_offset.data_ptr<scalar_t>(), height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col,
data_col.data_ptr<scalar_t>());
});
}
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, parallel_imgs,
deformable_group, height_col, width_col, grad_im_);
}));
}
void deformable_col2im_coord_cpu(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
}
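To make these CPU kernels reachable from the *_impl dispatchers in deform_conv.cpp, they still have to be registered with the device registry. A minimal sketch, assuming the registry exposes a REGISTER_DEVICE_IMPL(key, device, func) macro and that the *_impl entries are visible here:

// Sketch only: bind the CPU kernels above to the dispatch entries used by
// DISPATCH_DEVICE_IMPL (assumes REGISTER_DEVICE_IMPL and the *_impl
// declarations from pytorch_device_registry.hpp / deform_conv.cpp).
REGISTER_DEVICE_IMPL(deformable_im2col_impl, CPU, deformable_im2col_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_impl, CPU, deformable_col2im_cpu);
REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CPU,
                     deformable_col2im_coord_cpu);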
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
                                  Tensor output, int pooled_height,
                                  int pooled_width, float spatial_scale,
                                  int sampling_ratio, float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
                       output, pooled_height, pooled_width, spatial_scale,
                       sampling_ratio, gamma);
}

void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
                                   Tensor rois, Tensor offset,
                                   Tensor grad_input, Tensor grad_offset,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sampling_ratio,
                                   float gamma) {
  DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
                       offset, grad_input, grad_offset, pooled_height,
                       pooled_width, spatial_scale, sampling_ratio, gamma);
}

void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
                             Tensor output, int pooled_height, int pooled_width,
                             float spatial_scale, int sampling_ratio,
                             float gamma) {
  deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
                               pooled_width, spatial_scale, sampling_ratio,
                               gamma);
}

@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
                              Tensor offset, Tensor grad_input,
                              Tensor grad_offset, int pooled_height,
                              int pooled_width, float spatial_scale,
                              int sampling_ratio, float gamma) {
  deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
                                grad_offset, pooled_height, pooled_width,
                                spatial_scale, sampling_ratio, gamma);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid) {
return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
vertices, mask, num_valid);
}
Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
Tensor num_valid) {
return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "diff_iou_rotated_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void diff_iou_rotated_sort_vertices_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor boxes, scores, dets;
auto vertices = buildATensor(ctx, ins[0]);
auto mask = buildATensor(ctx, ins[1]);
auto num_valid = buildATensor(ctx, ins[2]);
auto out =
diff_iou_rotated_sort_vertices_forward_cuda(vertices, mask, num_valid);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(diff_iou_rotated_sort_vertices_forward)
.input(3)
.output(1)
.apply(diff_iou_rotated_sort_vertices_forward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DIFF_IOU_ROTATED_PYTORCH_H
#define DIFF_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,
Tensor num_valid);
#endif // DIFF_IOU_ROTATED_PYTORCH_H
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
                       grad_input, gamma, alpha);
}

void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
                       output, gamma, alpha);
}

void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor buff,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
  DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
                       buff, grad_input, gamma, alpha);
}

void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor grad_input, float gamma, float alpha) {
  sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
                                   alpha);
}

void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
                                Tensor output, float gamma, float alpha) {
  softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}

void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
                                 Tensor buff, Tensor grad_input, float gamma,
                                 float alpha) {
  softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
                                   gamma, alpha);
}
@@ -2,61 +2,33 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void furthest_point_sampling_forward_impl(Tensor points_tensor,
                                          Tensor temp_tensor, Tensor idx_tensor,
                                          int b, int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
                       temp_tensor, idx_tensor, b, n, m);
}

void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
                                                    Tensor temp_tensor,
                                                    Tensor idx_tensor, int b,
                                                    int n, int m) {
  DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
                       points_tensor, temp_tensor, idx_tensor, b, n, m);
}

void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
                                     Tensor idx_tensor, int b, int n, int m) {
  furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
                                       b, n, m);
}

void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
                                               Tensor temp_tensor,
                                               Tensor idx_tensor, int b, int n,
                                               int m) {
  furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
                                                 idx_tensor, b, n, m);
}
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp

/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.

NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================
1. Definitions
"Licensor" means any person or entity that distributes its Work.
"Software" means the original work of authorship made available under
this License.
"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.
The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.
Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.
2. License Grants
2.1 Copyright Grant. Subject to the terms and conditions of this
License, each Licensor grants to you a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform,
sublicense and distribute its Work and any resulting derivative
works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only
if (a) you do so under this License, (b) you include a complete
copy of this License with your distribution, and (c) you retain
without modification any copyright, patent, trademark, or
attribution notices that are present in the Work.
3.2 Derivative Works. You may specify that additional or different
terms apply to the use, reproduction, and distribution of your
derivative works of the Work ("Your Terms") only if (a) Your Terms
provide that the use limitation in Section 3.3 applies to your
derivative works, and (b) you identify the specific derivative
works that are subject to Your Terms. Notwithstanding Your Terms,
this License (including the redistribution requirements in Section
3.1) will continue to apply to the Work itself.
3.3 Use Limitation. The Work and any derivative works thereof only
may be used or intended for use non-commercially. Notwithstanding
the foregoing, NVIDIA and its affiliates may use the Work and any
derivative works commercially. As used herein, "non-commercially"
means for research or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim
against any Licensor (including any claim, cross-claim or
counterclaim in a lawsuit) to enforce any patents that you allege
are infringed by any Work, then your rights under this License from
such Licensor (including the grant in Section 2.1) will terminate
immediately.
3.5 Trademarks. This License does not grant any rights to use any
Licensor’s or its affiliates’ names, logos, or trademarks, except
as necessary to reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your
rights under this License (including the grant in Section 2.1) will
terminate immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.
=======================================================================
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale) {
return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
act, grad, alpha, scale);
}
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
                                   const torch::Tensor& bias,
                                   const torch::Tensor& refer, int act,
                                   int grad, float alpha, float scale) {
  return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
                                      scale);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void gather_points_forward_impl(int b, int c, int n, int npoints,
                                const Tensor points, const Tensor idx,
                                Tensor out) {
  DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
                       idx, out);
}

void gather_points_backward_impl(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
                       idx, grad_points);
}

void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
                           Tensor out_tensor, int b, int c, int n,
                           int npoints) {
  gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
                             out_tensor);
}

void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                            Tensor grad_points_tensor, int b, int c, int n,
                            int npoints) {
  gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
                              grad_points_tensor);
}
@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
                               const Tensor points, const Tensor idx,
                               Tensor out) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points, idx, out);
}

void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
                                const Tensor grad_out, const Tensor idx,
                                Tensor grad_points) {
  DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
                       grad_out, idx, grad_points);
}

void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
                          Tensor out_tensor, int b, int c, int n, int npoints,
                          int nsample) {
  DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
                       points_tensor, idx_tensor, out_tensor);
}

void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
                           Tensor grad_points_tensor, int b, int c, int n,
                           int npoints, int nsample) {
  group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
                             idx_tensor, grad_points_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
#include <cuda_runtime_api.h>
int get_cudart_version() { return CUDART_VERSION; }
#endif
#endif
std::string get_compiling_cuda_version() {
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
std::ostringstream oss;
// copied from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
auto printCudaStyleVersion = [&](int v) {
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << "." << (v % 10);
}
};
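  // e.g. CUDART_VERSION == 11030 prints as "11.3",
  //      CUDART_VERSION == 10010 prints as "10.1"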
printCudaStyleVersion(get_cudart_version());
return oss.str();
#else
return std::string("rocm not available");
#endif
#else
return std::string("not available");
#endif
}
// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
{ ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif
#if defined(__clang_major__)
{
ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
<< __clang_patchlevel__;
}
#endif
#if defined(_MSC_VER)
{ ss << "MSVC " << _MSC_FULL_VER; }
#endif
return ss.str();
}
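For illustration, a hypothetical caller (not part of this commit) could combine the two helpers above to report the build environment:

#include <iostream>
// Hypothetical helper, assuming this translation unit is linked in:
// prints e.g. "compiler: GCC 9.4" and "cuda: 11.3"
// (or "not available" for a CPU-only build).
void print_build_info() {
  std::cout << "compiler: " << get_compiler_version() << std::endl;
  std::cout << "cuda: " << get_compiling_cuda_version() << std::endl;
}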
...@@ -8,225 +8,128 @@ All Rights Reserved 2019-2020. ...@@ -8,225 +8,128 @@ All Rights Reserved 2019-2020.
*/ */
#include "pytorch_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
#ifdef MMCV_WITH_CUDA void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
#include <cuda.h>
#include <cuda_runtime_api.h>
#define CHECK_ERROR(state) \
{ gpuAssert((state), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b, const int num_b, const Tensor boxes_b,
Tensor ans_overlap) { Tensor ans_overlap) {
IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b, DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
ans_overlap); num_b, boxes_b, ans_overlap);
}; }
void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou) {
IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_iou);
};
void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
};
void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask,
int boxes_num,
float nms_overlap_thresh);
void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
#endif
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b, void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
Tensor ans_overlap) { int boxes_num, float nms_overlap_thresh) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry] DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
// params boxes_b: (M, 5) nms_overlap_thresh);
// params ans_overlap: (N, M) }
if (boxes_a.device().is_cuda()) { void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
#ifdef MMCV_WITH_CUDA unsigned long long *mask, int boxes_num,
CHECK_CUDA_INPUT(boxes_a); float nms_overlap_thresh) {
CHECK_CUDA_INPUT(boxes_b); DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
CHECK_CUDA_INPUT(ans_overlap); nms_overlap_thresh);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_overlap_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
#else
AT_ERROR("iou3d_boxes_overlap_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_overlap_bev is not implemented on CPU");
}
} }
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b, void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_iou) { Tensor ans_overlap) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry] // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 5) // params boxes_b: (M, 5)
// params ans_overlap: (N, M) // params ans_overlap: (N, M)
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
if (boxes_a.device().is_cuda()) { iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
#ifdef MMCV_WITH_CUDA ans_overlap);
CHECK_CUDA_INPUT(boxes_a);
CHECK_CUDA_INPUT(boxes_b);
CHECK_CUDA_INPUT(ans_iou);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_iou_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b, ans_iou);
#else
AT_ERROR("iou3d_boxes_iou_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_iou_bev is not implemented on CPU");
}
} }
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                         float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  // each row i of `mask` packs, into col_blocks 64-bit words, the set of
  // boxes whose BEV IoU with box i exceeds nms_overlap_thresh
  iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  // greedy sweep on the host: keep box i unless an earlier kept box already
  // suppressed it, then fold its suppression bits into remv_cpu
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
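// Self-contained toy run of the host-side reduction used above: three boxes,
// a hand-written suppression mask (bit j of row i set means box i suppresses
// box j), and the same greedy bitmask sweep. The values are invented purely
// to illustrate the bookkeeping; the real mask is filled by
// iou3d_nms3d_forward_impl on the device.
#include <cstdio>
#include <vector>

int main() {
  const int boxes_num = 3;
  const int col_blocks = 1;  // (3 + 64 - 1) / 64 with 64 NMS threads per block
  // Row i holds the bits box i produces: box 0 suppresses box 1 (bit 1 set),
  // boxes 1 and 2 suppress nothing.
  unsigned long long mask_host[3] = {0b010ULL, 0ULL, 0ULL};

  std::vector<unsigned long long> remv(col_blocks, 0ULL);
  std::vector<int> keep;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / 64;
    int inblock = i % 64;
    if (!(remv[nblock] & (1ULL << inblock))) {
      keep.push_back(i);  // box i survives
      const unsigned long long *p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) remv[j] |= p[j];
    }
  }
  for (int i : keep) std::printf("keep box %d\n", i);  // prints 0 and 2
  return 0;
}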
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                                float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)
  CHECK_CONTIGUOUS(boxes);
  CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.size(0);
  int64_t *keep_data = keep.data_ptr<int64_t>();
  int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

  const int col_blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  unsigned long long *mask_data =
      (unsigned long long *)mask.data_ptr<int64_t>();
  iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
                                  nms_overlap_thresh);

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long *mask_host =
      (unsigned long long *)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv_cpu(col_blocks);
  memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  *keep_num_data = num_to_keep;
}
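// The refactor also spells out DIVUP(a, b) as (a + b - 1) / b when sizing the
// mask. A few compile-time checks (illustrative only) confirm the explicit
// form computes the same ceiling division for the 64-bit block count.
static_assert((0 + 64 - 1) / 64 == 0, "0 boxes -> 0 mask blocks");
static_assert((1 + 64 - 1) / 64 == 1, "1..64 boxes -> 1 mask block");
static_assert((65 + 64 - 1) / 64 == 2, "65 boxes -> 2 mask blocks");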
@@ -8,7 +8,7 @@

using namespace parrots;

#ifdef MMCV_WITH_CUDA
void iou3d_boxes_overlap_bev_forward_cuda_parrots(
    CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
    OperatorBase::out_list_t& outs) {
  auto boxes_a = buildATensor(ctx, ins[0]);
@@ -16,12 +16,12 @@
  auto ans_iou = buildATensor(ctx, outs[0]);

  iou3d_boxes_overlap_bev_forward(boxes_a, boxes_b, ans_iou);
}

void iou3d_nms3d_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                      const OperatorBase::in_list_t& ins,
                                      OperatorBase::out_list_t& outs) {
  float nms_overlap_thresh;
  SSAttrs(attr).get<float>("nms_overlap_thresh", nms_overlap_thresh).done();
@@ -30,13 +30,13 @@
  auto keep = buildATensor(ctx, outs[0]);
  auto keep_num = buildATensor(ctx, outs[1]);
  iou3d_nms3d_forward(boxes, keep, keep_num, nms_overlap_thresh);
}

void iou3d_nms3d_normal_forward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  float nms_overlap_thresh;
  SSAttrs(attr).get<float>("nms_overlap_thresh", nms_overlap_thresh).done();
@@ -45,26 +45,26 @@
  auto keep = buildATensor(ctx, outs[0]);
  auto keep_num = buildATensor(ctx, outs[1]);
  iou3d_nms3d_normal_forward(boxes, keep, keep_num, nms_overlap_thresh);
}

PARROTS_EXTENSION_REGISTER(iou3d_boxes_overlap_bev_forward)
    .input(2)
    .output(1)
    .apply(iou3d_boxes_overlap_bev_forward_cuda_parrots)
    .done();

PARROTS_EXTENSION_REGISTER(iou3d_nms3d_forward)
    .attr("nms_overlap_thresh")
    .input(1)
    .output(2)
    .apply(iou3d_nms3d_forward_cuda_parrots)
    .done();

PARROTS_EXTENSION_REGISTER(iou3d_nms3d_normal_forward)
    .attr("nms_overlap_thresh")
    .input(1)
    .output(2)
    .apply(iou3d_nms3d_normal_forward_cuda_parrots)
    .done();
#endif
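// All three registrations above follow the same builder pattern. For a
// hypothetical new op "foo_forward" (name, attribute, and torch-side entry
// point are invented for illustration), the wrapper and registration would
// look like this: build ATen views of the parrots ins/outs, forward to the
// PyTorch-facing function, then describe attrs/inputs/outputs to the registry.
#ifdef MMCV_WITH_CUDA
void foo_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                              const OperatorBase::in_list_t& ins,
                              OperatorBase::out_list_t& outs) {
  float threshold;
  SSAttrs(attr).get<float>("threshold", threshold).done();
  auto input = buildATensor(ctx, ins[0]);
  auto output = buildATensor(ctx, outs[0]);
  // foo_forward(input, output, threshold);  // hypothetical torch-side entry
}

PARROTS_EXTENSION_REGISTER(foo_forward)
    .attr("threshold")
    .input(1)
    .output(1)
    .apply(foo_forward_cuda_parrots)
    .done();
#endif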
@@ -4,13 +4,13 @@
#include <torch/extension.h>
using namespace at;

void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                     Tensor ans_overlap);

void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                         float nms_overlap_thresh);

void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
                                float nms_overlap_thresh);

#endif  // IOU_3D_PYTORCH_H
@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
                       dist2);
}

void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
                 Tensor dist2_tensor, int b, int n, int m, int nsample) {
  knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
                   dist2_tensor);
}
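// knn_forward is the entry point a Python binding would call. A sketch of a
// typical pybind11 exposure for a torch extension; the module name and the
// keyword argument names are assumptions, not necessarily how mmcv's own
// binding file declares them.
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("knn_forward", &knn_forward, "knn forward",
        py::arg("xyz_tensor"), py::arg("new_xyz_tensor"), py::arg("idx_tensor"),
        py::arg("dist2_tensor"), py::arg("b"), py::arg("n"), py::arg("m"),
        py::arg("nsample"));
}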
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor col,
                                const int kernel_h, const int kernel_w,
                                const int pad_h, const int pad_w) {
  DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
                       col, kernel_h, kernel_w, pad_h, pad_w);
}

void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
                                const Tensor mask_w_idx, Tensor im, int height,
                                int width, int channels) {
  DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
                       im, height, width, channels);
}

void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}

void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor im, int height,
                           int width, int channels) {
  // im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
  masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
                             channels);
}
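// A sketch of how a caller might size the column buffer before invoking
// masked_im2col_forward. The (kh * kw * ic, n_masked) layout follows the
// shape comments above, with the second dimension assumed to equal the number
// of masked positions listed in mask_h_idx / mask_w_idx; treat the helper and
// its exact sizes as assumptions rather than mmcv's canonical call sequence.
void run_masked_im2col(const Tensor im, const Tensor mask_h_idx,
                       const Tensor mask_w_idx, int kernel_h, int kernel_w,
                       int pad_h, int pad_w) {
  const int64_t channels = im.size(1);         // ic in the comments above
  const int64_t n_masked = mask_h_idx.size(0);
  Tensor col = at::zeros({channels * kernel_h * kernel_w, n_masked},
                         im.options());
  masked_im2col_forward(im, mask_h_idx, mask_w_idx, col, kernel_h, kernel_w,
                        pad_h, pad_w);
}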
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
  DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}

void min_area_polygons(const Tensor pointsets, Tensor polygons) {
  min_area_polygons_impl(pointsets, polygons);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "min_area_polygons_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void min_area_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,
                                    const OperatorBase::in_list_t& ins,
                                    OperatorBase::out_list_t& outs) {
  auto pointsets = buildATensor(ctx, ins[0]);
  auto polygons = buildATensor(ctx, outs[0]);
  min_area_polygons(pointsets, polygons);
}

PARROTS_EXTENSION_REGISTER(min_area_polygons)
    .input(1)
    .output(1)
    .apply(min_area_polygons_cuda_parrots)
    .done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_PYTORCH_H
#define MIN_AREA_POLYGONS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void min_area_polygons(const Tensor pointsets, Tensor polygons);
#endif // MIN_AREA_POLYGONS_PYTORCH_H