Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void deformable_im2col_impl(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, data_col);
}
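// NOTE: minimal sketch of how a backend hooks into the dispatcher above,
// assuming the REGISTER_DEVICE_IMPL macro from pytorch_device_registry.hpp;
// the backend function shown is illustrative only.
//
//   void deformable_im2col_cuda(Tensor data_im, Tensor data_offset,
//                               const int channels, const int height,
//                               const int width, const int ksize_h,
//                               const int ksize_w, const int pad_h,
//                               const int pad_w, const int stride_h,
//                               const int stride_w, const int dilation_h,
//                               const int dilation_w, const int parallel_imgs,
//                               const int deformable_group, Tensor data_col);
//   REGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda);
//
// DISPATCH_DEVICE_IMPL then forwards the call to whichever implementation was
// registered for the device of the tensor arguments at run time.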
#ifdef MMCV_WITH_CUDA
void deformable_im2col(Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im(Tensor data_col, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
#endif
void deformable_col2im_impl(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels,
height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, parallel_imgs,
deformable_group, grad_im);
}
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col);
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im);
void deformable_col2im_coord_cpu(
void deformable_col2im_coord_impl(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset);
const int deformable_group, Tensor grad_offset) {
DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im,
data_offset, channels, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
parallel_imgs, deformable_group, grad_offset);
}
void deform_conv_shape_check(at::Tensor input, at::Tensor offset,
at::Tensor *gradOutput, at::Tensor weight, int kH,
@@ -227,17 +216,9 @@ void deform_conv_forward(Tensor input, Tensor weight, Tensor offset,
output_buffer.size(2), output_buffer.size(3)});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
}
deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
@@ -373,29 +354,15 @@ void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput,
{gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH,
dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
#endif
} else {
deformable_col2im_coord_cpu(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im_cpu(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
}
deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW,
dH, dW, dilationH, dilationW, im2col_step,
deformable_group, gradOffset[elt]);
deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group,
gradInput[elt]);
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
@@ -508,17 +475,9 @@ void deform_conv_backward_parameters(Tensor input, Tensor offset,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
#endif
} else {
deformable_im2col_cpu(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
}
deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
// divide into group
gradOutputBuffer = gradOutputBuffer.view(
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
template <typename T>
T deformable_im2col_bilinear_cpu(const T *input, const int data_width,
const int height, const int width, T h, T w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
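// NOTE: worked example for the bilinear sampler above, assuming a 2x2 patch:
// with h = w = 0.5 the corner weights are w1 = w2 = w3 = w4 = 0.25, so the
// result is the mean of the four pixels; with integer h, w it degenerates to
// a direct lookup (lh = lw = 0, w1 = 1). Out-of-bounds corners contribute 0.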
template <typename T>
T get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w,
const int height, const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
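// NOTE: get_gradient_weight_cpu returns the bilinear coefficient with which
// the integer pixel (h, w) contributed to the forward sample taken at the
// fractional location (argmax_h, argmax_w); only the four surrounding pixels
// receive a non-zero weight. It is used below to scatter the upstream
// gradient back onto grad_im.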
template <typename T>
T get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
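// NOTE: get_coordinate_weight_cpu is the partial derivative of the bilinear
// sample at (argmax_h, argmax_w) with respect to the sampling coordinate:
// bp_dir == 0 differentiates along h (offset_h), bp_dir == 1 along w
// (offset_w). deformable_col2im_coord_cpu_kernel uses it to form the offset
// gradients.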
template <typename T>
void deformable_im2col_cpu_kernel(
const int n, const T *data_im, const T *data_offset, const int height,
const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
for (int index = 0; index < n; index++) {
// index: linear index into the output (column) matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = deformable_im2col_bilinear_cpu(data_im_ptr, width, height,
width, h_im, w_im);
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
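// NOTE: in the kernel above, each linear index decomposes into
// (c_im, b_col, h_col, w_col); for every kernel tap (i, j) the input is
// sampled bilinearly at (h_in + i * dilation_h + offset_h,
// w_in + j * dilation_w + offset_w) and stored in the column buffer at
// channel c_im * kernel_h * kernel_w + i * kernel_w + j.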
template <typename T>
void deformable_col2im_cpu_kernel(
const int n, const T *data_col, const T *data_offset, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
for (int index = 0; index < n; index++) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
*(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad;
}
}
}
}
}
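// NOTE: the dy/dx window of +/-2 combined with the |difference| < 1 test
// selects exactly the (at most four) integer pixels adjacent to the
// fractional sampling location, so each column-gradient entry is scattered
// back to the same pixels that produced it in the forward pass, weighted by
// get_gradient_weight_cpu.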
template <typename T>
void deformable_col2im_coord_cpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T *grad_offset) {
for (int index = 0; index < n; index++) {
T val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
const T weight = get_coordinate_weight_cpu(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
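// NOTE: grad_offset holds 2 * kernel_h * kernel_w channels per deformable
// group (an h and a w offset for every kernel tap). For each offset entry the
// loop above accumulates d(sampled value)/d(offset) * d(loss)/d(column entry)
// over all input channels belonging to that deformable group.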
void deformable_im2col_cpu(Tensor data_im, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor data_col) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_cpu", [&] {
deformable_im2col_cpu_kernel<scalar_t>(
num_kernels, data_im.data_ptr<scalar_t>(),
data_offset.data_ptr<scalar_t>(), height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels,
deformable_group, height_col, width_col,
data_col.data_ptr<scalar_t>());
});
}
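// NOTE: worked example for the output-size formula above: height = 7,
// pad_h = 1, dilation_h = 1, ksize_h = 3, stride_h = 2 gives
// height_col = (7 + 2 * 1 - (1 * (3 - 1) + 1)) / 2 + 1 = 6 / 2 + 1 = 4,
// the usual convolution output size.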
void deformable_col2im_cpu(Tensor data_col, Tensor data_offset,
const int channels, const int height,
const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
Tensor grad_im) {
// todo: make sure parallel_imgs is passed in correctly
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels =
channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, channel_per_deformable_group, parallel_imgs,
deformable_group, height_col, width_col, grad_im_);
}));
}
void deformable_col2im_coord_cpu(
Tensor data_col, Tensor data_im, Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, Tensor grad_offset) {
int height_col =
(height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col =
(width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
deformable_group * parallel_imgs;
int channel_per_deformable_group =
channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_cpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_cpu_kernel<scalar_t>(
num_kernels, data_col_, data_im_, data_offset_, channels, height,
width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs,
2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor offset, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale,
int sampling_ratio, float gamma);
void DeformRoIPoolBackwardCUDAKernelLauncher(
Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio, float gamma);
void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset,
void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output,
pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset,
output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma);
}
void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input,
void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input,
Tensor rois, Tensor offset,
Tensor grad_input, Tensor grad_offset,
int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma);
DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois,
offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma);
}
#endif
void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset,
Tensor output, int pooled_height, int pooled_width,
float spatial_scale, int sampling_ratio,
float gamma) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(output);
deform_roi_pool_forward_cuda(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height,
pooled_width, spatial_scale, sampling_ratio,
gamma);
}
void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
@@ -61,22 +36,7 @@ void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_offset, int pooled_height,
int pooled_width, float spatial_scale,
int sampling_ratio, float gamma) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_offset);
deform_roi_pool_backward_cuda(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
#else
AT_ERROR("DeformRoIPool is not compiled with GPU support");
#endif
} else {
AT_ERROR("DeformRoIPool is not implemented on CPU");
}
deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input,
grad_offset, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid) {
return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
vertices, mask, num_valid);
}
Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
Tensor num_valid) {
return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "diff_iou_rotated_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void diff_iou_rotated_sort_vertices_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
at::Tensor boxes, scores, dets;
auto vertices = buildATensor(ctx, ins[0]);
auto mask = buildATensor(ctx, ins[1]);
auto num_valid = buildATensor(ctx, ins[2]);
auto out =
diff_iou_rotated_sort_vertices_forward_cuda(vertices, mask, num_valid);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(diff_iou_rotated_sort_vertices_forward)
.input(3)
.output(1)
.apply(diff_iou_rotated_sort_vertices_forward_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DIFF_IOU_ROTATED_PYTORCH_H
#define DIFF_IOU_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,
Tensor num_valid);
#endif // DIFF_IOU_ROTATED_PYTORCH_H
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight,
Tensor grad_input,
const float gamma,
const float alpha);
void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input,
const float gamma,
const float alpha);
void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha);
DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight,
output, gamma, alpha);
}
void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target,
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha) {
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha);
DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight,
grad_input, gamma, alpha);
}
void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight,
void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha);
DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight,
output, gamma, alpha);
}
void softmax_focal_loss_backward_cuda(Tensor input, Tensor target,
void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor buff,
Tensor grad_input, float gamma,
float alpha) {
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha);
DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight,
buff, grad_input, gamma, alpha);
}
#endif
void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
sigmoid_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}
void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(grad_input);
sigmoid_focal_loss_backward_cuda(input, target, weight, grad_input, gamma,
alpha);
#else
AT_ERROR("SigmoidFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SigmoidFocalLoss is not implemented on CPU");
}
sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma,
alpha);
}
void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(output);
softmax_focal_loss_forward_cuda(input, target, weight, output, gamma,
alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha);
}
void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight,
Tensor buff, Tensor grad_input, float gamma,
float alpha) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(target);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(buff);
CHECK_CUDA_INPUT(grad_input);
softmax_focal_loss_backward_cuda(input, target, weight, buff, grad_input,
gamma, alpha);
#else
AT_ERROR("SoftmaxFocalLoss is not compiled with GPU support");
#endif
} else {
AT_ERROR("SoftmaxFocalLoss is not implemented on CPU");
}
softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input,
gamma, alpha);
}
@@ -2,61 +2,33 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m,
const float *dataset,
float *temp, int *idxs);
void furthest_point_sampling_forward_cuda(int b, int n, int m,
const float *dataset, float *temp,
int *idxs) {
FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs);
void furthest_point_sampling_forward_impl(Tensor points_tensor,
Tensor temp_tensor, Tensor idx_tensor,
int b, int n, int m) {
DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor,
temp_tensor, idx_tensor, b, n, m);
}
void FurthestPointSamplingWithDistForwardCUDAKernelLauncher(
int b, int n, int m, const float *dataset, float *temp, int *idxs);
void furthest_point_sampling_with_dist_forward_cuda(int b, int n, int m,
const float *dataset,
float *temp, int *idxs) {
FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, dataset, temp,
idxs);
void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor,
Tensor temp_tensor,
Tensor idx_tensor, int b,
int n, int m) {
DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl,
points_tensor, temp_tensor, idx_tensor, b, n, m);
}
#endif
void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor,
Tensor idx_tensor, int b, int n, int m) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
const float *points = points_tensor.data_ptr<float>();
float *temp = temp_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
furthest_point_sampling_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR("furthest_point_sampling is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling is not implemented on CPU");
}
furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor,
b, n, m);
}
void furthest_point_sampling_with_dist_forward(Tensor points_tensor,
Tensor temp_tensor,
Tensor idx_tensor, int b, int n,
int m) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
const float *points = points_tensor.data<float>();
float *temp = temp_tensor.data<float>();
int *idx = idx_tensor.data<int>();
furthest_point_sampling_with_dist_forward_cuda(b, n, m, points, temp, idx);
#else
AT_ERROR(
"furthest_point_sampling_with_dist is not compiled with GPU support");
#endif
} else {
AT_ERROR("furthest_point_sampling_with_dist is not implemented on CPU");
}
furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor,
idx_tensor, b, n, m);
}
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// from
// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor &input,
const torch::Tensor &bias,
const torch::Tensor &refer, int act,
int grad, float alpha, float scale);
/*
Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
Augmentation (ADA)
=======================================================================
1. Definitions
"Licensor" means any person or entity that distributes its Work.
"Software" means the original work of authorship made available under
this License.
"Work" means the Software and any additions to or derivative works of
the Software that are made available under this License.
The terms "reproduce," "reproduction," "derivative works," and
"distribution" have the meaning as provided under U.S. copyright law;
provided, however, that for the purposes of this License, derivative
works shall not include works that remain separable from, or merely
link (or bind by name) to the interfaces of, the Work.
Works, including the Software, are "made available" under this License
by including in or with the Work either (a) a copyright notice
referencing the applicability of this License to the Work, or (b) a
copy of this License.
2. License Grants
2.1 Copyright Grant. Subject to the terms and conditions of this
License, each Licensor grants to you a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce,
prepare derivative works of, publicly display, publicly perform,
sublicense and distribute its Work and any resulting derivative
works in any form.
3. Limitations
3.1 Redistribution. You may reproduce or distribute the Work only
if (a) you do so under this License, (b) you include a complete
copy of this License with your distribution, and (c) you retain
without modification any copyright, patent, trademark, or
attribution notices that are present in the Work.
3.2 Derivative Works. You may specify that additional or different
terms apply to the use, reproduction, and distribution of your
derivative works of the Work ("Your Terms") only if (a) Your Terms
provide that the use limitation in Section 3.3 applies to your
derivative works, and (b) you identify the specific derivative
works that are subject to Your Terms. Notwithstanding Your Terms,
this License (including the redistribution requirements in Section
3.1) will continue to apply to the Work itself.
#endif
3.3 Use Limitation. The Work and any derivative works thereof only
may be used or intended for use non-commercially. Notwithstanding
the foregoing, NVIDIA and its affiliates may use the Work and any
derivative works commercially. As used herein, "non-commercially"
means for research or evaluation purposes only.
3.4 Patent Claims. If you bring or threaten to bring a patent claim
against any Licensor (including any claim, cross-claim or
counterclaim in a lawsuit) to enforce any patents that you allege
are infringed by any Work, then your rights under this License from
such Licensor (including the grant in Section 2.1) will terminate
immediately.
3.5 Trademarks. This License does not grant any rights to use any
Licensor’s or its affiliates’ names, logos, or trademarks, except
as necessary to reproduce the notices described in this License.
3.6 Termination. If you violate any term of this License, then your
rights under this License (including the grant in Section 2.1) will
terminate immediately.
4. Disclaimer of Warranty.
THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
THIS LICENSE.
5. Limitation of Liability.
EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
THE POSSIBILITY OF SUCH DAMAGES.
=======================================================================
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale) {
return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer,
act, grad, alpha, scale);
}
torch::Tensor fused_bias_leakyrelu(const torch::Tensor &input,
const torch::Tensor &bias,
const torch::Tensor &refer, int act,
torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input,
const torch::Tensor& bias,
const torch::Tensor& refer, int act,
int grad, float alpha, float scale) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA(input);
CHECK_CUDA(bias);
return fused_bias_leakyrelu_op(input, bias, refer, act, grad, alpha, scale);
#else
AT_ERROR("Fused bias leakyrelu is not compiled with GPU support");
#endif
return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha,
scale);
}
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor points,
const Tensor idx, Tensor out);
void gather_points_forward_cuda(int b, int c, int n, int npoints,
void gather_points_forward_impl(int b, int c, int n, int npoints,
const Tensor points, const Tensor idx,
Tensor out) {
GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out);
};
void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points,
idx, out);
}
void gather_points_backward_cuda(int b, int c, int n, int npoints,
void gather_points_backward_impl(int b, int c, int n, int npoints,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx,
grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out,
idx, grad_points);
}
void gather_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n,
int npoints) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
gather_points_forward_cuda(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor,
out_tensor);
}
void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n,
int npoints) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
gather_points_backward_cuda(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
#else
AT_ERROR("gather_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("gather_points is not implemented on CPU");
}
gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor,
grad_points_tensor);
}
@@ -3,56 +3,32 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor points,
const Tensor idx, Tensor out);
void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_forward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor points, const Tensor idx,
Tensor out) {
GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx,
out);
};
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points, idx, out);
}
void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints,
int nsample, const Tensor grad_out,
const Tensor idx,
Tensor grad_points);
void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample,
void group_points_backward_impl(int b, int c, int n, int npoints, int nsample,
const Tensor grad_out, const Tensor idx,
Tensor grad_points) {
GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out,
idx, grad_points);
};
#endif
DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample,
grad_out, idx, grad_points);
}
void group_points_forward(Tensor points_tensor, Tensor idx_tensor,
Tensor out_tensor, int b, int c, int n, int npoints,
int nsample) {
if (points_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
group_points_forward_cuda(b, c, n, npoints, nsample, points_tensor,
idx_tensor, out_tensor);
#else
AT_ERROR("group_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("group_points is not implemented on CPU");
}
DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample,
points_tensor, idx_tensor, out_tensor);
}
void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor,
Tensor grad_points_tensor, int b, int c, int n,
int npoints, int nsample) {
if (grad_out_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
group_points_backward_cuda(b, c, n, npoints, nsample, grad_out_tensor,
idx_tensor, grad_points_tensor);
#else
AT_ERROR("group_points is not compiled with GPU support");
#endif
} else {
AT_ERROR("group_points is not implemented on CPU");
}
group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor,
idx_tensor, grad_points_tensor);
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
#include <cuda_runtime_api.h>
int get_cudart_version() { return CUDART_VERSION; }
#endif
#endif
std::string get_compiling_cuda_version() {
#ifdef MMCV_WITH_CUDA
#ifndef HIP_DIFF
std::ostringstream oss;
// copied from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
auto printCudaStyleVersion = [&](int v) {
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << "." << (v % 10);
}
};
printCudaStyleVersion(get_cudart_version());
return oss.str();
#else
return std::string("rocm not available");
#endif
#else
return std::string("not available");
#endif
}
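// NOTE: printCudaStyleVersion formats the integer CUDART_VERSION, e.g.
// 11030 -> "11.3" (major = v / 1000, minor = v / 10 % 100, with a third
// ".patch" component only when v % 10 != 0).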
// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
{ ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif
#if defined(__clang_major__)
{
ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
<< __clang_patchlevel__;
}
#endif
#if defined(_MSC_VER)
{ ss << "MSVC " << _MSC_FULL_VER; }
#endif
return ss.str();
}
@@ -8,225 +8,128 @@ All Rights Reserved 2019-2020.
*/
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
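// NOTE: sizeof(unsigned long long) * 8 == 64 on the supported platforms, so
// each 64-bit word of the NMS mask encodes one box's overlap decisions
// against a block of 64 boxes; col_blocks below is the number of such words
// per box.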
#ifdef MMCV_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#define CHECK_ERROR(state) \
{ gpuAssert((state), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_overlap) {
IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
};
void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou) {
IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_iou);
};
void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
};
void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long *mask,
int boxes_num,
float nms_overlap_thresh);
void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
#endif
DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a,
num_b, boxes_b, ans_overlap);
}
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh) {
DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
if (boxes_a.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_a);
CHECK_CUDA_INPUT(boxes_b);
CHECK_CUDA_INPUT(ans_overlap);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_overlap_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
#else
AT_ERROR("iou3d_boxes_overlap_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_overlap_bev is not implemented on CPU");
}
void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
unsigned long long *mask, int boxes_num,
float nms_overlap_thresh) {
DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_iou) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
if (boxes_a.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes_a);
CHECK_CUDA_INPUT(boxes_b);
CHECK_CUDA_INPUT(ans_iou);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
iou3d_boxes_iou_bev_forward_cuda(num_a, boxes_a, num_b, boxes_b, ans_iou);
#else
AT_ERROR("iou3d_boxes_iou_bev is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_boxes_iou_bev is not implemented on CPU");
}
iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b,
ans_overlap);
}
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_forward_cuda(boxes, mask_data, boxes_num, nms_overlap_thresh);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
*keep_num_data = num_to_keep;
#else
AT_ERROR("iou3d_nms is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms is not implemented on CPU");
}
}
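// NOTE: the host-side loop above is the greedy suppression step: boxes are
// visited in input order (the caller is expected to pass them sorted by
// score), a box is kept only if no previously kept box has set its bit in the
// overlap mask, and each kept box ORs its mask row into remv_cpu so that the
// boxes it overlaps are skipped later. The same pattern is repeated in
// iou3d_nms3d_normal_forward below.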
void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh) {
// params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
int boxes_num = boxes.size(0);
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms_normal_forward_cuda(boxes, mask_data, boxes_num,
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
(unsigned long long *)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
std::vector<unsigned long long> remv_cpu(col_blocks);
memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / THREADS_PER_BLOCK_NMS;
int inblock = i % THREADS_PER_BLOCK_NMS;
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
if (!(remv_cpu[nblock] & (1ULL << inblock))) {
keep_data[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_cpu[j] |= p[j];
}
}
if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
*keep_num_data = num_to_keep;
#else
AT_ERROR("iou3d_nms_normal is not compiled with GPU support");
#endif
} else {
AT_ERROR("iou3d_nms_normal is not implemented on CPU");
}
*keep_num_data = num_to_keep;
}
@@ -8,7 +8,7 @@
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void iou3d_boxes_iou_bev_forward_cuda_parrots(
void iou3d_boxes_overlap_bev_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto boxes_a = buildATensor(ctx, ins[0]);
@@ -16,12 +16,12 @@ void iou3d_boxes_iou_bev_forward_cuda_parrots(
auto ans_iou = buildATensor(ctx, outs[0]);
iou3d_boxes_iou_bev_forward(boxes_a, boxes_b, ans_iou);
iou3d_boxes_overlap_bev_forward(boxes_a, boxes_b, ans_iou);
}
void iou3d_nms_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
void iou3d_nms3d_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float nms_overlap_thresh;
SSAttrs(attr).get<float>("nms_overlap_thresh", nms_overlap_thresh).done();
@@ -30,13 +30,13 @@ void iou3d_nms_forward_cuda_parrots(CudaContext& ctx, const SSElement& attr,
auto keep = buildATensor(ctx, outs[0]);
auto keep_num = buildATensor(ctx, outs[1]);
iou3d_nms_forward(boxes, keep, keep_num, nms_overlap_thresh);
iou3d_nms3d_forward(boxes, keep, keep_num, nms_overlap_thresh);
}
void iou3d_nms_normal_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
void iou3d_nms3d_normal_forward_cuda_parrots(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float nms_overlap_thresh;
SSAttrs(attr).get<float>("nms_overlap_thresh", nms_overlap_thresh).done();
@@ -45,26 +45,26 @@ void iou3d_nms_normal_forward_cuda_parrots(CudaContext& ctx,
auto keep = buildATensor(ctx, outs[0]);
auto keep_num = buildATensor(ctx, outs[1]);
iou3d_nms_normal_forward(boxes, keep, keep_num, nms_overlap_thresh);
iou3d_nms3d_normal_forward(boxes, keep, keep_num, nms_overlap_thresh);
}
PARROTS_EXTENSION_REGISTER(iou3d_boxes_iou_bev_forward)
PARROTS_EXTENSION_REGISTER(iou3d_boxes_overlap_bev_forward)
.input(2)
.output(1)
.apply(iou3d_boxes_iou_bev_forward_cuda_parrots)
.apply(iou3d_boxes_overlap_bev_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(iou3d_nms_forward)
PARROTS_EXTENSION_REGISTER(iou3d_nms3d_forward)
.attr("nms_overlap_thresh")
.input(1)
.output(2)
.apply(iou3d_nms_forward_cuda_parrots)
.apply(iou3d_nms3d_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(iou3d_nms_normal_forward)
PARROTS_EXTENSION_REGISTER(iou3d_nms3d_normal_forward)
.attr("nms_overlap_thresh")
.input(1)
.output(2)
.apply(iou3d_nms_normal_forward_cuda_parrots)
.apply(iou3d_nms3d_normal_forward_cuda_parrots)
.done();
#endif
@@ -4,13 +4,13 @@
#include <torch/extension.h>
using namespace at;
void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_iou);
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap);
void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
float nms_overlap_thresh);
#endif // IOU_3D_PYTORCH_H
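As a sketch of how the renamed entry points above are driven: the caller allocates the `keep` index buffer and the `keep_num` counter and reads them back after the call. The CPU int64 buffers and the `(N, 7)` CUDA box layout follow mmcv's Python wrapper and are stated here as assumptions, not taken from this header.

```cpp
#include <torch/extension.h>

// Illustrative caller for iou3d_nms3d_forward declared above.
// boxes: (N, 7) CUDA tensor [x, y, z, dx, dy, dz, heading] (assumed layout).
torch::Tensor nms3d_example(const torch::Tensor &boxes, float thresh) {
  auto opts = torch::dtype(torch::kLong);           // CPU int64 buffers (assumed)
  auto keep = torch::zeros({boxes.size(0)}, opts);  // kept box indices
  auto keep_num = torch::zeros({1}, opts);          // number of kept boxes
  iou3d_nms3d_forward(boxes, keep, keep_num, thresh);
  const int64_t n = keep_num.item<int64_t>();
  return keep.slice(/*dim=*/0, /*start=*/0, /*end=*/n);
}
```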
@@ -2,31 +2,16 @@
// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
Tensor idx, Tensor dist2);
void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
const Tensor new_xyz, Tensor idx, Tensor dist2) {
KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx,
dist2);
}
#endif
void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor,
Tensor dist2_tensor, int b, int n, int m, int nsample) {
if (new_xyz_tensor.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(new_xyz_tensor);
CHECK_CUDA_INPUT(xyz_tensor);
knn_forward_cuda(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
dist2_tensor);
#else
AT_ERROR("knn is not compiled with GPU support");
#endif
} else {
AT_ERROR("knn is not implemented on CPU");
}
knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor,
dist2_tensor);
}
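The knn entry point above no longer branches on the device itself; `DISPATCH_DEVICE_IMPL` routes the call through the registry from `pytorch_device_registry.hpp`. A sketch of how a backend plugs into that registry is below; `REGISTER_DEVICE_IMPL` and the `knn_forward_cuda` name follow the pattern used elsewhere in this tree, but treat the exact wiring as illustrative.

```cpp
// In a CUDA-only translation unit (sketch): forward to the kernel launcher
// and register the function as the CUDA implementation of knn_forward_impl.
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
                                  const Tensor xyz, const Tensor new_xyz,
                                  Tensor idx, Tensor dist2);

void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz,
                      const Tensor new_xyz, Tensor idx, Tensor dist2) {
  KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2);
}

REGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda);
```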
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#ifdef MMCV_WITH_CUDA
void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int kernel_h,
const int kernel_w, const int pad_h,
const int pad_w);
void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data,
const Tensor mask_h_idx,
const Tensor mask_w_idx,
Tensor top_data, const int height,
const int width, const int channels);
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w);
DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx,
col, kernel_h, kernel_w, pad_h, pad_w);
}
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh)
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels);
DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx,
im, height, width, channels);
}
#endif
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w) {
if (im.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(im);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(col);
masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h,
kernel_w, pad_h, pad_w);
}
void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels) {
if (col.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(col);
CHECK_CUDA_INPUT(mask_h_idx);
CHECK_CUDA_INPUT(mask_w_idx);
CHECK_CUDA_INPUT(im);
masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
#else
AT_ERROR("MaskConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("MaskConv is not implemented on CPU");
}
masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width,
channels);
}
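For orientation, the shapes the masked-conv entry points above expect can be reconstructed from the inline comments: `col` holds one `(kh * kw * ic)`-long column per masked output location. A minimal allocation sketch follows; the exact shape convention mirrors mmcv's Python wrapper and is an assumption here.

```cpp
#include <torch/extension.h>

// Sketch: allocate the col buffer and run the masked im2col entry point
// defined above. mask_h_idx / mask_w_idx list the masked output positions.
torch::Tensor masked_im2col_example(const torch::Tensor &im,        // (1, ic, h, w)
                                    const torch::Tensor &mask_h_idx,
                                    const torch::Tensor &mask_w_idx,
                                    int kernel_h, int kernel_w,
                                    int pad_h, int pad_w) {
  const int64_t channels = im.size(1);
  const int64_t mask_cnt = mask_h_idx.size(0);
  // One column of length ic * kh * kw per masked location (assumed layout).
  auto col = torch::zeros({channels * kernel_h * kernel_w, mask_cnt},
                          im.options());
  masked_im2col_forward(im, mask_h_idx, mask_w_idx, col, kernel_h, kernel_w,
                        pad_h, pad_w);
  return col;
}
```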
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}
void min_area_polygons(const Tensor pointsets, Tensor polygons) {
min_area_polygons_impl(pointsets, polygons);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "min_area_polygons_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void min_area_polygons_cuda_parrots(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto pointsets = buildATensor(ctx, ins[0]);
auto polygons = buildATensor(ctx, outs[0]);
min_area_polygons(pointsets, polygons);
}
PARROTS_EXTENSION_REGISTER(min_area_polygons)
.input(1)
.output(1)
.apply(min_area_polygons_cuda_parrots)
.done();
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_PYTORCH_H
#define MIN_AREA_POLYGONS_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void min_area_polygons(const Tensor pointsets, Tensor polygons);
#endif // MIN_AREA_POLYGONS_PYTORCH_H
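A brief usage sketch for the declaration above. The `(N, 18)` point-set input and `(N, 8)` polygon output follow the mmcv.ops.min_area_polygons convention and are assumptions here, not taken from this header; the op is dispatched through the device registry, so the inputs are expected on a supported device (CUDA).

```cpp
#include <torch/extension.h>

// Sketch: allocate the output polygons and call the entry point above.
// pointsets: (N, 18) coordinates; polygons: (N, 8) corner coordinates (assumed).
torch::Tensor min_area_polygons_example(const torch::Tensor &pointsets) {
  auto polygons = torch::zeros({pointsets.size(0), 8}, pointsets.options());
  min_area_polygons(pointsets, polygons);
  return polygons;
}
```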