Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "pytorch_cpp_helper.hpp"
Tensor bottom_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
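// Note: the loop below computes a cumulative max along the height axis in
// O(log H) passes by doubling the shift `ind` each pass; after pass k every
// row holds the max over the 2^k input rows ending at it, which converges to
// the naive recurrence out[h] = max(out[h], out[h - 1]) applied top to bottom.
// left_pool/right_pool/top_pool below use the same trick along the width axis
// or in the opposite direction.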
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, ind, height);
Tensor cur_temp = at::slice(output, 2, ind, height).clone();
Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(2, 0);
auto grad_output_temp = grad_output.select(2, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
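// Note: the backward pass re-scans the input row by row, tracking per
// (batch, channel, column) the running maximum (max_val) and the row index
// where it was attained (max_ind); each row's incoming gradient is then
// scatter-added to that argmax row, mirroring where the forward max came from.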
for (int32_t ind = 0; ind < height - 1; ++ind) {
input_temp = input.select(2, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
Tensor left_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, 0, width - ind);
Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
Tensor next_temp = at::slice(output, 3, ind, width).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor left_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, ind, width);
Tensor cur_temp = at::slice(output, 3, ind, width).clone();
Tensor next_temp = at::slice(output, 3, 0, width - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
Tensor output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(3, 0);
auto grad_output_temp = grad_output.select(3, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < width - 1; ++ind) {
input_temp = input.select(3, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor top_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, 0, height - ind);
Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone();
Tensor next_temp = at::slice(output, 2, ind, height).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, height - 1);
max_val.copy_(input_temp);
max_ind.fill_(height - 1);
auto output_temp = output.select(2, height - 1);
auto grad_output_temp = grad_output.select(2, height - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < height; ++ind) {
input_temp = input.select(2, height - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, height - ind - 1);
grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cpu/ActiveRotatingFilter_cpu.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
void active_rotated_filter_forward_cpu_kernel(
const T* weightData, const int* indicesData, const int num_output_planes,
const int num_input_planes, const int num_orientations, const int kH,
const int kW, const int num_rotations, T* outputData) {
const int nEntry = num_orientations * kH * kW;
int i, j, l;
int k;
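// Layout: weightData is treated as [num_output_planes, num_input_planes,
// nEntry] with nEntry = num_orientations * kH * kW, and outputData as
// [num_output_planes, num_rotations, num_input_planes, nEntry]. For each
// weight entry l, indicesData[l * num_rotations + k] gives the (1-based)
// destination position inside the k-th rotated copy, so every rotation is a
// pure permutation of the original filter values.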
#pragma omp parallel for private(i, j, l, k)
for (i = 0; i < num_output_planes; i++) {
for (j = 0; j < num_input_planes; j++) {
for (l = 0; l < nEntry; l++) {
int weightIndex = i * num_input_planes * nEntry + j * nEntry + l;
T val = *(weightData + weightIndex);
for (k = 0; k < num_rotations; k++) {
int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
T* target = outputData +
i * (num_rotations * num_input_planes * nEntry) +
k * (num_input_planes * nEntry) + j * (nEntry) + index;
*target = val;
}
}
}
}
}
template <typename T>
void active_rotated_filter_backward_cpu_kernel(
const T* gradOutputData, const int* indicesData,
const int num_output_planes, const int num_input_planes,
const int num_orientations, const int kH, const int kW,
const int num_rotations, T* gradInputData) {
const int nEntry = num_orientations * kH * kW;
int i, j, l;
int k;
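// Backward of the permutation above: each entry of the filter gradient
// accumulates the gradients of all num_rotations rotated copies, read back
// through the same (1-based) index mapping.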
#pragma omp parallel for private(i, j, l, k)
for (i = 0; i < num_output_planes; i++) {
for (j = 0; j < num_input_planes; j++) {
for (l = 0; l < nEntry; l++) {
int gradInputIndex = i * num_input_planes * nEntry + j * nEntry + l;
T* val = gradInputData + gradInputIndex;
*val = 0;
for (k = 0; k < num_rotations; k++) {
int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
const T* target =
gradOutputData + i * (num_rotations * num_input_planes * nEntry) +
k * (num_input_planes * nEntry) + j * (nEntry) + index;
*val = *val + *target;
}
}
}
}
}
void ActiveRotatedFilterForwardCPULauncher(const Tensor input,
const Tensor indices,
Tensor output) {
const int num_output_planes = input.size(0);
const int num_input_planes = input.size(1);
const int num_orientations = input.size(2);
const int kH = input.size(3);
const int kW = input.size(4);
const int num_rotations = indices.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "active_rotated_filter_forward_cpu_kernel", [&] {
active_rotated_filter_forward_cpu_kernel<scalar_t>(
input.data_ptr<scalar_t>(), indices.data_ptr<int>(),
num_output_planes, num_input_planes, num_orientations, kH, kW,
num_rotations, output.data_ptr<scalar_t>());
});
}
void ActiveRotatedFilterBackwardCPULauncher(const Tensor grad_out,
const Tensor indices,
Tensor grad_in) {
const int num_orientations = indices.size(0);
const int kH = indices.size(1);
const int kW = indices.size(2);
const int num_rotations = indices.size(3);
const int num_output_planes = grad_out.size(0) / num_rotations;
const int num_input_planes = grad_out.size(1) / num_orientations;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_out.scalar_type(), "active_rotated_filter_backward_cpu_kernel", [&] {
active_rotated_filter_backward_cpu_kernel<scalar_t>(
grad_out.data_ptr<scalar_t>(), indices.data_ptr<int>(),
num_output_planes, num_input_planes, num_orientations, kH, kW,
num_rotations, grad_in.data_ptr<scalar_t>());
});
}
void active_rotated_filter_forward_cpu(const Tensor input, const Tensor indices,
Tensor output) {
ActiveRotatedFilterForwardCPULauncher(input, indices, output);
}
void active_rotated_filter_backward_cpu(const Tensor grad_out,
const Tensor indices, Tensor grad_in) {
ActiveRotatedFilterBackwardCPULauncher(grad_out, indices, grad_in);
}
void active_rotated_filter_forward_impl(const Tensor input,
const Tensor indices, Tensor output);
void active_rotated_filter_backward_impl(const Tensor grad_out,
const Tensor indices, Tensor grad_in);
REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CPU,
active_rotated_filter_forward_cpu);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CPU,
active_rotated_filter_backward_cpu);
......@@ -59,7 +59,7 @@ Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold) {
auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
......
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/WenmuZhou/PAN.pytorch
#include <queue>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
......@@ -39,7 +41,7 @@ std::vector<std::vector<float>> pixel_group_cpu(
Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
assert(score.dim() == 2);
assert(mask.dim() == 2);
assert(embedding_dim.dim() == 3);
assert(embedding.dim() == 3);
int height = score.size(0);
int width = score.size(1);
assert(height == mask.size(0) == embedding.size(1) == kernel_label.size(1));
......@@ -103,7 +105,7 @@ std::vector<std::vector<float>> pixel_group_cpu(
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
for (size_t i = 0; i < embedding_dim; i++) {
for (size_t i = 0; i < size_t(embedding_dim); i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
......
......@@ -395,7 +395,6 @@ void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
......@@ -431,8 +430,6 @@ void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
......@@ -442,15 +439,15 @@ void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sample_ratio, bool aligned,
int sampling_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU,
roi_align_rotated_forward_cpu);
......
// modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
T bilinear_interpolate(const T* input, const int height, const int width, T y,
T x, const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
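// fma(a, b, c) evaluates a * b + c, so each line below is a linear
// interpolation: v_low and v_high blend horizontally with weight lx, and the
// result blends the two vertically with weight ly.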
const T v_low = fma(v2 - v1, lx, v1);
const T v_high = fma(v4 - v3, lx, v3);
const T val = fma(v_high - v_low, ly, v_low);
return val;
}
template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
const int nthreads, const int points, const scalar_t* bottom_data,
const scalar_t* best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width, scalar_t* top_data) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t* bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
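// Sampling points: px[0]/py[0] is the (spatial_scale-scaled) box centre; when
// points > 1 the remaining four entries are the corners of the rotated box,
// obtained from the half-width/half-height offsets (wx, wy, hx, hy) rotated
// by the box angle roi_a.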
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
const scalar_t* offset_bottom_data =
bottom_data + (n * channels + c) * height * width;
scalar_t output_val = bottom_data[index];
for (int i = 0; i < points; i++) {
output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
width, py[i], px[i], i);
}
top_data[index] = output_val;
}
}
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
T& w1, T& w2, T& w3, T& w4, int& x_low,
int& x_high, int& y_low, int& y_high,
const int index) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
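// Serial CPU stand-in for CUDA's atomicAdd: the backward kernel below runs
// single-threaded, so a plain read-modify-write is sufficient.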
template <typename scalar_t>
inline void valueAdd(scalar_t* address, scalar_t val) {
scalar_t old = *address;
*address = (old + val);
}
template <typename scalar_t>
void rotated_feature_align_backward_cpu_kernel(
const int nthreads, const int points, const scalar_t* top_diff,
const scalar_t* best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width,
scalar_t* bottom_diff) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t* bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
scalar_t* offset_bottom_diff =
bottom_diff + (n * channels + c) * height * width;
scalar_t value_top_diff = top_diff[index];
valueAdd(bottom_diff + index, value_top_diff);
for (int i = 0; i < points; i++) {
scalar_t w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
w2, w3, w4, x_low, x_high, y_low,
y_high, i);
scalar_t g1 = value_top_diff * w1;
scalar_t g2 = value_top_diff * w2;
scalar_t g3 = value_top_diff * w3;
scalar_t g4 = value_top_diff * w4;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
valueAdd(offset_bottom_diff + y_low * width + x_low, g1);
valueAdd(offset_bottom_diff + y_low * width + x_high, g2);
valueAdd(offset_bottom_diff + y_high * width + x_low, g3);
valueAdd(offset_bottom_diff + y_high * width + x_high, g4);
}
}
}
}
void rotated_feature_align_forward_cpu(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output) {
const int output_size = features.numel();
AT_DISPATCH_FLOATING_TYPES(
features.scalar_type(), "rotated_feature_align_forward_cpu_kernel", [&] {
const scalar_t* bottom_data = features.data_ptr<scalar_t>();
const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
scalar_t* top_data = output.data_ptr<scalar_t>();
rotated_feature_align_forward_cpu_kernel<scalar_t>(
output_size, points, bottom_data, bboxes_data,
scalar_t(spatial_scale), features.size(1), features.size(2),
features.size(3), top_data);
});
}
void rotated_feature_align_backward_cpu(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad) {
const int output_size = top_grad.numel();
AT_DISPATCH_FLOATING_TYPES(
top_grad.scalar_type(), "rotated_feature_align_backward_cpu_kernel", [&] {
const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();
const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();
rotated_feature_align_backward_cpu_kernel<scalar_t>(
output_size, points, top_diff, bboxes_data, scalar_t(spatial_scale),
top_grad.size(1), top_grad.size(2), top_grad.size(3), bottom_diff);
});
}
void rotated_feature_align_forward_impl(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output);
void rotated_feature_align_backward_impl(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad);
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CPU,
rotated_feature_align_forward_cpu);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CPU,
rotated_feature_align_backward_cpu);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/spconv/indice.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
else
return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
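// Each of the `size` index pairs maps an input row (indicesIn) to an output
// row (indicesOut); for every pair the loop takes an elementwise max over the
// `stride` feature channels, accumulating into outFeatures.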
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto foutData = fout.data();
auto finData = fin.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
finData[idxi + plane] += foutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/reordering.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
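// Gather: copy the rows of `features` selected by `indices` into the dense,
// contiguous `buffer`, one memcpy of numPlanes values per selected row.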
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(scalar_t) * numPlanes);
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
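// Scatter-add: inverse of the gather above; row i of `buffer` is added into
// the output row given by indices[i].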
int numPlanes = outFeatures.dim(1);
const scalar_t* buf = buffer.data();
scalar_t* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, scalar_t, Index>;
#define DECLARE_CPU_SPECS(scalar_t) \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, int); \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
......@@ -26,13 +26,22 @@ void dynamic_voxelize_forward_cpu_kernel(
coor[ndim_minus_1 - j] = c;
}
if (failed)
memset(&coors[i][0], -1, NDim * sizeof(T_int));
else
memcpy(&coors[i][0], &coor[0], NDim * sizeof(T_int));
// memcpy and memset can misbehave here because the memory behind a
// TensorAccessor is not guaranteed to be contiguous, so plain loops are
// used instead of memcpy/memset
if (failed) {
for (int k = 0; k < NDim; ++k) {
coors[i][k] = -1;
}
} else {
for (int k = 0; k < NDim; ++k) {
coors[i][k] = coor[k];
}
}
}
delete[] coor;
return;
}
template <typename T, typename T_int>
......@@ -72,14 +81,21 @@ void hard_voxelize_forward_cpu_kernel(
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
memcpy(&coors[voxelidx][0], &coor[i][0], NDim * sizeof(T_int));
// memcpy can misbehave here because the memory behind a TensorAccessor is
// not guaranteed to be contiguous, so a plain loop is used instead of memcpy
for (int k = 0; k < NDim; ++k) {
coors[voxelidx][k] = coor[i][k];
}
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
if (max_points == -1 || num < max_points) {
memcpy(&voxels[voxelidx][num][0], &points[i][0],
num_features * sizeof(T));
// memcpy can misbehave here because the memory behind a TensorAccessor is
// not guaranteed to be contiguous, so a plain loop is used instead of memcpy
for (int k = 0; k < num_features; ++k) {
voxels[voxelidx][num][k] = points[i][k];
}
num_points_per_voxel[voxelidx] += 1;
}
}
......
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#include "active_rotated_filter_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
const Tensor indices,
Tensor output) {
int num_output_planes = input.size(0);
int num_input_planes = input.size(1);
int num_orientations = input.size(2);
int kH = input.size(3);
int kW = input.size(4);
int num_rotations = indices.size(3);
int nEntry = num_orientations * kH * kW;
int output_size = input.numel();
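// One CUDA thread per input weight element (output_size = input.numel()); as
// in the CPU kernel shown earlier in this diff, each element is written into
// all num_rotations rotated copies of the output, so the grid is sized from
// the input rather than the output.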
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "active_rotated_filter_forward_cuda_kernel", [&] {
active_rotated_filter_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<scalar_t>(),
indices.data_ptr<int>(), num_input_planes, num_output_planes,
num_orientations, num_rotations, nEntry,
output.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
const Tensor indices,
Tensor grad_in) {
int num_orientations = indices.size(0);
int kH = indices.size(1);
int kW = indices.size(2);
int num_rotations = indices.size(3);
int num_output_planes = grad_out.size(0) / num_rotations;
int num_input_planes = grad_out.size(1) / num_orientations;
int nEntry = num_orientations * kH * kW;
int output_size = grad_in.numel();
at::cuda::CUDAGuard device_guard(indices.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_out.scalar_type(), "active_rotated_filter_backward_cuda_kernel",
[&] {
active_rotated_filter_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_out.data_ptr<scalar_t>(),
indices.data_ptr<int>(), num_input_planes, num_output_planes,
num_orientations, num_rotations, nEntry,
grad_in.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
......@@ -13,7 +13,7 @@ void AssignScoreWithKForwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(DIVUP(B * O * N1 * K, THREADS_PER_BLOCK));
dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......@@ -36,9 +36,9 @@ void AssignScoreWithKBackwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(grad_out.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(DIVUP(B * M * O, THREADS_PER_BLOCK));
dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));
dim3 threads1(THREADS_PER_BLOCK);
dim3 blocks2(DIVUP(B * N1 * K * M, THREADS_PER_BLOCK));
dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));
dim3 threads2(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
......@@ -22,7 +22,7 @@ void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
......@@ -2,6 +2,22 @@
#include "bbox_overlaps_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
// Disable fp16 on ROCm device
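// On CUDA devices with compute capability >= 5.3, this specialization routes
// at::Half inputs to a dedicated __half kernel
// (bbox_overlaps_cuda_kernel_half); when HIP_DIFF is defined (ROCm builds) it
// is compiled out, per the note above.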
#ifndef HIP_DIFF
#if __CUDA_ARCH__ >= 530
template <>
__global__ void bbox_overlaps_cuda_kernel<at::Half>(
const at::Half* bbox1, const at::Half* bbox2, at::Half* ious,
const int num_bbox1, const int num_bbox2, const int mode,
const bool aligned, const int offset) {
bbox_overlaps_cuda_kernel_half(reinterpret_cast<const __half*>(bbox1),
reinterpret_cast<const __half*>(bbox2),
reinterpret_cast<__half*>(ious), num_bbox1,
num_bbox2, mode, aligned, offset);
}
#endif // __CUDA_ARCH__ >= 530
#endif // HIP_DIFF
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset) {
......
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp
#include "chamfer_distance_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void ChamferDistanceForwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
const Tensor dist2, const Tensor idx1, const Tensor idx2) {
int batch_size = xyz1.size(0);
int n = xyz1.size(1);
int m = xyz2.size(1);
at::cuda::CUDAGuard device_guard(xyz1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
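// Note: the two launches below are symmetric. Judging by the arguments, the
// first fills dist1/idx1 with the nearest-neighbour distance and index in
// xyz2 for every point of xyz1, and the second fills dist2/idx2 for the
// opposite direction.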
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
chamfer_distance_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK, 0, stream>>>(
batch_size, n, xyz1.data_ptr<scalar_t>(), m,
xyz2.data_ptr<scalar_t>(), dist1.data_ptr<scalar_t>(),
idx1.data_ptr<int>());
});
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
chamfer_distance_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK, 0, stream>>>(
batch_size, m, xyz2.data_ptr<scalar_t>(), n,
xyz1.data_ptr<scalar_t>(), dist2.data_ptr<scalar_t>(),
idx2.data_ptr<int>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
void ChamferDistanceBackwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, Tensor grad_xyz1, Tensor grad_xyz2,
Tensor grad_dist1, Tensor grad_dist2, Tensor idx1, Tensor idx2) {
int batch_size = xyz1.size(0);
int n = xyz1.size(1);
int m = xyz2.size(1);
at::cuda::CUDAGuard device_guard(xyz1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
chamfer_distance_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK / 2, 0, stream>>>(
batch_size, m, xyz1.data_ptr<scalar_t>(), n,
xyz2.data_ptr<scalar_t>(), grad_dist1.data_ptr<scalar_t>(),
idx1.data_ptr<int>(), grad_xyz1.data_ptr<scalar_t>(),
grad_xyz2.data_ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
chamfer_distance_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK / 2, 0, stream>>>(
batch_size, n, xyz2.data_ptr<scalar_t>(), m,
xyz1.data_ptr<scalar_t>(), grad_dist2.data_ptr<scalar_t>(),
idx2.data_ptr<int>(), grad_xyz2.data_ptr<scalar_t>(),
grad_xyz1.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/iou/src/convex_iou_kernel.cu
#include "convex_iou_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor ious) {
int output_size = ious.numel();
int num_pointsets = pointsets.size(0);
int num_polygons = polygons.size(0);
at::cuda::CUDAGuard device_guard(pointsets.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
pointsets.scalar_type(), "convex_iou_cuda_kernel", ([&] {
convex_iou_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(
num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),
polygons.data_ptr<scalar_t>(), ious.data_ptr<scalar_t>());
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor output) {
int output_size = output.numel();
int num_pointsets = pointsets.size(0);
int num_polygons = polygons.size(0);
at::cuda::CUDAGuard device_guard(pointsets.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
pointsets.scalar_type(), "convex_giou_cuda_kernel", ([&] {
convex_giou_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(
num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),
polygons.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
}));
AT_CUDA_CHECK(cudaGetLastError());
}
......@@ -24,8 +24,8 @@ void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,
auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
const int threads = THREADS_FORWARD;
const dim3 blocks(batch_size, oH, oW);
const dim3 threads(WARP_SIZE, 4, 4);
const dim3 blocks(batch_size, (oH + 3) >> 2, (oW + 3) >> 2);
at::cuda::CUDAGuard device_guard(input1.device());
......@@ -56,17 +56,20 @@ void CorrelationBackwardCUDAKernelLauncher(
const int iW = input1.size(3);
const int C = input1.size(1);
const dim3 blocks(C, iH, iW);
const dim3 threads(THREADS_BACKWARD, THREADS_BACKWARD);
auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
const dim3 blocks(batch_size, iH, iW);
const dim3 threads(THREADS_PER_BLOCK);
at::cuda::CUDAGuard device_guard(input1.device());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input1.scalar_type(), "correlation_backward_cuda", ([&] {
const int grad_cache_size = patchH * patchW * sizeof(scalar_t);
TensorAcc4R input1_acc =
input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
TensorAcc4R input2_acc =
input2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
TensorAcc4R grad_input1_acc =
grad_input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
TensorAcc4R grad_input2_acc =
......@@ -74,20 +77,18 @@ void CorrelationBackwardCUDAKernelLauncher(
TensorAcc5R grad_output_acc =
grad_output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();
for (int n = 0; n < batch_size; ++n) {
correlation_backward_cuda_kernel_input1<scalar_t>
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
patchW, padH, padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW, n);
}
correlation_backward_cuda_kernel_input1<scalar_t>
<<<blocks, threads, grad_cache_size,
at::cuda::getCurrentCUDAStream()>>>(
grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
patchW, padH, padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
for (int n = 0; n < batch_size; ++n) {
correlation_backward_cuda_kernel_input2<scalar_t>
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
patchW, padH, padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW, n);
}
correlation_backward_cuda_kernel_input2<scalar_t>
<<<blocks, threads, grad_cache_size,
at::cuda::getCurrentCUDAStream()>>>(
grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
patchW, padH, padW, dilationH, dilationW, dilation_patchH,
dilation_patchW, dH, dW);
}));
}
......@@ -570,20 +570,15 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_b,
Tensor ans_overlap);
void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_a,
const int num_b,
const Tensor boxes_b,
Tensor ans_iou);
void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long* mask,
int boxes_num,
float nms_overlap_thresh);
void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long* mask, int boxes_num,
float nms_overlap_thresh);
void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long* mask,
int boxes_num,
float nms_overlap_thresh);
void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
unsigned long long* mask,
int boxes_num,
float nms_overlap_thresh);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
......@@ -592,45 +587,35 @@ void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
ans_overlap);
};
void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou) {
IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
ans_iou);
};
void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long* mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
void iou3d_nms3d_forward_cuda(const Tensor boxes, unsigned long long* mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMS3DForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long* mask,
int boxes_num, float nms_overlap_thresh) {
IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
void iou3d_nms3d_normal_forward_cuda(const Tensor boxes,
unsigned long long* mask, int boxes_num,
float nms_overlap_thresh) {
IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
nms_overlap_thresh);
};
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_overlap);
void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_iou);
void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long* mask,
int boxes_num, float nms_overlap_thresh);
void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long* mask,
int boxes_num, float nms_overlap_thresh);
void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long* mask,
int boxes_num, float nms_overlap_thresh);
void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
unsigned long long* mask, int boxes_num,
float nms_overlap_thresh);
REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,
iou3d_boxes_overlap_bev_forward_cuda);
REGISTER_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, CUDA,
iou3d_boxes_iou_bev_forward_cuda);
REGISTER_DEVICE_IMPL(iou3d_nms_forward_impl, CUDA, iou3d_nms_forward_cuda);
REGISTER_DEVICE_IMPL(iou3d_nms_normal_forward_impl, CUDA,
iou3d_nms_normal_forward_cuda);
REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, CUDA, iou3d_nms3d_forward_cuda);
REGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, CUDA,
iou3d_nms3d_normal_forward_cuda);
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
......@@ -924,20 +909,20 @@ REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda);
void ROIAlignRotatedForwardCUDAKernelLauncher(
const at::Tensor features, const at::Tensor rois, const float spatial_scale,
const int sample_num, const bool aligned, const bool clockwise,
const at::Tensor input, const at::Tensor rois, const float spatial_scale,
const int sampling_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor output);
void ROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
const int sample_num, const bool aligned, const bool clockwise,
const int sampling_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor bottom_grad);
void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
void roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
......@@ -947,11 +932,11 @@ void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
AT_ERROR("wrong roi size");
}
int num_channels = features.size(1);
int data_height = features.size(2);
int data_width = features.size(3);
int num_channels = input.size(1);
int data_height = input.size(2);
int data_width = input.size(3);
ROIAlignRotatedForwardCUDAKernelLauncher(
features, rois, spatial_scale, sample_ratio, aligned, clockwise,
input, rois, spatial_scale, sampling_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, output);
}
......@@ -959,7 +944,7 @@ void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sample_ratio, bool aligned,
int sampling_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
......@@ -972,26 +957,101 @@ void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
ROIAlignRotatedBackwardCUDAKernelLauncher(
top_grad, rois, spatial_scale, sample_ratio, aligned, clockwise,
top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, bottom_grad);
}
void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sample_ratio,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sample_ratio, bool aligned,
int sampling_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA,
roi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA,
roi_align_rotated_backward_cuda);
void RiROIAlignRotatedForwardCUDAKernelLauncher(
const at::Tensor features, const at::Tensor rois, const float spatial_scale,
const int num_samples, const bool clockwise, const int channels,
const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const int num_orientations,
at::Tensor output);
void RiROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
const int num_samples, const bool clockwise, const int channels,
const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const int num_orientations,
at::Tensor bottom_grad);
void riroi_align_rotated_forward_cuda(Tensor features, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(rois);
int num_channels = features.size(1) / num_orientations;
int data_height = features.size(2);
int data_width = features.size(3);
RiROIAlignRotatedForwardCUDAKernelLauncher(
features, rois, spatial_scale, num_samples, clockwise, num_channels,
data_height, data_width, num_rois, pooled_height, pooled_width,
num_orientations, output);
}
void riroi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
CHECK_CONTIGUOUS(top_grad);
CHECK_CONTIGUOUS(rois);
int num_channels = bottom_grad.size(1) / num_orientations;
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
RiROIAlignRotatedBackwardCUDAKernelLauncher(
top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,
data_height, data_width, num_rois, pooled_height, pooled_width,
num_orientations, bottom_grad);
}
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise);
void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise);
REGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, CUDA,
riroi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, CUDA,
riroi_align_rotated_backward_cuda);
void RoiawarePool3dForwardCUDAKernelLauncher(
int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
int out_y, int out_z, const Tensor rois, const Tensor pts,
......@@ -1321,6 +1381,12 @@ int HardVoxelizeForwardCUDAKernelLauncher(
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3);
int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3);
void DynamicVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size, const std::vector<float> coors_range,
......@@ -1338,6 +1404,16 @@ int hard_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& voxels,
max_points, max_voxels, NDim);
};
int nondeterministic_hard_voxelize_forward_cuda(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim) {
return NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
max_points, max_voxels, NDim);
};
void dynamic_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
......@@ -1354,11 +1430,361 @@ int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,
const int max_points, const int max_voxels,
const int NDim);
int nondeterministic_hard_voxelize_forward_impl(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim);
void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim);
REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,
hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, CUDA,
nondeterministic_hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,
dynamic_voxelize_forward_cuda);
void RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points,
Tensor output);
void RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points,
Tensor bottom_grad);
void rotated_feature_align_forward_cuda(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output) {
RotatedFeatureAlignForwardCUDAKernelLauncher(features, best_bboxes,
spatial_scale, points, output);
};
void rotated_feature_align_backward_cuda(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad) {
RotatedFeatureAlignBackwardCUDAKernelLauncher(
top_grad, best_bboxes, spatial_scale, points, bottom_grad);
};
void rotated_feature_align_forward_impl(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output);
void rotated_feature_align_backward_impl(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad);
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CUDA,
rotated_feature_align_forward_cuda);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CUDA,
rotated_feature_align_backward_cuda);
void PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,
const at::Tensor polygons,
const int rows, const int cols,
at::Tensor output);
void points_in_polygons_forward_cuda(const Tensor points, const Tensor polygons,
Tensor output, const int rows,
const int cols) {
PointsInPolygonsForwardCUDAKernelLauncher(points, polygons, rows, cols,
output);
};
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
Tensor output, const int rows,
const int cols);
REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,
points_in_polygons_forward_cuda);
torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
torch::Tensor indice_maxpool_forward_cuda(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
return IndiceMaxpoolForwardCUDAKernelLauncher(features, indicePairs,
indiceNum, numAct);
};
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
indice_maxpool_forward_cuda);
torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
torch::Tensor indice_maxpool_backward_cuda(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return IndiceMaxpoolBackwardCUDAKernelLauncher(features, outFeatures, outGrad,
indicePairs, indiceNum);
};
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
REGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, CUDA,
                     indice_maxpool_backward_cuda);
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM);
torch::Tensor indice_conv_forward_cuda(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
return IndiceConvForwardCUDAKernelLauncher(
features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
};
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_forward_impl, CUDA, indice_conv_forward_cuda);
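// The sparse-convolution entry points follow the spconv convention: for every
// kernel offset, indicePairs stores the gather/scatter index pairs produced by
// rule generation and indiceNum stores how many of those pairs are valid;
// _subM and _inverse are integer flags selecting submanifold and inverse
// (transposed) convolution, respectively.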
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
std::vector<torch::Tensor> indice_conv_backward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return IndiceConvBackwardCUDAKernelLauncher(
features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
};
std::vector<torch::Tensor> indice_conv_backward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_backward_impl, CUDA,
indice_conv_backward_cuda);
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
torch::Tensor fused_indice_conv_batchnorm_forward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return FusedIndiceConvBatchnormCUDAKernelLauncher(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
};
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
REGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, CUDA,
                     fused_indice_conv_batchnorm_forward_cuda);
void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);
void min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {
MinAreaPolygonsCUDAKernelLauncher(pointsets, polygons);
}
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons);
REGISTER_DEVICE_IMPL(min_area_polygons_impl, CUDA, min_area_polygons_cuda);
void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
const Tensor indices,
Tensor output);
void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
const Tensor indices,
Tensor grad_in);
void active_rotated_filter_forward_cuda(const Tensor input,
const Tensor indices, Tensor output) {
ActiveRotatedFilterForwardCUDAKernelLauncher(input, indices, output);
};
void active_rotated_filter_backward_cuda(const Tensor grad_out,
const Tensor indices, Tensor grad_in) {
ActiveRotatedFilterBackwardCUDAKernelLauncher(grad_out, indices, grad_in);
};
void active_rotated_filter_forward_impl(const Tensor input,
const Tensor indices, Tensor output);
void active_rotated_filter_backward_impl(const Tensor grad_out,
const Tensor indices, Tensor grad_in);
REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CUDA,
active_rotated_filter_forward_cuda);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CUDA,
active_rotated_filter_backward_cuda);
void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor ious);
void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor output);
void convex_iou_cuda(const Tensor pointsets, const Tensor polygons,
Tensor ious) {
ConvexIoUCUDAKernelLauncher(pointsets, polygons, ious);
}
void convex_giou_cuda(const Tensor pointsets, const Tensor polygons,
Tensor output) {
ConvexGIoUCUDAKernelLauncher(pointsets, polygons, output);
}
void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
Tensor ious);
void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
Tensor output);
REGISTER_DEVICE_IMPL(convex_iou_impl, CUDA, convex_iou_cuda);
REGISTER_DEVICE_IMPL(convex_giou_impl, CUDA, convex_giou_cuda);
Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(Tensor vertices,
Tensor mask,
Tensor num_valid);
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,
Tensor num_valid) {
return DiffIoURotatedSortVerticesCUDAKernelLauncher(vertices, mask,
num_valid);
}
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid);
REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, CUDA,
diff_iou_rotated_sort_vertices_forward_cuda);
void ChamferDistanceForwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
const Tensor dist2, const Tensor idx1, const Tensor idx2);
void ChamferDistanceBackwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, Tensor grad_xyz1, Tensor grad_xyz2,
Tensor grad_dist1, Tensor grad_dist2, Tensor idx1, Tensor idx2);
void chamfer_distance_forward_cuda(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx2) {
ChamferDistanceForwardCUDAKernelLauncher(xyz1, xyz2, dist1, dist2, idx1,
idx2);
};
void chamfer_distance_backward_cuda(const Tensor xyz1, const Tensor xyz2,
Tensor gradxyz1, Tensor gradxyz2,
Tensor graddist1, Tensor graddist2,
Tensor idx1, Tensor idx2) {
ChamferDistanceBackwardCUDAKernelLauncher(xyz1, xyz2, gradxyz1, gradxyz2,
graddist1, graddist2, idx1, idx2);
};
void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx2);
void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
Tensor gradxyz1, Tensor gradxyz2,
Tensor graddist1, Tensor graddist2,
Tensor idx1, Tensor idx2);
REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,
chamfer_distance_forward_cuda);
REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,
chamfer_distance_backward_cuda);
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolCoorBackwardCUDAKernelLauncher(
Tensor output, Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);
void prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolForwardCUDAKernelLauncher(input, rois, output, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
PrROIPoolBackwardCUDAKernelLauncher(grad_output, rois, grad_input,
pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolCoorBackwardCUDAKernelLauncher(output, grad_output, input, rois,
grad_rois, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale);
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale);
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
prroi_pool_coor_backward_cuda);
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa
#include "diff_iou_rotated_cuda_kernel.cuh"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_cuda_helper.hpp"
at::Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(at::Tensor vertices,
at::Tensor mask,
at::Tensor num_valid) {
at::cuda::CUDAGuard device_guard(vertices.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
CHECK_CONTIGUOUS(vertices);
CHECK_CONTIGUOUS(mask);
CHECK_CONTIGUOUS(num_valid);
CHECK_CUDA(vertices);
CHECK_CUDA(mask);
CHECK_CUDA(num_valid);
int b = vertices.size(0);
int n = vertices.size(1);
int m = vertices.size(2);
at::Tensor idx =
torch::zeros({b, n, MAX_NUM_VERT_IDX},
at::device(vertices.device()).dtype(at::ScalarType::Int));
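  // idx holds, for each of the b*n box pairs, the sorted vertex order padded
  // to MAX_NUM_VERT_IDX. The launch below uses one block per batch element and
  // opt_n_thread(n) threads to cover the n pairs in that batch.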
diff_iou_rotated_sort_vertices_forward_cuda_kernel<<<b, opt_n_thread(n), 0,
stream>>>(
b, n, m, vertices.data_ptr<float>(), mask.data_ptr<bool>(),
num_valid.data_ptr<int>(), idx.data_ptr<int>());
AT_CUDA_CHECK(cudaGetLastError());
return idx;
}
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
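  // The "fused batchnorm" here amounts to initializing the output with the
  // bias, broadcast over the numActOut rows; the per-offset GEMM results are
  // scatter-added on top of it. Any batchnorm scale is assumed to have been
  // folded into filters/bias by the caller.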
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
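  // inputBuffer / outputBuffer are reusable staging buffers sized for the
  // worst case, indicePairMaxSize, i.e. the largest per-offset count of
  // active index pairs found above.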
filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {  // the center offset of a subM conv does not need the gather /
               // scatter-add path; it is computed with a direct matrix multiply.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
  // Timing accumulators kept in the code but currently unused.
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
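    // Offsets with no active pairs are skipped, as is the subM center offset,
    // which was already handled by the direct mm_out above. Each remaining
    // offset is processed as gather -> GEMM -> scatter-add.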
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "FusedIndiceConvBatchnormKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
            /* Alternative implementation, slower than SparseGatherFunctor
               (possibly due to the int->long conversion):
            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
            auto indicePairBlob = torch::from_blob(
                indicePairLong.data_ptr<long>(), {nHot}, indicePairOptions);
            torch::index_select_out(inputBufferBlob, features, 0,
                                    indicePairBlob); */
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
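// Rough functional sketch of what this launcher computes (ignoring the
// staging buffers and the CPU fallback), with names mirroring the arguments:
//   output = indice_conv(features, filters, indicePairs, indiceNum,
//                        numActOut, _inverse, _subM) + bias
// i.e. a plain sparse indice convolution followed by a per-channel bias add,
// with batchnorm statistics assumed to be pre-folded by the caller.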