Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "pytorch_cpp_helper.hpp"
Tensor bottom_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, ind, height);
Tensor cur_temp = at::slice(output, 2, ind, height).clone();
Tensor next_temp = at::slice(output, 2, 0, height - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
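// Backward pass: instead of replaying the doubling scan, walk the pooled
// dimension once while tracking the running maximum (max_val) and its index
// (max_ind); each slice's incoming gradient is then scatter_add-ed to the
// position that produced the forward maximum. un_max_ind is a view of
// max_ind, so the masked_fill_ updates inside the loop are visible to
// scatter_add_ without re-unsqueezing. Note the temporaries are hard-coded to
// CUDA float/long buffers. The left/right/top backward functions below follow
// the same pattern along their respective directions.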
Tensor bottom_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(2, 0);
auto grad_output_temp = grad_output.select(2, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < height - 1; ++ind) {
input_temp = input.select(2, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
Tensor left_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, 0, width - ind);
Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone();
Tensor next_temp = at::slice(output, 3, ind, width).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor left_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, width - 1);
max_val.copy_(input_temp);
max_ind.fill_(width - 1);
auto output_temp = output.select(3, width - 1);
auto grad_output_temp = grad_output.select(3, width - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < width; ++ind) {
input_temp = input.select(3, width - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, width - ind - 1);
grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor right_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get width
int64_t width = input.size(3);
output.copy_(input);
for (int64_t ind = 1; ind < width; ind <<= 1) {
Tensor max_temp = at::slice(output, 3, ind, width);
Tensor cur_temp = at::slice(output, 3, ind, width).clone();
Tensor next_temp = at::slice(output, 3, 0, width - ind).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor right_pool_backward(Tensor input, Tensor grad_output) {
Tensor output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(3, 0);
max_val.copy_(input_temp);
max_ind.fill_(0);
auto output_temp = output.select(3, 0);
auto grad_output_temp = grad_output.select(3, 0);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(3);
auto gt_mask = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, height},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 0; ind < width - 1; ++ind) {
input_temp = input.select(3, ind + 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, ind + 1);
grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3);
output.scatter_add_(3, un_max_ind, grad_output_temp);
}
return output;
}
Tensor top_pool_forward(Tensor input) {
// Initialize output
Tensor output = at::zeros_like(input);
// Get height
int64_t height = input.size(2);
output.copy_(input);
for (int64_t ind = 1; ind < height; ind <<= 1) {
Tensor max_temp = at::slice(output, 2, 0, height - ind);
Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone();
Tensor next_temp = at::slice(output, 2, ind, height).clone();
at::max_out(max_temp, cur_temp, next_temp);
}
return output;
}
Tensor top_pool_backward(Tensor input, Tensor grad_output) {
auto output = at::zeros_like(input);
int32_t batch = input.size(0);
int32_t channel = input.size(1);
int32_t height = input.size(2);
int32_t width = input.size(3);
auto max_val = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
auto max_ind = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kLong));
auto input_temp = input.select(2, height - 1);
max_val.copy_(input_temp);
max_ind.fill_(height - 1);
auto output_temp = output.select(2, height - 1);
auto grad_output_temp = grad_output.select(2, height - 1);
output_temp.copy_(grad_output_temp);
auto un_max_ind = max_ind.unsqueeze(2);
auto gt_mask = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kBool));
auto max_temp = torch::zeros({batch, channel, width},
at::device(at::kCUDA).dtype(at::kFloat));
for (int32_t ind = 1; ind < height; ++ind) {
input_temp = input.select(2, height - ind - 1);
at::gt_out(gt_mask, input_temp, max_val);
at::masked_select_out(max_temp, input_temp, gt_mask);
max_val.masked_scatter_(gt_mask, max_temp);
max_ind.masked_fill_(gt_mask, height - ind - 1);
grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2);
output.scatter_add_(2, un_max_ind, grad_output_temp);
}
return output;
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cpu/ActiveRotatingFilter_cpu.cpp
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
void active_rotated_filter_forward_cpu_kernel(
const T* weightData, const int* indicesData, const int num_output_planes,
const int num_input_planes, const int num_orientations, const int kH,
const int kW, const int num_rotations, T* outputData) {
const int nEntry = num_orientations * kH * kW;
int i, j, l;
int k;
#pragma omp parallel for private(i, j, l, k)
for (i = 0; i < num_output_planes; i++) {
for (j = 0; j < num_input_planes; j++) {
for (l = 0; l < nEntry; l++) {
int weightIndex = i * num_input_planes * nEntry + j * nEntry + l;
T val = *(weightData + weightIndex);
for (k = 0; k < num_rotations; k++) {
int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
T* target = outputData +
i * (num_rotations * num_input_planes * nEntry) +
k * (num_input_planes * nEntry) + j * (nEntry) + index;
*target = val;
}
}
}
}
}
template <typename T>
void active_rotated_filter_backward_cpu_kernel(
const T* gradOutputData, const int* indicesData,
const int num_output_planes, const int num_input_planes,
const int num_orientations, const int kH, const int kW,
const int num_rotations, T* gradInputData) {
const int nEntry = num_orientations * kH * kW;
int i, j, l;
int k;
#pragma omp parallel for private(i, j, l, k)
for (i = 0; i < num_output_planes; i++) {
for (j = 0; j < num_input_planes; j++) {
for (l = 0; l < nEntry; l++) {
int gradInputIndex = i * num_input_planes * nEntry + j * nEntry + l;
T* val = gradInputData + gradInputIndex;
*val = 0;
for (k = 0; k < num_rotations; k++) {
int index = (int)(*(indicesData + l * num_rotations + k)) - 1;
const T* target =
gradOutputData + i * (num_rotations * num_input_planes * nEntry) +
k * (num_input_planes * nEntry) + j * (nEntry) + index;
*val = *val + *target;
}
}
}
}
}
void ActiveRotatedFilterForwardCPULauncher(const Tensor input,
const Tensor indices,
Tensor output) {
const int num_output_planes = input.size(0);
const int num_input_planes = input.size(1);
const int num_orientations = input.size(2);
const int kH = input.size(3);
const int kW = input.size(4);
const int num_rotations = indices.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "active_rotated_filter_forward_cpu_kernel", [&] {
active_rotated_filter_forward_cpu_kernel<scalar_t>(
input.data_ptr<scalar_t>(), indices.data_ptr<int>(),
num_output_planes, num_input_planes, num_orientations, kH, kW,
num_rotations, output.data_ptr<scalar_t>());
});
}
void ActiveRotatedFilterBackwardCPULauncher(const Tensor grad_out,
const Tensor indices,
Tensor grad_in) {
const int num_orientations = indices.size(0);
const int kH = indices.size(1);
const int kW = indices.size(2);
const int num_rotations = indices.size(3);
const int num_output_planes = grad_out.size(0) / num_rotations;
const int num_input_planes = grad_out.size(1) / num_orientations;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_out.scalar_type(), "active_rotated_filter_backward_cpu_kernel", [&] {
active_rotated_filter_backward_cpu_kernel<scalar_t>(
grad_out.data_ptr<scalar_t>(), indices.data_ptr<int>(),
num_output_planes, num_input_planes, num_orientations, kH, kW,
num_rotations, grad_in.data_ptr<scalar_t>());
});
}
void active_rotated_filter_forward_cpu(const Tensor input, const Tensor indices,
Tensor output) {
ActiveRotatedFilterForwardCPULauncher(input, indices, output);
}
void active_rotated_filter_backward_cpu(const Tensor grad_out,
const Tensor indices, Tensor grad_in) {
ActiveRotatedFilterBackwardCPULauncher(grad_out, indices, grad_in);
}
void active_rotated_filter_forward_impl(const Tensor input,
const Tensor indices, Tensor output);
void active_rotated_filter_backward_impl(const Tensor grad_out,
const Tensor indices, Tensor grad_in);
REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CPU,
active_rotated_filter_forward_cpu);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CPU,
active_rotated_filter_backward_cpu);
...
@@ -59,7 +59,7 @@ Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold) {
auto result = at::empty({0}, dets.options());
-AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
+AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
...
// Copyright (c) OpenMMLab. All rights reserved
// It is modified from https://github.com/WenmuZhou/PAN.pytorch
+#include <queue>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
@@ -39,7 +41,7 @@ std::vector<std::vector<float>> pixel_group_cpu(
Tensor kernel_contour, int kernel_region_num, float dis_threshold) {
assert(score.dim() == 2);
assert(mask.dim() == 2);
-assert(embedding_dim.dim() == 3);
+assert(embedding.dim() == 3);
int height = score.size(0);
int width = score.size(1);
assert(height == mask.size(0) == embedding.size(1) == kernel_label.size(1));
@@ -103,7 +105,7 @@ std::vector<std::vector<float>> pixel_group_cpu(
float dis = 0;
auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim;
-for (size_t i = 0; i < embedding_dim; i++) {
+for (size_t i = 0; i < size_t(embedding_dim); i++) {
dis +=
pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2);
// ignore further computing if dis is big enough
...
@@ -395,7 +395,6 @@ void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
-int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
@@ -431,8 +430,6 @@ void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
-// Number of ROIs
-int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
@@ -442,15 +439,15 @@ void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
sampling_ratio, aligned, clockwise);
}
-void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
+void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
-float spatial_scale, int sample_ratio,
+float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
-int sample_ratio, bool aligned,
+int sampling_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU,
roi_align_rotated_forward_cpu);
...
// modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
template <typename T>
T bilinear_interpolate(const T* input, const int height, const int width, T y,
T x, const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
const T v_low = fma(v2 - v1, lx, v1);
const T v_high = fma(v4 - v3, lx, v3);
const T val = fma(v_high - v_low, ly, v_low);
return val;
}
template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
const int nthreads, const int points, const scalar_t* bottom_data,
const scalar_t* best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width, scalar_t* top_data) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t* bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
const scalar_t* offset_bottom_data =
bottom_data + (n * channels + c) * height * width;
scalar_t output_val = bottom_data[index];
for (int i = 0; i < points; i++) {
output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
width, py[i], px[i], i);
}
top_data[index] = output_val;
}
}
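// Gradient counterpart of bilinear_interpolate: returns the four corner
// weights (w1..w4) and the integer corner coordinates so the caller can splat
// a gradient value onto the four neighbouring input cells; the weights are
// zeroed and the indices set to -1 when the sample point falls outside the
// feature map.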
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
T& w1, T& w2, T& w3, T& w4, int& x_low,
int& x_high, int& y_low, int& y_high,
const int index) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
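// Plain (non-atomic) accumulation helper: the backward kernel below runs its
// element loop serially on the CPU, so a simple read-modify-write suffices.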
template <typename scalar_t>
inline void valueAdd(scalar_t* address, scalar_t val) {
scalar_t old = *address;
*address = (old + val);
}
template <typename scalar_t>
void rotated_feature_align_backward_cpu_kernel(
const int nthreads, const int points, const scalar_t* top_diff,
const scalar_t* best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width,
scalar_t* bottom_diff) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t* bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
scalar_t* offset_bottom_diff =
bottom_diff + (n * channels + c) * height * width;
scalar_t value_top_diff = top_diff[index];
valueAdd(bottom_diff + index, value_top_diff);
for (int i = 0; i < points; i++) {
scalar_t w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
w2, w3, w4, x_low, x_high, y_low,
y_high, i);
scalar_t g1 = value_top_diff * w1;
scalar_t g2 = value_top_diff * w2;
scalar_t g3 = value_top_diff * w3;
scalar_t g4 = value_top_diff * w4;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
valueAdd(offset_bottom_diff + y_low * width + x_low, g1);
valueAdd(offset_bottom_diff + y_low * width + x_high, g2);
valueAdd(offset_bottom_diff + y_high * width + x_low, g3);
valueAdd(offset_bottom_diff + y_high * width + x_high, g4);
}
}
}
}
void rotated_feature_align_forward_cpu(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output) {
const int output_size = features.numel();
AT_DISPATCH_FLOATING_TYPES(
features.scalar_type(), "rotated_feature_align_forward_cpu_kernel", [&] {
const scalar_t* bottom_data = features.data_ptr<scalar_t>();
const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
scalar_t* top_data = output.data_ptr<scalar_t>();
rotated_feature_align_forward_cpu_kernel<scalar_t>(
output_size, points, bottom_data, bboxes_data,
scalar_t(spatial_scale), features.size(1), features.size(2),
features.size(3), top_data);
});
}
void rotated_feature_align_backward_cpu(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad) {
const int output_size = top_grad.numel();
AT_DISPATCH_FLOATING_TYPES(
top_grad.scalar_type(), "rotated_feature_align_backward_cpu_kernel", [&] {
const scalar_t* top_diff = top_grad.data_ptr<scalar_t>();
const scalar_t* bboxes_data = best_bboxes.data_ptr<scalar_t>();
scalar_t* bottom_diff = bottom_grad.data_ptr<scalar_t>();
rotated_feature_align_backward_cpu_kernel<scalar_t>(
output_size, points, top_diff, bboxes_data, scalar_t(spatial_scale),
top_grad.size(1), top_grad.size(2), top_grad.size(3), bottom_diff);
});
}
void rotated_feature_align_forward_impl(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output);
void rotated_feature_align_backward_impl(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad);
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CPU,
rotated_feature_align_forward_cpu);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CPU,
rotated_feature_align_backward_cpu);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/spconv/indice.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
else
return getIndicePairsConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
kernelSize.data(), stride.data(), padding.data(), dilation.data(),
outSpatialShape.data());
}
};
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/maxpool.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
}
};
template <typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU &d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size) {
int stride = outFeatures.dim(1);
auto outFeaturesData = outFeatures.data();
auto inFeaturesData = inFeatures.data();
auto foutData = fout.data();
auto finData = fin.data();
auto indicesIn = indices.subview(0).data();
auto indicesOut = indices.subview(1).data();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesIn[row] * stride;
idxo = indicesOut[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
finData[idxi + plane] += foutData[idxo + plane];
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <utils/spconv/spconv/reordering.h>
#include "pytorch_cpp_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) {
std::memcpy(buffer.data() + i * numPlanes,
features.data() + indices[i] * numPlanes,
sizeof(scalar_t) * numPlanes);
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::CPU, scalar_t, Index> {
void operator()(const tv::CPU& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1);
const scalar_t* buf = buffer.data();
scalar_t* out = outFeatures.data();
for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j];
}
}
}
};
} // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, scalar_t, Index>;
#define DECLARE_CPU_SPECS(scalar_t) \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, int); \
DECLARE_CPU_SPECS_T_INDEX(scalar_t, long);
DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
...
@@ -26,13 +26,22 @@ void dynamic_voxelize_forward_cpu_kernel(
coor[ndim_minus_1 - j] = c;
}
-if (failed)
-memset(&coors[i][0], -1, NDim * sizeof(T_int));
-else
-memcpy(&coors[i][0], &coor[0], NDim * sizeof(T_int));
+// memcpy and memset will cause problem because of the memory distribution
+// discontinuity of TensorAccessor, so here using loops to replace memcpy
+// or memset
+if (failed) {
+for (int k = 0; k < NDim; ++k) {
+coors[i][k] = -1;
+}
+} else {
+for (int k = 0; k < NDim; ++k) {
+coors[i][k] = coor[k];
+}
+}
}
delete[] coor;
+return;
}
template <typename T, typename T_int>
@@ -72,14 +81,21 @@ void hard_voxelize_forward_cpu_kernel(
voxel_num += 1;
coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
-memcpy(&coors[voxelidx][0], &coor[i][0], NDim * sizeof(T_int));
+// memcpy will cause problem because of the memory distribution
+// discontinuity of TensorAccessor, so here using loops to replace memcpy
+for (int k = 0; k < NDim; ++k) {
+coors[voxelidx][k] = coor[i][k];
+}
}
// put points into voxel
num = num_points_per_voxel[voxelidx];
if (max_points == -1 || num < max_points) {
-memcpy(&voxels[voxelidx][num][0], &points[i][0],
-num_features * sizeof(T));
+// memcpy will cause problem because of the memory distribution
+// discontinuity of TensorAccessor, so here using loops to replace memcpy
+for (int k = 0; k < num_features; ++k) {
+voxels[voxelidx][num][k] = points[i][k];
+}
num_points_per_voxel[voxelidx] += 1;
}
}
...
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
#include "active_rotated_filter_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
const Tensor indices,
Tensor output) {
int num_output_planes = input.size(0);
int num_input_planes = input.size(1);
int num_orientations = input.size(2);
int kH = input.size(3);
int kW = input.size(4);
int num_rotations = indices.size(3);
int nEntry = num_orientations * kH * kW;
int output_size = input.numel();
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "active_rotated_filter_forward_cuda_kernel", [&] {
active_rotated_filter_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<scalar_t>(),
indices.data_ptr<int>(), num_input_planes, num_output_planes,
num_orientations, num_rotations, nEntry,
output.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
const Tensor indices,
Tensor grad_in) {
int num_orientations = indices.size(0);
int kH = indices.size(1);
int kW = indices.size(2);
int num_rotations = indices.size(3);
int num_output_planes = grad_out.size(0) / num_rotations;
int num_input_planes = grad_out.size(1) / num_orientations;
int nEntry = num_orientations * kH * kW;
int output_size = grad_in.numel();
at::cuda::CUDAGuard device_guard(indices.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_out.scalar_type(), "active_rotated_filter_backward_cuda_kernel",
[&] {
active_rotated_filter_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_out.data_ptr<scalar_t>(),
indices.data_ptr<int>(), num_input_planes, num_output_planes,
num_orientations, num_rotations, nEntry,
grad_in.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
...
@@ -13,7 +13,7 @@ void AssignScoreWithKForwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-dim3 blocks(DIVUP(B * O * N1 * K, THREADS_PER_BLOCK));
+dim3 blocks(GET_BLOCKS(B * O * N1 * K, THREADS_PER_BLOCK));
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -36,9 +36,9 @@ void AssignScoreWithKBackwardCUDAKernelLauncher(
at::cuda::CUDAGuard device_guard(grad_out.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-dim3 blocks1(DIVUP(B * M * O, THREADS_PER_BLOCK));
+dim3 blocks1(GET_BLOCKS(B * M * O, THREADS_PER_BLOCK));
dim3 threads1(THREADS_PER_BLOCK);
-dim3 blocks2(DIVUP(B * N1 * K * M, THREADS_PER_BLOCK));
+dim3 blocks2(GET_BLOCKS(B * N1 * K * M, THREADS_PER_BLOCK));
dim3 threads2(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...
@@ -22,7 +22,7 @@ void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
-dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
+dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
...
@@ -2,6 +2,22 @@
#include "bbox_overlaps_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
+// Disable fp16 on ROCm device
+#ifndef HIP_DIFF
+#if __CUDA_ARCH__ >= 530
+template <>
+__global__ void bbox_overlaps_cuda_kernel<at::Half>(
+const at::Half* bbox1, const at::Half* bbox2, at::Half* ious,
+const int num_bbox1, const int num_bbox2, const int mode,
+const bool aligned, const int offset) {
+bbox_overlaps_cuda_kernel_half(reinterpret_cast<const __half*>(bbox1),
+reinterpret_cast<const __half*>(bbox2),
+reinterpret_cast<__half*>(ious), num_bbox1,
+num_bbox2, mode, aligned, offset);
+}
+#endif // __CUDA_ARCH__ >= 530
+#endif // HIP_DIFF
void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int mode,
const bool aligned, const int offset) {
...
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cpp
#include "chamfer_distance_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void ChamferDistanceForwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
const Tensor dist2, const Tensor idx1, const Tensor idx2) {
int batch_size = xyz1.size(0);
int n = xyz1.size(1);
int m = xyz2.size(1);
at::cuda::CUDAGuard device_guard(xyz1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
chamfer_distance_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK, 0, stream>>>(
batch_size, n, xyz1.data_ptr<scalar_t>(), m,
xyz2.data_ptr<scalar_t>(), dist1.data_ptr<scalar_t>(),
idx1.data_ptr<int>());
});
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_forward_cuda_kernel", [&] {
chamfer_distance_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK, 0, stream>>>(
batch_size, m, xyz2.data_ptr<scalar_t>(), n,
xyz1.data_ptr<scalar_t>(), dist2.data_ptr<scalar_t>(),
idx2.data_ptr<int>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
void ChamferDistanceBackwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, Tensor grad_xyz1, Tensor grad_xyz2,
Tensor grad_dist1, Tensor grad_dist2, Tensor idx1, Tensor idx2) {
int batch_size = xyz1.size(0);
int n = xyz1.size(1);
int m = xyz2.size(1);
at::cuda::CUDAGuard device_guard(xyz1.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
chamfer_distance_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * n), THREADS_PER_BLOCK / 2, 0, stream>>>(
batch_size, m, xyz1.data_ptr<scalar_t>(), n,
xyz2.data_ptr<scalar_t>(), grad_dist1.data_ptr<scalar_t>(),
idx1.data_ptr<int>(), grad_xyz1.data_ptr<scalar_t>(),
grad_xyz2.data_ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
xyz1.scalar_type(), "chamfer_distance_backward_cuda_kernel", [&] {
chamfer_distance_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(batch_size * m), THREADS_PER_BLOCK / 2, 0, stream>>>(
batch_size, n, xyz2.data_ptr<scalar_t>(), m,
xyz1.data_ptr<scalar_t>(), grad_dist2.data_ptr<scalar_t>(),
idx2.data_ptr<int>(), grad_xyz2.data_ptr<scalar_t>(),
grad_xyz1.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) OpenMMLab. All rights reserved
// modified from
// https://github.com/SDL-GuoZonghao/BeyondBoundingBox/blob/main/mmdet/ops/iou/src/convex_iou_kernel.cu
#include "convex_iou_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor ious) {
int output_size = ious.numel();
int num_pointsets = pointsets.size(0);
int num_polygons = polygons.size(0);
at::cuda::CUDAGuard device_guard(pointsets.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
pointsets.scalar_type(), "convex_iou_cuda_kernel", ([&] {
convex_iou_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(
num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),
polygons.data_ptr<scalar_t>(), ious.data_ptr<scalar_t>());
}));
AT_CUDA_CHECK(cudaGetLastError());
}
void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor output) {
int output_size = output.numel();
int num_pointsets = pointsets.size(0);
int num_polygons = polygons.size(0);
at::cuda::CUDAGuard device_guard(pointsets.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
pointsets.scalar_type(), "convex_giou_cuda_kernel", ([&] {
convex_giou_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK / 2, 0, stream>>>(
num_pointsets, num_polygons, pointsets.data_ptr<scalar_t>(),
polygons.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
}));
AT_CUDA_CHECK(cudaGetLastError());
}
...
@@ -24,8 +24,8 @@ void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2,
auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
-const int threads = THREADS_FORWARD;
-const dim3 blocks(batch_size, oH, oW);
+const dim3 threads(WARP_SIZE, 4, 4);
+const dim3 blocks(batch_size, (oH + 3) >> 2, (oW + 3) >> 2);
at::cuda::CUDAGuard device_guard(input1.device());
@@ -56,17 +56,20 @@ void CorrelationBackwardCUDAKernelLauncher(
const int iW = input1.size(3);
const int C = input1.size(1);
-const dim3 blocks(C, iH, iW);
-const dim3 threads(THREADS_BACKWARD, THREADS_BACKWARD);
+auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
+auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
+const dim3 blocks(batch_size, iH, iW);
+const dim3 threads(THREADS_PER_BLOCK);
at::cuda::CUDAGuard device_guard(input1.device());
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input1.scalar_type(), "correlation_backward_cuda", ([&] {
+const int grad_cache_size = patchH * patchW * sizeof(scalar_t);
TensorAcc4R input1_acc =
-input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
+trInput1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
TensorAcc4R input2_acc =
-input2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
+trInput2.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
TensorAcc4R grad_input1_acc =
grad_input1.packed_accessor32<scalar_t, 4, RestrictPtrTraits>();
TensorAcc4R grad_input2_acc =
@@ -74,20 +77,18 @@ void CorrelationBackwardCUDAKernelLauncher(
TensorAcc5R grad_output_acc =
grad_output.packed_accessor32<scalar_t, 5, RestrictPtrTraits>();
-for (int n = 0; n < batch_size; ++n) {
-correlation_backward_cuda_kernel_input1<scalar_t>
-<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
-patchW, padH, padW, dilationH, dilationW, dilation_patchH,
-dilation_patchW, dH, dW, n);
-}
+correlation_backward_cuda_kernel_input1<scalar_t>
+<<<blocks, threads, grad_cache_size,
+at::cuda::getCurrentCUDAStream()>>>(
+grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH,
+patchW, padH, padW, dilationH, dilationW, dilation_patchH,
+dilation_patchW, dH, dW);
-for (int n = 0; n < batch_size; ++n) {
-correlation_backward_cuda_kernel_input2<scalar_t>
-<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
-patchW, padH, padW, dilationH, dilationW, dilation_patchH,
-dilation_patchW, dH, dW, n);
-}
+correlation_backward_cuda_kernel_input2<scalar_t>
+<<<blocks, threads, grad_cache_size,
+at::cuda::getCurrentCUDAStream()>>>(
+grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH,
+patchW, padH, padW, dilationH, dilationW, dilation_patchH,
+dilation_patchW, dH, dW);
}));
}
...
@@ -570,20 +570,15 @@ void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a,
const Tensor boxes_b,
Tensor ans_overlap);
-void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a,
-const Tensor boxes_a,
-const int num_b,
-const Tensor boxes_b,
-Tensor ans_iou);
-void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes,
-unsigned long long* mask, int boxes_num,
-float nms_overlap_thresh);
-void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes,
-unsigned long long* mask,
-int boxes_num,
-float nms_overlap_thresh);
+void IoU3DNMS3DForwardCUDAKernelLauncher(const Tensor boxes,
+unsigned long long* mask,
+int boxes_num,
+float nms_overlap_thresh);
+void IoU3DNMS3DNormalForwardCUDAKernelLauncher(const Tensor boxes,
+unsigned long long* mask,
+int boxes_num,
+float nms_overlap_thresh);
void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
@@ -592,45 +587,35 @@ void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a,
ans_overlap);
};
-void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a,
-const int num_b, const Tensor boxes_b,
-Tensor ans_iou) {
-IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b,
-ans_iou);
-};
-void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long* mask,
-int boxes_num, float nms_overlap_thresh) {
-IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh);
+void iou3d_nms3d_forward_cuda(const Tensor boxes, unsigned long long* mask,
+int boxes_num, float nms_overlap_thresh) {
+IoU3DNMS3DForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+nms_overlap_thresh);
};
-void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long* mask,
-int boxes_num, float nms_overlap_thresh) {
-IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
-nms_overlap_thresh);
+void iou3d_nms3d_normal_forward_cuda(const Tensor boxes,
+unsigned long long* mask, int boxes_num,
+float nms_overlap_thresh) {
+IoU3DNMS3DNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num,
+nms_overlap_thresh);
};
void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
const int num_b, const Tensor boxes_b,
Tensor ans_overlap);
-void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
-const int num_b, const Tensor boxes_b,
-Tensor ans_iou);
-void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long* mask,
-int boxes_num, float nms_overlap_thresh);
-void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long* mask,
-int boxes_num, float nms_overlap_thresh);
+void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long* mask,
+int boxes_num, float nms_overlap_thresh);
+void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+unsigned long long* mask, int boxes_num,
+float nms_overlap_thresh);
REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA,
iou3d_boxes_overlap_bev_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, CUDA,
-iou3d_boxes_iou_bev_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_nms_forward_impl, CUDA, iou3d_nms_forward_cuda);
-REGISTER_DEVICE_IMPL(iou3d_nms_normal_forward_impl, CUDA,
-iou3d_nms_normal_forward_cuda);
+REGISTER_DEVICE_IMPL(iou3d_nms3d_forward_impl, CUDA, iou3d_nms3d_forward_cuda);
+REGISTER_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, CUDA,
+iou3d_nms3d_normal_forward_cuda);
void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample,
const Tensor xyz, const Tensor new_xyz,
@@ -924,20 +909,20 @@ REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda);
void ROIAlignRotatedForwardCUDAKernelLauncher(
-const at::Tensor features, const at::Tensor rois, const float spatial_scale,
-const int sample_num, const bool aligned, const bool clockwise,
+const at::Tensor input, const at::Tensor rois, const float spatial_scale,
+const int sampling_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor output);
void ROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
-const int sample_num, const bool aligned, const bool clockwise,
+const int sampling_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor bottom_grad);
-void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
+void roi_align_rotated_forward_cuda(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
-float spatial_scale, int sample_ratio,
+float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
@@ -947,11 +932,11 @@ void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
AT_ERROR("wrong roi size");
}
-int num_channels = features.size(1);
-int data_height = features.size(2);
-int data_width = features.size(3);
+int num_channels = input.size(1);
+int data_height = input.size(2);
+int data_width = input.size(3);
ROIAlignRotatedForwardCUDAKernelLauncher(
-features, rois, spatial_scale, sample_ratio, aligned, clockwise,
+input, rois, spatial_scale, sampling_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, output);
}
@@ -959,7 +944,7 @@ void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
-int sample_ratio, bool aligned,
+int sampling_ratio, bool aligned,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
@@ -972,26 +957,101 @@ void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
ROIAlignRotatedBackwardCUDAKernelLauncher(
-top_grad, rois, spatial_scale, sample_ratio, aligned, clockwise,
+top_grad, rois, spatial_scale, sampling_ratio, aligned, clockwise,
num_channels, data_height, data_width, num_rois, aligned_height,
aligned_width, bottom_grad);
}
-void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output,
+void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
-float spatial_scale, int sample_ratio,
+float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
-int sample_ratio, bool aligned,
+int sampling_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA,
roi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA,
roi_align_rotated_backward_cuda);
void RiROIAlignRotatedForwardCUDAKernelLauncher(
const at::Tensor features, const at::Tensor rois, const float spatial_scale,
const int num_samples, const bool clockwise, const int channels,
const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const int num_orientations,
at::Tensor output);
void RiROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
const int num_samples, const bool clockwise, const int channels,
const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const int num_orientations,
at::Tensor bottom_grad);
void riroi_align_rotated_forward_cuda(Tensor features, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
CHECK_CONTIGUOUS(features);
CHECK_CONTIGUOUS(rois);
int num_channels = features.size(1) / num_orientations;
int data_height = features.size(2);
int data_width = features.size(3);
RiROIAlignRotatedForwardCUDAKernelLauncher(
features, rois, spatial_scale, num_samples, clockwise, num_channels,
data_height, data_width, num_rois, pooled_height, pooled_width,
num_orientations, output);
}
void riroi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise) {
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 6) {
AT_ERROR("wrong roi size");
}
CHECK_CONTIGUOUS(top_grad);
CHECK_CONTIGUOUS(rois);
int num_channels = bottom_grad.size(1) / num_orientations;
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
RiROIAlignRotatedBackwardCUDAKernelLauncher(
top_grad, rois, spatial_scale, num_samples, clockwise, num_channels,
data_height, data_width, num_rois, pooled_height, pooled_width,
num_orientations, bottom_grad);
}
void riroi_align_rotated_forward_impl(Tensor features, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise);
void riroi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale,
int num_samples, int num_orientations,
bool clockwise);
REGISTER_DEVICE_IMPL(riroi_align_rotated_forward_impl, CUDA,
riroi_align_rotated_forward_cuda);
REGISTER_DEVICE_IMPL(riroi_align_rotated_backward_impl, CUDA,
riroi_align_rotated_backward_cuda);
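// Illustrative note: the rotation-invariant variant additionally splits the
// channel dimension into num_orientations groups (num_channels =
// features.size(1) / num_orientations above) and expects 6-column rois. A
// hedged usage sketch, assuming feats.size(1) is divisible by
// num_orientations:
//
//   Tensor out = at::zeros({rois.size(0), feats.size(1), pooled_height,
//                           pooled_width}, feats.options());
//   riroi_align_rotated_forward_impl(feats, rois, out, pooled_height,
//                                    pooled_width, spatial_scale, num_samples,
//                                    num_orientations, clockwise);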
void RoiawarePool3dForwardCUDAKernelLauncher(
int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x,
int out_y, int out_z, const Tensor rois, const Tensor pts,
@@ -1321,6 +1381,12 @@ int HardVoxelizeForwardCUDAKernelLauncher(
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3);
int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3);
void DynamicVoxelizeForwardCUDAKernelLauncher(
const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size, const std::vector<float> coors_range,
@@ -1338,6 +1404,16 @@ int hard_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& voxels,
max_points, max_voxels, NDim);
};
int nondeterministic_hard_voxelize_forward_cuda(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim) {
return NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
max_points, max_voxels, NDim);
};
void dynamic_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
@@ -1354,11 +1430,361 @@ int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels,
const int max_points, const int max_voxels,
const int NDim);
int nondeterministic_hard_voxelize_forward_impl(
const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors,
at::Tensor& num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim);
void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors,
const std::vector<float> voxel_size,
const std::vector<float> coors_range,
const int NDim);
REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA,
hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(nondeterministic_hard_voxelize_forward_impl, CUDA,
nondeterministic_hard_voxelize_forward_cuda);
REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA,
dynamic_voxelize_forward_cuda);
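// Illustrative note: hard voxelization writes into pre-allocated fixed-size
// buffers and returns the number of voxels actually produced; the
// nondeterministic variant registered above shares the same signature but
// does not guarantee a reproducible voxel ordering. A minimal sketch with
// hypothetical sizes, assuming `points` is an (N, 4) CUDA float tensor:
//
//   std::vector<float> voxel_size = {0.5f, 0.5f, 0.5f};        // assumed
//   std::vector<float> coors_range = {0, -40, -3, 70, 40, 1};  // assumed
//   auto voxels = at::zeros({max_voxels, max_points, points.size(1)},
//                           points.options());
//   auto coors = at::zeros({max_voxels, 3}, points.options().dtype(at::kInt));
//   auto num_points_per_voxel =
//       at::zeros({max_voxels}, points.options().dtype(at::kInt));
//   int voxel_num = hard_voxelize_forward_impl(
//       points, voxels, coors, num_points_per_voxel, voxel_size, coors_range,
//       max_points, max_voxels, /*NDim=*/3);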
void RotatedFeatureAlignForwardCUDAKernelLauncher(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points,
Tensor output);
void RotatedFeatureAlignBackwardCUDAKernelLauncher(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points,
Tensor bottom_grad);
void rotated_feature_align_forward_cuda(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output) {
RotatedFeatureAlignForwardCUDAKernelLauncher(features, best_bboxes,
spatial_scale, points, output);
};
void rotated_feature_align_backward_cuda(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad) {
RotatedFeatureAlignBackwardCUDAKernelLauncher(
top_grad, best_bboxes, spatial_scale, points, bottom_grad);
};
void rotated_feature_align_forward_impl(const Tensor features,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor output);
void rotated_feature_align_backward_impl(const Tensor top_grad,
const Tensor best_bboxes,
const float spatial_scale,
const int points, Tensor bottom_grad);
REGISTER_DEVICE_IMPL(rotated_feature_align_forward_impl, CUDA,
rotated_feature_align_forward_cuda);
REGISTER_DEVICE_IMPL(rotated_feature_align_backward_impl, CUDA,
rotated_feature_align_backward_cuda);
void PointsInPolygonsForwardCUDAKernelLauncher(const at::Tensor points,
const at::Tensor polygons,
const int rows, const int cols,
at::Tensor output);
void points_in_polygons_forward_cuda(const Tensor points, const Tensor polygons,
Tensor output, const int rows,
const int cols) {
PointsInPolygonsForwardCUDAKernelLauncher(points, polygons, rows, cols,
output);
};
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
Tensor output, const int rows,
const int cols);
REGISTER_DEVICE_IMPL(points_in_polygons_forward_impl, CUDA,
points_in_polygons_forward_cuda);
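// Illustrative note: `rows` is the number of query points and `cols` the
// number of polygons, so `output` is expected to be a (rows, cols) tensor
// whose (i, j) entry flags whether point i lies inside polygon j. A hedged
// sketch, assuming `points` is (N, 2) and `polygons` is (M, 8):
//
//   Tensor output = at::zeros({points.size(0), polygons.size(0)},
//                             points.options());
//   points_in_polygons_forward_impl(points, polygons, output,
//                                   points.size(0), polygons.size(0));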
torch::Tensor IndiceMaxpoolForwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
torch::Tensor indice_maxpool_forward_cuda(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct) {
return IndiceMaxpoolForwardCUDAKernelLauncher(features, indicePairs,
indiceNum, numAct);
};
torch::Tensor indice_maxpool_forward_impl(torch::Tensor features,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numAct);
REGISTER_DEVICE_IMPL(indice_maxpool_forward_impl, CUDA,
indice_maxpool_forward_cuda);
torch::Tensor IndiceMaxpoolBackwardCUDAKernelLauncher(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
torch::Tensor indice_maxpool_backward_cuda(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
return IndiceMaxpoolBackwardCUDAKernelLauncher(features, outFeatures, outGrad,
indicePairs, indiceNum);
};
torch::Tensor indice_maxpool_backward_impl(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
REGISTER_DEVICE_IMPL(indice_maxpool_backward_impl, CUDA,
indice_maxpool_backward_cuda)
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM);
torch::Tensor indice_conv_forward_cuda(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
return IndiceConvForwardCUDAKernelLauncher(
features, filters, indicePairs, indiceNum, numActOut, _inverse, _subM);
};
torch::Tensor indice_conv_forward_impl(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_forward_impl, CUDA, indice_conv_forward_cuda);
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
std::vector<torch::Tensor> indice_conv_backward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
return IndiceConvBackwardCUDAKernelLauncher(
features, filters, outGrad, indicePairs, indiceNum, _inverse, _subM);
};
std::vector<torch::Tensor> indice_conv_backward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM);
REGISTER_DEVICE_IMPL(indice_conv_backward_impl, CUDA,
indice_conv_backward_cuda);
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
torch::Tensor fused_indice_conv_batchnorm_forward_cuda(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return FusedIndiceConvBatchnormCUDAKernelLauncher(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
};
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM);
REGISTER_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl, CUDA,
fused_indice_conv_batchnorm_forward_cuda)
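// Illustrative note: for the fused sparse conv + batchnorm path, the caller is
// expected to pass batchnorm-folded weights as `filters` and the folded shift
// as `bias`; the launcher further down initializes the output rows with
// `bias` (output.copy_(bias)) before accumulating the per-offset GEMMs, which
// is what realizes the fusion.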
void MinAreaPolygonsCUDAKernelLauncher(const Tensor pointsets, Tensor polygons);
void min_area_polygons_cuda(const Tensor pointsets, Tensor polygons) {
MinAreaPolygonsCUDAKernelLauncher(pointsets, polygons);
}
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons);
REGISTER_DEVICE_IMPL(min_area_polygons_impl, CUDA, min_area_polygons_cuda);
void ActiveRotatedFilterForwardCUDAKernelLauncher(const Tensor input,
const Tensor indices,
Tensor output);
void ActiveRotatedFilterBackwardCUDAKernelLauncher(const Tensor grad_out,
const Tensor indices,
Tensor grad_in);
void active_rotated_filter_forward_cuda(const Tensor input,
const Tensor indices, Tensor output) {
ActiveRotatedFilterForwardCUDAKernelLauncher(input, indices, output);
};
void active_rotated_filter_backward_cuda(const Tensor grad_out,
const Tensor indices, Tensor grad_in) {
ActiveRotatedFilterBackwardCUDAKernelLauncher(grad_out, indices, grad_in);
};
void active_rotated_filter_forward_impl(const Tensor input,
const Tensor indices, Tensor output);
void active_rotated_filter_backward_impl(const Tensor grad_out,
const Tensor indices, Tensor grad_in);
REGISTER_DEVICE_IMPL(active_rotated_filter_forward_impl, CUDA,
active_rotated_filter_forward_cuda);
REGISTER_DEVICE_IMPL(active_rotated_filter_backward_impl, CUDA,
active_rotated_filter_backward_cuda);
void ConvexIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor ious);
void ConvexGIoUCUDAKernelLauncher(const Tensor pointsets, const Tensor polygons,
Tensor output);
void convex_iou_cuda(const Tensor pointsets, const Tensor polygons,
Tensor ious) {
ConvexIoUCUDAKernelLauncher(pointsets, polygons, ious);
}
void convex_giou_cuda(const Tensor pointsets, const Tensor polygons,
Tensor output) {
ConvexGIoUCUDAKernelLauncher(pointsets, polygons, output);
}
void convex_iou_impl(const Tensor pointsets, const Tensor polygons,
Tensor ious);
void convex_giou_impl(const Tensor pointsets, const Tensor polygons,
Tensor output);
REGISTER_DEVICE_IMPL(convex_iou_impl, CUDA, convex_iou_cuda);
REGISTER_DEVICE_IMPL(convex_giou_impl, CUDA, convex_giou_cuda);
Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(Tensor vertices,
Tensor mask,
Tensor num_valid);
Tensor diff_iou_rotated_sort_vertices_forward_cuda(Tensor vertices, Tensor mask,
Tensor num_valid) {
return DiffIoURotatedSortVerticesCUDAKernelLauncher(vertices, mask,
num_valid);
}
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid);
REGISTER_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl, CUDA,
diff_iou_rotated_sort_vertices_forward_cuda);
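// Illustrative note: the sort-vertices kernel (its launcher appears later in
// this diff) returns, per box pair, an ordering of the valid intersection
// vertices; the Python side of diff_iou_rotated can then use that ordering to
// compute the intersection area differentiably.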
void ChamferDistanceForwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, const Tensor dist1,
const Tensor dist2, const Tensor idx1, const Tensor idx2);
void ChamferDistanceBackwardCUDAKernelLauncher(
const Tensor xyz1, const Tensor xyz2, Tensor grad_xyz1, Tensor grad_xyz2,
Tensor grad_dist1, Tensor grad_dist2, Tensor idx1, Tensor idx2);
void chamfer_distance_forward_cuda(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx2) {
ChamferDistanceForwardCUDAKernelLauncher(xyz1, xyz2, dist1, dist2, idx1,
idx2);
};
void chamfer_distance_backward_cuda(const Tensor xyz1, const Tensor xyz2,
Tensor gradxyz1, Tensor gradxyz2,
Tensor graddist1, Tensor graddist2,
Tensor idx1, Tensor idx2) {
ChamferDistanceBackwardCUDAKernelLauncher(xyz1, xyz2, gradxyz1, gradxyz2,
graddist1, graddist2, idx1, idx2);
};
void chamfer_distance_forward_impl(const Tensor xyz1, const Tensor xyz2,
const Tensor dist1, const Tensor dist2,
const Tensor idx1, const Tensor idx2);
void chamfer_distance_backward_impl(const Tensor xyz1, const Tensor xyz2,
Tensor gradxyz1, Tensor gradxyz2,
Tensor graddist1, Tensor graddist2,
Tensor idx1, Tensor idx2);
REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,
chamfer_distance_forward_cuda);
REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,
chamfer_distance_backward_cuda);
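// Illustrative note: for batched 2-D point sets xyz1 and xyz2, the forward
// pass fills dist1/idx1 with each xyz1 point's squared distance to (and index
// of) its nearest neighbour in xyz2, and dist2/idx2 symmetrically; the
// backward pass routes graddist1/graddist2 back to the matched coordinates.
// The Chamfer distance itself is then roughly dist1.mean() + dist2.mean(),
// formed on the Python side (a hedged reading, not defined in this file).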
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolCoorBackwardCUDAKernelLauncher(
Tensor output, Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);
void prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolForwardCUDAKernelLauncher(input, rois, output, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
PrROIPoolBackwardCUDAKernelLauncher(grad_output, rois, grad_input,
pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolCoorBackwardCUDAKernelLauncher(output, grad_output, input, rois,
grad_rois, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale);
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale);
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
prroi_pool_coor_backward_cuda);
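// Illustrative note: Precise RoI Pooling integrates the feature map over each
// bin instead of sampling it, so it is also differentiable with respect to
// the RoI coordinates; that is why a third launcher
// (PrROIPoolCoorBackwardCUDAKernelLauncher) produces grad_rois in addition to
// the usual grad_input path.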
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa
#include "diff_iou_rotated_cuda_kernel.cuh"
#include "pytorch_cpp_helper.hpp"
#include "pytorch_cuda_helper.hpp"
at::Tensor DiffIoURotatedSortVerticesCUDAKernelLauncher(at::Tensor vertices,
at::Tensor mask,
at::Tensor num_valid) {
at::cuda::CUDAGuard device_guard(vertices.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
CHECK_CONTIGUOUS(vertices);
CHECK_CONTIGUOUS(mask);
CHECK_CONTIGUOUS(num_valid);
CHECK_CUDA(vertices);
CHECK_CUDA(mask);
CHECK_CUDA(num_valid);
int b = vertices.size(0);
int n = vertices.size(1);
int m = vertices.size(2);
at::Tensor idx =
torch::zeros({b, n, MAX_NUM_VERT_IDX},
at::device(vertices.device()).dtype(at::ScalarType::Int));
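// Launch configuration: one thread block per batch element (b blocks), with
// opt_n_thread(n) threads covering the n entries along dim 1.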
diff_iou_rotated_sort_vertices_forward_cuda_kernel<<<b, opt_n_thread(n), 0,
stream>>>(
b, n, m, vertices.data_ptr<float>(), mask.data_ptr<bool>(),
num_valid.data_ptr<int>(), idx.data_ptr<int>());
AT_CUDA_CHECK(cudaGetLastError());
return idx;
}
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
torch::Tensor FusedIndiceConvBatchnormCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) {  // the center index of a subM conv doesn't need gather and scatter-add
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
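// Gather -> GEMM -> scatter-add, once per kernel offset: gather the active
// input rows for offset i into inputBuffer, multiply them by the matching
// filter slice, then scatter-add the partial results into the
// bias-initialized output rows.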
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "FusedIndiceConvBatchnormKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, maybe due to the int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob =
torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}