"docs/vscode:/vscode.git/clone" did not exist on "efd602c8207c90908fc4cd127235f1b57e741814"
Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
@@ -32,12 +32,12 @@ __device__ inline int Loc2Index(const int n, const int c, const int h,
 #ifndef HIP_DIFF
 /* TODO: move this to a common place */
 template <typename scalar_t>
-__device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) {
+__device__ inline scalar_t min(scalar_t a, scalar_t b) {
   return a < b ? a : b;
 }
 template <typename scalar_t>
-__device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) {
+__device__ inline scalar_t max(scalar_t a, scalar_t b) {
   return a > b ? a : b;
 }
 #endif
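A minimal usage sketch of the templated device overloads above (illustrative only, not part of the commit; the kernel name and the use of CUDA_1D_KERNEL_LOOP from the common helper are assumptions):

template <typename scalar_t>
__global__ void clamp_kernel(const scalar_t* in, scalar_t lo, scalar_t hi,
                             scalar_t* out, int n) {
  // works for any scalar_t thanks to the templated min/max above
  CUDA_1D_KERNEL_LOOP(i, n) { out[i] = max(lo, min(in[i], hi)); }
}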
......
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH
#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAX_SHARED_SCALAR_T 6144  // 49152 bytes of static shared memory / sizeof(double): 49152 / 8 = 6144
template <typename scalar_t>
__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,
const scalar_t* xyz, int m,
const scalar_t* xyz2,
scalar_t* result,
int* result_i) {
__shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
for (int i = blockIdx.x; i < b; i += gridDim.x) {
for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {
int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;
for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {
buf[j] = xyz2[(i * m + k2) * 2 + j];
}
__syncthreads();
// each y-block handles its own slice of the n points; without the
// blockIdx.y offset, a stride of blockDim.x * gridDim.y would skip points
for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n;
     j += blockDim.x * gridDim.y) {
scalar_t x1 = xyz[(i * n + j) * 2 + 0];
scalar_t y1 = xyz[(i * n + j) * 2 + 1];
int best_i = 0;
scalar_t best = 1e10;
// round end_k down to a multiple of 4 for the 4-way unrolled loops below;
// masking with (~2) would let the unrolled loop read past the valid part of buf
int end_ka = end_k & (~3);
if (end_ka == THREADS_PER_BLOCK) {
for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
#pragma unroll
for (int j = 0; j < 4; ++j) {
scalar_t x2 = buf[(k + j) * 2] - x1;
scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
scalar_t d = x2 * x2 + y2 * y2;
if (d < best) {
best = d;
best_i = k + k2 + j;
}
}
}
} else {
for (int k = 0; k < end_ka; k += 4) {
#pragma unroll
for (int j = 0; j < 4; ++j) {
scalar_t x2 = buf[(k + j) * 2] - x1;
scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
scalar_t d = x2 * x2 + y2 * y2;
if (d < best) {
best = d;
best_i = k + k2 + j;
}
}
}
}
for (int k = end_ka; k < end_k; k++) {
scalar_t x2 = buf[k * 2 + 0] - x1;
scalar_t y2 = buf[k * 2 + 1] - y1;
scalar_t d = x2 * x2 + y2 * y2;
if (k == 0 || d < best) {
best = d;
best_i = k + k2;
}
}
if (k2 == 0 || result[(i * n + j)] > best) {
result[(i * n + j)] = best;
result_i[(i * n + j)] = best_i;
}
}
__syncthreads();
}
}
}
template <typename scalar_t>
__global__ void chamfer_distance_backward_cuda_kernel(
int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
scalar_t* grad_xyz2) {
for (int i = blockIdx.x; i < b; i += gridDim.x) {
for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n;
     j += blockDim.x * gridDim.y) {
scalar_t x1 = xyz1[(i * n + j) * 2 + 0];
scalar_t y1 = xyz1[(i * n + j) * 2 + 1];
int j2 = idx1[i * n + j];
scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];
scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];
scalar_t g = grad_dist1[i * n + j] * 2;
atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));
atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));
atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));
atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));
}
}
}
#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH
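A hedged launch sketch for the forward kernel above (the real launcher lives in the corresponding .cu file and is not part of this header; the pointer names, stream, and the grid.y split factor are assumptions for illustration). gridDim.x strides over the batch, gridDim.y splits the loop over the n query points, and the block's THREADS_PER_BLOCK threads cooperatively stage tiles of xyz2 in shared memory:

dim3 grid(b < 4096 ? b : 4096, 16);  // grid.y split factor chosen arbitrarily
chamfer_distance_forward_cuda_kernel<float>
    <<<grid, THREADS_PER_BLOCK, 0, stream>>>(b, n, xyz1, m, xyz2, dist1, idx1);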
@@ -7,12 +7,20 @@
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
        i += blockDim.x * gridDim.x)
+#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                              \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
+       i += blockDim.x * gridDim.x)                                 \
+    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
+         j += blockDim.y * gridDim.y)
+#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
+  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
+    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
 #define THREADS_PER_BLOCK 512
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-inline int GET_BLOCKS(const int N) {
-  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
+  int optimal_block_num = (N + num_threads - 1) / num_threads;
   int max_block_num = 4096;
   return min(optimal_block_num, max_block_num);
 }
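Minimal usage sketch of the helpers above (kernel and variable names are illustrative, not from this diff): GET_BLOCKS caps the grid at 4096 blocks, and the grid-stride CUDA_1D_KERNEL_LOOP lets that capped grid still cover all N elements.

template <typename T>
__global__ void scale_kernel(int n, T alpha, T* x) {
  CUDA_1D_KERNEL_LOOP(i, n) { x[i] *= alpha; }  // each thread strides over n
}
// launch: scale_kernel<<<GET_BLOCKS(N), THREADS_PER_BLOCK, 0, stream>>>(N, 2.f, d_x);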
......
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
#define CONVEX_IOU_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAXN 100
#define NMAX 512
__device__ const double EPS = 1E-8;
__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }
struct Point {
double x, y;
__device__ Point() {}
__device__ Point(double x, double y) : x(x), y(y) {}
};
__device__ inline bool point_same(Point& a, Point& b) {
return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;
}
__device__ inline void swap1(Point* a, Point* b) {
Point temp;
temp.x = a->x;
temp.y = a->y;
a->x = b->x;
a->y = b->y;
b->x = temp.x;
b->y = temp.y;
}
__device__ inline void reverse1(Point* a, const int n) {
for (int i = 0; i < (n - 1) / 2.0; i++) {
Point* j = &(a[i]);
Point* k = &(a[n - 1 - i]);
swap1(j, k);
}
}
__device__ inline double cross(Point o, Point a, Point b) {
return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
}
__device__ inline double dis(Point a, Point b) {
return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
}
__device__ inline double area(Point* ps, int n) {
ps[n] = ps[0];
double res = 0;
for (int i = 0; i < n; i++) {
res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
}
return res / 2.0;
}
__device__ inline double polygon_area_grad(Point* ps, int n,
int* polygon_to_pred_index,
int n_pred, double* grad_C) {
ps[n] = ps[0];
double partion_grad[4 * 30 + 2];
double res = 0;
for (int i = 0; i < n; i++) {
res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
partion_grad[i * 4 + 2] = ps[i + 1].y;
partion_grad[i * 4 + 3] = -ps[i + 1].x;
if (i != n - 1) {
partion_grad[i * 4 + 4] = -ps[i].y;
partion_grad[i * 4 + 5] = ps[i].x;
} else {
partion_grad[0] = -ps[i].y;
partion_grad[1] = ps[i].x;
}
}
for (int i = 0; i < n; i++) {
for (int j = 0; j < n_pred; j++) {
if (i == polygon_to_pred_index[j]) {
grad_C[2 * polygon_to_pred_index[j + n_pred]] =
(partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
break;
}
}
for (int j = 0; j < n_pred; j++) {
if (i == polygon_to_pred_index[j]) {
grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
(partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
break;
}
}
}
return res / 2.0;
}
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
double* cut_grad, int m, int n, int i) {
double s1, s2;
double s2_s1_2;
double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;
s1 = cross(a, b, c);
s2 = cross(a, b, d);
ds1_dxc = -(b.y - a.y);
ds1_dyc = b.x - a.x;
ds2_dxd = ds1_dxc;
ds2_dyd = ds1_dyc;
s2_s1_2 = (s2 - s1) * (s2 - s1);
if (sig(s1) == 0 && sig(s2) == 0) return 2;
if (sig(s2 - s1) == 0) return 0;
dxp_dxc =
((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
(s2_s1_2);
dxp_dyc =
((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
(s2_s1_2);
dxp_dxd =
((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
(s2_s1_2);
dxp_dyd =
((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
(s2_s1_2);
dyp_dxc =
((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
(s2_s1_2);
dyp_dyc =
((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
(s2_s1_2);
dyp_dxd =
((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
(s2_s1_2);
dyp_dyd =
((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
(s2_s1_2);
p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
if (i == n - 1) {
cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd;
cut_grad[4 * n * m + 1] = dyp_dxd;
cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd;
cut_grad[4 * n * m + 3] = dyp_dyd;
} else {
cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd;
cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd;
cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
}
return 1;
}
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
double* cut_grad) {
Point pp[MAXN];
double ccur_grad[MAXN] = {};
int m = 0;
p[n] = p[0];
int k = n;
for (int i = 0; i < n; i++) {
if (sig(cross(a, b, p[i])) > 0) {
pp[m] = p[i];
ccur_grad[4 * n * m + 4 * i] = 1.0;
ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
m++;
}
if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
m++;
}
}
n = 0;
for (int i = 0; i < m; i++) {
if (!i || !(point_same(pp[i], pp[i - 1]))) {
p[n] = pp[i];
for (int j = 0; j < 4 * k; j++) {
cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
}
n++;
}
}
while (n > 1 && point_same(p[n - 1], p[0])) n--;
}
__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
double* grad_AB, int order,
int convex_n) {
Point o(0, 0);
int res_flag = 0;
int s1 = sig(cross(o, a, b));
int s2 = sig(cross(o, c, d));
if (s1 == 0 || s2 == 0) return 0.0;
if (s1 == -1) {
Point* i = &a;
Point* j = &b;
swap1(i, j);
res_flag = 1;
}
if (s2 == -1) {
Point* i = &c;
Point* j = &d;
swap1(i, j);
}
Point p[10] = {o, a, b};
int n = 3, n0 = 3, n1, n2, n3;
double cut_grad1[MAXN] = {};
double cut_grad2[MAXN] = {};
double cut_grad3[MAXN] = {};
double p1_p_grad[10][10] = {};
double p2_p1_grad[10][10] = {};
double p3_p2_grad[10][10] = {};
double p3_p1_grad[10][10] = {};
double p3_p_grad[10][10] = {};
// 1
polygon_cut(p, n, o, c, cut_grad1);
n1 = n;
for (int i = 0; i < n; i++) {
for (int j = 0; j < 4 * n0; j++) {
if (!(j % 2)) {
p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
} else {
p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
}
}
}
// 2
polygon_cut(p, n, c, d, cut_grad2);
n2 = n;
for (int i = 0; i < n; i++) {
for (int j = 0; j < 4 * n1; j++) {
if (!(j % 2)) {
p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
} else {
p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
}
}
}
// 3
polygon_cut(p, n, d, o, cut_grad3);
n3 = n;
for (int i = 0; i < n; i++) {
for (int j = 0; j < 4 * n2; j++) {
if (!(j % 2)) {
p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
} else {
p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
}
}
}
// mul
// p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
for (int i = 0; i < 2 * n3; i++) {
for (int j = 0; j < 2 * n1; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n2; m++) {
sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
}
p3_p1_grad[i][j] = sum;
}
}
// p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
for (int i = 0; i < 2 * n3; i++) {
for (int j = 0; j < 2 * n0; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n1; m++) {
sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
}
p3_p_grad[i][j] = sum;
}
}
// calculate S_grad
int polygon_index_box_index[20];
double grad_polygon[20];
double S_grad[6];
for (int i = 0; i < n3; i++) {
polygon_index_box_index[i] = i;
polygon_index_box_index[i + n3] = i;
}
double res =
polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);
if (s1 * s2 == -1) {
for (int j = 0; j < 2 * 3; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n3; m++) {
sum = sum - grad_polygon[m] * p3_p_grad[m][j];
}
S_grad[j] = sum;
}
if (order != convex_n - 1) {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[2 * order + 2] += S_grad[2];
grad_AB[2 * order + 3] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[2 * order + 2] += S_grad[4];
grad_AB[2 * order + 3] += S_grad[5];
}
} else {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[0] += S_grad[2];
grad_AB[1] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[0] += S_grad[4];
grad_AB[1] += S_grad[5];
}
}
res = -res;
} else {
for (int j = 0; j < 2 * 3; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n3; m++) {
sum = sum + grad_polygon[m] * p3_p_grad[m][j];
}
S_grad[j] = sum;
}
if (order != convex_n - 1) {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[2 * order + 2] += S_grad[2];
grad_AB[2 * order + 3] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[2 * order + 2] += S_grad[4];
grad_AB[2 * order + 3] += S_grad[5];
}
} else {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[0] += S_grad[2];
grad_AB[1] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[0] += S_grad[4];
grad_AB[1] += S_grad[5];
}
}
}
return res;
}
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
double* grad_AB) {
if (area(ps1, n1) < 0) reverse1(ps1, n1);
if (area(ps2, n2) < 0) reverse1(ps2, n2);
ps1[n1] = ps1[0];
ps2[n2] = ps2[0];
double res = 0;
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
res +=
intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);
}
}
return res;
}
__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
Point p_max, p_k;
int max_index, k_index;
int Stack[NMAX] = {}, top1, top2;
double sign;
Point right_point[10], left_point[10];
for (int i = 0; i < n_poly; i++) {
if (in_poly[i].y < in_poly[0].y ||
in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
Point* j = &(in_poly[0]);
Point* k = &(in_poly[i]);
swap1(j, k);
}
if (i == 0) {
p_max = in_poly[0];
max_index = 0;
}
if (in_poly[i].y > p_max.y ||
in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
p_max = in_poly[i];
max_index = i;
}
}
if (max_index == 0) {
max_index = 1;
p_max = in_poly[max_index];
}
k_index = 0, Stack[0] = 0, top1 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
dis(in_poly[Stack[top1]], p_k)))) {
p_k = in_poly[i];
k_index = i;
}
}
top1++;
Stack[top1] = k_index;
}
for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];
k_index = 0, Stack[0] = 0, top2 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
dis(in_poly[Stack[top2]], p_k))) {
p_k = in_poly[i];
k_index = i;
}
}
top2++;
Stack[top2] = k_index;
}
for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];
for (int i = 0; i < top1 + top2; i++) {
if (i <= top1) {
in_poly[i] = right_point[i];
} else {
in_poly[i] = left_point[top2 - (i - top1)];
}
}
n_poly = top1 + top2;
}
__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
int n2, double* grad_C) {
Point polygon[MAXN];
int n = n1 + n2, n_poly = 0;
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n - n1; j++) {
if (point_same(ps1[i], ps2[j])) {
for (int k = j; k < n - n1 - 1; k++) {
ps2[k] = ps2[k + 1];
}
n2--;
break;
}
}
}
n_poly = n1 + n2;
for (int i = 0; i < n_poly; i++) {
if (i < n1) {
polygon[i] = ps1[i];
} else {
polygon[i] = ps2[i - n1];
}
}
Jarvis(polygon, n_poly);
int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1};
int n_pred = 0;
for (int i = 0; i < n_poly; i++) {
for (int j = 0; j < n1; j++) {
if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
polygon_to_pred_index[n_pred] = i;
polygon_to_pred_index[n_pred + n1] = j;
n_pred += 1;
break;
}
}
}
if (n_pred == 0) {
double polygon_area = fabs(area(polygon, n_poly));
for (int i = 0; i < 18; i++) {
grad_C[i] = 0.0;
}
return polygon_area;
} else {
double polygon_area =
polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
if (polygon_area < 0) {
for (int i = 0; i < 18; i++) {
grad_C[i] = -grad_C[i];
}
}
return fabs(polygon_area);
}
}
// convex_find and get the polygon_index_box_index
__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
int* points_to_convex_ind) {
int n_input = n_poly;
Point input_poly[20];
for (int i = 0; i < n_input; i++) {
input_poly[i].x = in_poly[i].x;
input_poly[i].y = in_poly[i].y;
}
Point p_max, p_k;
int max_index, k_index;
int Stack[20], top1, top2;
double sign;
Point right_point[10], left_point[10];
for (int i = 0; i < n_poly; i++) {
if (in_poly[i].y < in_poly[0].y ||
in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
Point* j = &(in_poly[0]);
Point* k = &(in_poly[i]);
swap1(j, k);
}
if (i == 0) {
p_max = in_poly[0];
max_index = 0;
}
if (in_poly[i].y > p_max.y ||
in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
p_max = in_poly[i];
max_index = i;
}
}
if (max_index == 0) {
max_index = 1;
p_max = in_poly[max_index];
}
k_index = 0, Stack[0] = 0, top1 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
dis(in_poly[Stack[top1]], p_k)))) {
p_k = in_poly[i];
k_index = i;
}
}
top1++;
Stack[top1] = k_index;
}
for (int i = 0; i <= top1; i++) {
right_point[i] = in_poly[Stack[i]];
}
k_index = 0, Stack[0] = 0, top2 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
dis(in_poly[Stack[top2]], p_k))) {
p_k = in_poly[i];
k_index = i;
}
}
top2++;
Stack[top2] = k_index;
}
for (int i = top2 - 1; i >= 0; i--) {
left_point[i] = in_poly[Stack[i]];
}
for (int i = 0; i < top1 + top2; i++) {
if (i <= top1) {
in_poly[i] = right_point[i];
} else {
in_poly[i] = left_point[top2 - (i - top1)];
}
}
n_poly = top1 + top2;
for (int i = 0; i < n_poly; i++) {
for (int j = 0; j < n_input; j++) {
if (point_same(in_poly[i], input_poly[j])) {
points_to_convex_ind[i] = j;
break;
}
}
}
}
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q,
T* point_grad, const int idx) {
Point ps1[MAXN], ps2[MAXN];
Point convex[MAXN];
for (int i = 0; i < 9; i++) {
convex[i].x = (double)p[i * 2];
convex[i].y = (double)p[i * 2 + 1];
}
int n_convex = 9;
int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
Jarvis_and_index(convex, n_convex, points_to_convex_ind);
int n1 = n_convex;
int n2 = 4;
for (int i = 0; i < n1; i++) {
ps1[i].x = (double)convex[i].x;
ps1[i].y = (double)convex[i].y;
}
for (int i = 0; i < n2; i++) {
ps2[i].x = (double)q[i * 2];
ps2[i].y = (double)q[i * 2 + 1];
}
int polygon_index_box_index[18];
for (int i = 0; i < n1; i++) {
polygon_index_box_index[i] = i;
polygon_index_box_index[i + n1] = i;
}
double grad_A[18] = {};
double grad_AB[18] = {};
double grad_C[18] = {};
double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
double S_pred =
polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
if (S_pred < 0) {
for (int i = 0; i < n_convex * 2; i++) {
grad_A[i] = -grad_A[i];
}
}
double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
double iou = inter_area / union_area;
double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);
// printf("%d:live\n", idx);
double rot_giou = iou - (polygon_area - union_area) / polygon_area;
float grad_point_temp[18] = {};
for (int i = 0; i < n_convex; i++) {
int grad_point = points_to_convex_ind[i];
grad_point_temp[2 * grad_point] =
(float)((union_area + inter_area) / (union_area * union_area) *
grad_AB[2 * i] -
iou / union_area * grad_A[2 * i] -
1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
(union_area) / polygon_area / polygon_area * grad_C[2 * i]);
grad_point_temp[2 * grad_point + 1] =
(float)((union_area + inter_area) / (union_area * union_area) *
grad_AB[2 * i + 1] -
iou / union_area * grad_A[2 * i + 1] -
1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
(union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
}
for (int i = 0; i < 9; i++) {
point_grad[2 * i] = grad_point_temp[2 * i];
point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
}
return (float)rot_giou;
}
template <typename T>
__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
const int gt_n_boxes, const T* ex_boxes,
const T* gt_boxes, T* point_grad) {
CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
const T* cur_box = ex_boxes + index * 18;
const T* cur_gt_box = gt_boxes + index * 8;
T* cur_grad = point_grad + index * 19;
T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);
cur_grad[18] = giou;
}
}
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
double s1, s2;
s1 = cross(a, b, c);
s2 = cross(a, b, d);
if (sig(s1) == 0 && sig(s2) == 0) return 2;
if (sig(s2 - s1) == 0) return 0;
p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
return 1;
}
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
Point pp[MAXN];
int m = 0;
p[n] = p[0];
for (int i = 0; i < n; i++) {
if (sig(cross(a, b, p[i])) > 0) {
pp[m] = p[i];
m++;
}
if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
lineCross(a, b, p[i], p[i + 1], pp[m]);
m++;
}
}
n = 0;
for (int i = 0; i < m; i++) {
if (!i || !(point_same(pp[i], pp[i - 1]))) {
p[n] = pp[i];
n++;
}
}
while (n > 1 && point_same(p[n - 1], p[0])) n--;
}
__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
Point o(0, 0);
int s1 = sig(cross(o, a, b));
int s2 = sig(cross(o, c, d));
if (s1 == 0 || s2 == 0) return 0.0;
if (s1 == -1) {
Point* i = &a;
Point* j = &b;
swap1(i, j);
}
if (s2 == -1) {
Point* i = &c;
Point* j = &d;
swap1(i, j);
}
Point p[10] = {o, a, b};
int n = 3;
polygon_cut(p, n, o, c);
polygon_cut(p, n, c, d);
polygon_cut(p, n, d, o);
double res = area(p, n);
if (s1 * s2 == -1) res = -res;
return res;
}
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
int n2) {
if (area(ps1, n1) < 0) reverse1(ps1, n1);
if (area(ps2, n2) < 0) reverse1(ps2, n2);
ps1[n1] = ps1[0];
ps2[n2] = ps2[0];
double res = 0;
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
}
}
return res;
}
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q) {
Point ps1[MAXN], ps2[MAXN];
Point convex[MAXN];
for (int i = 0; i < 9; i++) {
convex[i].x = (double)p[i * 2];
convex[i].y = (double)p[i * 2 + 1];
}
int n_convex = 9;
int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
Jarvis_and_index(convex, n_convex, points_to_convex_ind);
int n1 = n_convex;
for (int i = 0; i < n1; i++) {
ps1[i].x = (double)convex[i].x;
ps1[i].y = (double)convex[i].y;
}
int n2 = 4;
for (int i = 0; i < n2; i++) {
ps2[i].x = (double)q[i * 2];
ps2[i].y = (double)q[i * 2 + 1];
}
double inter_area = intersectAreaO(ps1, n1, ps2, n2);
double S_pred = area(ps1, n1);
double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
double iou = inter_area / union_area;
return (float)iou;
}
template <typename T>
__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
const int gt_n_boxes, const T* ex_boxes,
const T* gt_boxes, T* iou) {
CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
const T* cur_box = ex_boxes + index * 18;
for (int i = 0; i < gt_n_boxes; i++) {
iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);
}
}
}
#endif // CONVEX_IOU_CUDA_KERNEL_CUH
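Host-side unpacking sketch for convex_giou_cuda_kernel's output (function and buffer names are hypothetical): each of the ex_n_boxes rows of point_grad holds 18 gradient entries (x/y for the 9 predicted points) followed by the GIoU value the kernel writes at offset 18.

void unpack_convex_giou(const float* point_grad_host, int ex_n_boxes,
                        float* grads /* ex_n_boxes x 18 */, float* gious) {
  for (int i = 0; i < ex_n_boxes; ++i) {
    const float* row = point_grad_host + i * 19;
    for (int j = 0; j < 18; ++j) grads[i * 18 + j] = row[j];
    gious[i] = row[18];  // written by cur_grad[18] = giou in the kernel
  }
}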
@@ -29,8 +29,8 @@ using namespace torch;
 #define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
 #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
-#define THREADS_FORWARD 32
-#define THREADS_BACKWARD 16
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
 template <typename scalar_t>
 __global__ void correlation_forward_cuda_kernel(
@@ -42,8 +42,8 @@ __global__ void correlation_forward_cuda_kernel(
   const int C = rInput1.size(3);
   const int n = blockIdx.x;
-  const int h = blockIdx.y;
-  const int w = blockIdx.z;
+  const int h = blockIdx.y * blockDim.y + threadIdx.y;
+  const int w = blockIdx.z * blockDim.z + threadIdx.z;
   const int thread = threadIdx.x;
   const int start_i = -padH + h * dH;
@@ -52,13 +52,11 @@ __global__ void correlation_forward_cuda_kernel(
   const int patchRadH = dilation_patchH * (patchH - 1) / 2;
   const int patchRadW = dilation_patchW * (patchW - 1) / 2;
-  __shared__ scalar_t prod_sum[THREADS_FORWARD];
-
   for (int ph = 0; ph < patchH; ++ph) {
     int ph_dilated = ph * dilation_patchH - patchRadH;
     for (int pw = 0; pw < patchW; ++pw) {
       int pw_dilated = pw * dilation_patchW - patchRadW;
-      prod_sum[thread] = 0;
+      scalar_t prod_sum = 0.0f;
       for (int i = 0; i < kH; ++i) {
         int i1 = start_i + i * dilationH;
         int i2 = i1 + ph_dilated;
@@ -69,23 +67,20 @@ __global__ void correlation_forward_cuda_kernel(
             int j2 = j1 + pw_dilated;
             if WITHIN_BOUNDS(j1, j2, iW, iW) {
-              for (int c = thread; c < C; c += THREADS_FORWARD) {
+              for (int c = thread; c < C; c += WARP_SIZE) {
                 scalar_t v1 = rInput1[n][i1][j1][c];
                 scalar_t v2 = rInput2[n][i2][j2][c];
-                prod_sum[thread] += v1 * v2;
+                prod_sum += v1 * v2;
               }
             }
           }
        }
      }
      // accumulate
-      __syncthreads();
-      if (thread == 0) {
-        scalar_t reduce_sum = 0;
-        for (int index = 0; index < THREADS_FORWARD; ++index) {
-          reduce_sum += prod_sum[index];
-        }
-        output[n][ph][pw][h][w] = reduce_sum;
-      }
+      for (int offset = 16; offset > 0; offset /= 2)
+        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
+      if (thread == 0) {
+        output[n][ph][pw][h][w] = prod_sum;
+      }
    }
  }
 }
@@ -97,9 +92,10 @@ __global__ void correlation_backward_cuda_kernel_input1(
     TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
     const int patchW, const int padH, const int padW, const int dilationH,
     const int dilationW, const int dilation_patchH, const int dilation_patchW,
-    const int dH, const int dW, const int batch) {
-  const int iH = input2.size(2);
-  const int iW = input2.size(3);
+    const int dH, const int dW) {
+  const int iH = input2.size(1);
+  const int iW = input2.size(2);
+  const int C = input2.size(3);
   const int H = grad_output.size(3);
   const int W = grad_output.size(4);
@@ -107,54 +103,53 @@ __global__ void correlation_backward_cuda_kernel_input1(
   const int patchRadH = (patchH - 1) / 2;
   const int patchRadW = (patchW - 1) / 2;
-  const int n = batch;
-  const int c = blockIdx.x;
-  const int h = blockIdx.y;
-  const int w = blockIdx.z;
-  const int ph_off = threadIdx.x;
-  const int pw_off = threadIdx.y;
+  const int n = blockIdx.x;
+  const int h = blockIdx.y;
+  const int w = blockIdx.z;
   const int h_2 = h + padH;
   const int w_2 = w + padW;
   const int min_h = h_2 - kH * dilationH;
   const int min_w = w_2 - kW * dilationW;
-  __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
-  prod_sum[ph_off][pw_off] = 0;
-
-  for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
-    int i1 = h + dilation_patchH * (ph - patchRadH);
-    for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
-      int j1 = w + dilation_patchW * (pw - patchRadW);
-      if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
-        scalar_t val = input2[n][c][i1][j1];
-        for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
-          int i2 = (h_3) / dH;
-          if (i2 * dH != h_3) continue;
-          for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
-            int j2 = (w_3) / dW;
-            if (j2 * dW != w_3) continue;
-            if WITHIN_BOUNDS(i2, j2, H, W) {
-              prod_sum[ph_off][pw_off] +=
-                  grad_output[n][ph][pw][i2][j2] * val;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  __syncthreads();
-
-  if (ph_off == 0 && pw_off == 0) {
-    scalar_t reduce_sum = 0;
-    for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
-      for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
-        reduce_sum += prod_sum[ph][pw];
-      }
-    }
-    grad_input1[n][c][h][w] = reduce_sum;
-  }
-}
+  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+    const int ph = i / patchW;
+    const int pw = i % patchW;
+    int i1 = h + dilation_patchH * (ph - patchRadH);
+    int j1 = w + dilation_patchW * (pw - patchRadW);
+
+    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+      scalar_t grad_val = 0.0f;
+      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+        int i2 = (h_3) / dH;
+        if (i2 * dH != h_3) continue;
+        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+          int j2 = (w_3) / dW;
+          if (j2 * dW != w_3) continue;
+          if (WITHIN_BOUNDS(i2, j2, H, W)) {
+            grad_val += grad_output[n][ph][pw][i2][j2];
+          }
+        }
+      }
+      grad_cache[i] = grad_val;
+    }
+  }
+  __syncthreads();
+
+  for (int c = threadIdx.x; c < C; c += blockDim.x) {
+    scalar_t grad_input_val = 0.0f;
+    for (int ph = 0; ph < patchH; ++ph) {
+      int i1 = h + dilation_patchH * (ph - patchRadH);
+      for (int pw = 0; pw < patchW; ++pw) {
+        int j1 = w + dilation_patchW * (pw - patchRadW);
+        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+          grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+        }
+      }
+    }
+    grad_input1[n][c][h][w] = grad_input_val;
+  }
+}
@@ -163,9 +158,10 @@ __global__ void correlation_backward_cuda_kernel_input2(
     const TensorAcc5R grad_output, const TensorAcc4R input1,
     TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
     int padW, int dilationH, int dilationW, int dilation_patchH,
-    int dilation_patchW, int dH, int dW, int batch) {
-  const int iH = input1.size(2);
-  const int iW = input1.size(3);
+    int dilation_patchW, int dH, int dW) {
+  const int iH = input1.size(1);
+  const int iW = input1.size(2);
+  const int C = input1.size(3);
   const int patchRadH = (patchH - 1) / 2;
   const int patchRadW = (patchW - 1) / 2;
@@ -176,56 +172,54 @@ __global__ void correlation_backward_cuda_kernel_input2(
   const int dilatedKH = kH * dilationH;
   const int dilatedKW = kW * dilationW;
-  const int n = batch;
-  const int c = blockIdx.x;
-  const int h = blockIdx.y;
-  const int w = blockIdx.z;
-  const int ph_off = threadIdx.x;
-  const int pw_off = threadIdx.y;
-
-  __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
-  prod_sum[ph_off][pw_off] = 0;
-
-  for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
-    int i1 = h - dilation_patchH * (ph - patchRadH);
-    for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
-      int j1 = w - dilation_patchW * (pw - patchRadW);
-      if WITHIN_BOUNDS(i1, j1, iH, iW) {
-        scalar_t val = input1[n][c][i1][j1];
-        const int h_2 = i1 + padH;
-        const int w_2 = j1 + padW;
-        const int min_h = h_2 - dilatedKH;
-        const int min_w = w_2 - dilatedKW;
-        for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
-          int i2 = (h_3) / dH;
-          if (i2 * dH != h_3) continue;
-          for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
-            int j2 = (w_3) / dW;
-            if (j2 * dW != w_3) continue;
-            if WITHIN_BOUNDS(i2, j2, H, W) {
-              prod_sum[ph_off][pw_off] +=
-                  grad_output[n][ph][pw][i2][j2] * val;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  __syncthreads();
-
-  if (ph_off == 0 && pw_off == 0) {
-    scalar_t reduce_sum = 0;
-    for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
-      for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
-        reduce_sum += prod_sum[ph][pw];
-      }
-    }
-    grad_input2[n][c][h][w] = reduce_sum;
-  }
-}
+  const int n = blockIdx.x;
+  const int h = blockIdx.y;
+  const int w = blockIdx.z;
+
+  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+    const int ph = i / patchW;
+    const int pw = i % patchW;
+    int i1 = h - dilation_patchH * (ph - patchRadH);
+    int j1 = w - dilation_patchW * (pw - patchRadW);
+
+    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+      scalar_t grad_val = 0.0f;
+      const int h_2 = i1 + padH;
+      const int w_2 = j1 + padW;
+      const int min_h = h_2 - dilatedKH;
+      const int min_w = w_2 - dilatedKW;
+      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+        int i2 = (h_3) / dH;
+        if (i2 * dH != h_3) continue;
+        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+          int j2 = (w_3) / dW;
+          if (j2 * dW != w_3) continue;
+          if (WITHIN_BOUNDS(i2, j2, H, W)) {
+            grad_val += grad_output[n][ph][pw][i2][j2];
+          }
+        }
+      }
+      grad_cache[i] = grad_val;
+    }
+  }
+  __syncthreads();
+
+  for (int c = threadIdx.x; c < C; c += blockDim.x) {
+    scalar_t grad_input_val = 0.0f;
+    for (int ph = 0; ph < patchH; ++ph) {
+      int i1 = h - dilation_patchH * (ph - patchRadH);
+      for (int pw = 0; pw < patchW; ++pw) {
+        int j1 = w - dilation_patchW * (pw - patchRadW);
+        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+          grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+        }
+      }
    }
+    grad_input2[n][c][h][w] = grad_input_val;
+  }
+}
 #endif
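The rewritten forward kernel replaces the shared-memory tree reduction with warp shuffles. A standalone sketch of the same pattern, using the WARP_SIZE and FULL_MASK macros introduced above (this helper is an illustration, not code from the diff):

__inline__ __device__ float warp_reduce_sum(float val) {
  // after log2(WARP_SIZE) steps, lane 0 holds the sum over the whole warp
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(FULL_MASK, val, offset);
  return val;
}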
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAX_NUM_VERT_IDX 9
#define INTERSECTION_OFFSET 8
#define EPSILON 1e-8
inline int opt_n_thread(int work_size) {
const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
return max(min(1 << pow_2, THREADS_PER_BLOCK), 1);
}
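Worked examples for opt_n_thread (assuming THREADS_PER_BLOCK is 512, as defined in the common helper): it rounds the work size down to a power of two and clamps the result to [1, THREADS_PER_BLOCK].

#include <cassert>
void opt_n_thread_examples() {
  assert(opt_n_thread(1) == 1);      // 1 << 0
  assert(opt_n_thread(100) == 64);   // log2(100) ~= 6.64 -> 1 << 6
  assert(opt_n_thread(5000) == 512); // 1 << 12 = 4096, capped at 512
}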
/*
compare normalized vertices (vertices around (0,0))
if vertex1 < vertex2 return true.
order: smallest at the positive x-axis, increasing in the anti-clockwise
direction
*/
__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {
if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON)
return false; // if equal, return false
if (y1 > 0 && y2 < 0) return true;
if (y1 < 0 && y2 > 0) return false;
float n1 = x1 * x1 + y1 * y1 + EPSILON;
float n2 = x2 * x2 + y2 * y2 + EPSILON;
float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2;
if (y1 > 0 && y2 > 0) {
if (diff > EPSILON)
return true;
else
return false;
}
if (y1 < 0 && y2 < 0) {
if (diff < EPSILON)
return true;
else
return false;
}
// fall-through case (y1 or y2 is exactly 0): return a defined value instead
// of running off the end of a non-void function
return false;
}
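For intuition, a host-side reference of the ordering compare_vertices implements (an illustration under the comment's stated convention, not code from this file): sort by polar angle about the origin, starting at the positive x-axis and increasing anti-clockwise. The device version avoids atan2 by comparing the monotone surrogate x*|x|/(x*x + y*y) after separating the upper and lower half-planes.

#include <cmath>
static bool compare_vertices_reference(float x1, float y1, float x2, float y2) {
  float a1 = std::atan2(y1, x1), a2 = std::atan2(y2, x2);
  if (a1 < 0) a1 += 2.0f * static_cast<float>(M_PI);  // map to [0, 2*pi)
  if (a2 < 0) a2 += 2.0f * static_cast<float>(M_PI);
  return a1 < a2;
}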
__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel(
int b, int n, int m, const float *__restrict__ vertices,
const bool *__restrict__ mask, const int *__restrict__ num_valid,
int *__restrict__ idx) {
int batch_idx = blockIdx.x;
vertices += batch_idx * n * m * 2;
mask += batch_idx * n * m;
num_valid += batch_idx * n;
idx += batch_idx * n * MAX_NUM_VERT_IDX;
int index = threadIdx.x; // index of polygon
int stride = blockDim.x;
for (int i = index; i < n; i += stride) {
int pad; // index of arbitrary invalid intersection point (not box corner!)
for (int j = INTERSECTION_OFFSET; j < m; ++j) {
if (!mask[i * m + j]) {
pad = j;
break;
}
}
if (num_valid[i] < 3) {
// not enough vertices, take an invalid intersection point
// (zero padding)
for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {
idx[i * MAX_NUM_VERT_IDX + j] = pad;
}
} else {
// sort the valid vertices
// note the number of valid vertices is known
// note: check that num_valid[i] < MAX_NUM_VERT_IDX
for (int j = 0; j < num_valid[i]; ++j) {
// initialize with a "big" value
float x_min = 1;
float y_min = -EPSILON;
int i_take = 0;
int i2;
float x2, y2;
if (j != 0) {
i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];
x2 = vertices[i * m * 2 + i2 * 2 + 0];
y2 = vertices[i * m * 2 + i2 * 2 + 1];
}
for (int k = 0; k < m; ++k) {
float x = vertices[i * m * 2 + k * 2 + 0];
float y = vertices[i * m * 2 + k * 2 + 1];
if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {
if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) {
x_min = x;
y_min = y;
i_take = k;
}
}
}
idx[i * MAX_NUM_VERT_IDX + j] = i_take;
}
// duplicate the first idx
idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0];
// pad zeros
for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) {
idx[i * MAX_NUM_VERT_IDX + j] = pad;
}
// corner case: the two boxes are exactly the same. idx would then contain
// duplicate elements, which breaks the shoelace formula. By definition the
// duplicates can only appear in the first 8 positions (they are "corners in
// box", not "intersection of edges")
if (num_valid[i] == 8) {
int counter = 0;
for (int j = 0; j < 4; ++j) {
int check = idx[i * MAX_NUM_VERT_IDX + j];
for (int k = 4; k < INTERSECTION_OFFSET; ++k) {
if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;
}
}
if (counter == 4) {
idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];
for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {
idx[i * MAX_NUM_VERT_IDX + j] = pad;
}
}
}
// TODO: still might need to cover some other corner cases :(
}
}
}
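A plausible launch for the kernel above (sketch only; the actual launcher lives in the .cu file and is not shown in this diff): one block per batch element, with opt_n_thread(n) threads striding over the n polygons.

diff_iou_rotated_sort_vertices_forward_cuda_kernel
    <<<b, opt_n_thread(n), 0, stream>>>(b, n, m, vertices, mask, num_valid, idx);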
@@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
-
-  out += bs_idx * c * m + c_idx * m + pt_idx;
-  idx += bs_idx * m + pt_idx;
-  points += bs_idx * c * n + c_idx * n;
-
-  out[0] = points[idx[0]];
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    out += bs_idx * c * m + c_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
+    points += bs_idx * c * n + c_idx * n;
+
+    out[0] = points[idx[0]];
+  }
 }
 
 template <typename T>
@@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
-
-  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
-  idx += bs_idx * m + pt_idx;
-  grad_points += bs_idx * c * n + c_idx * n;
-
-  atomicAdd(grad_points + idx[0], grad_out[0]);
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
+    grad_points += bs_idx * c * n + c_idx * n;
+
+    atomicAdd(grad_points + idx[0], grad_out[0]);
+  }
 }
 
 #endif // GATHER_POINTS_CUDA_KERNEL_CUH
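With the grid-stride change above, the launcher no longer needs one x-block per point. A sketch of the presumed launch shape (the kernel's full parameter list sits outside this hunk, so the argument order here is an assumption):

dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), c, b);  // x now strides over m
gather_points_forward_cuda_kernel<float>
    <<<blocks, THREADS_PER_BLOCK, 0, stream>>>(b, c, n, m, points, idx, out);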
@@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n,
   // out: (B, C, npoints, nsample)
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int pt_idx = index / nsample;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+    if (bs_idx >= b || c_idx >= c) return;
 
-  int sample_idx = index % nsample;
+    int pt_idx = index / nsample;
+    int sample_idx = index % nsample;
 
-  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
-  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
-  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
-                pt_idx * nsample + sample_idx;
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+                  pt_idx * nsample + sample_idx;
 
-  out[out_idx] = points[in_idx];
+    out[out_idx] = points[in_idx];
+  }
 }
 
 template <typename T>
@@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n,
   // grad_points: (B, C, N)
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int pt_idx = index / nsample;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+    int pt_idx = index / nsample;
+    if (bs_idx >= b || c_idx >= c) return;
 
-  int sample_idx = index % nsample;
-  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
-              pt_idx * nsample + sample_idx;
-  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+    int sample_idx = index % nsample;
+    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+                pt_idx * nsample + sample_idx;
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
 
-  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+  }
 }
 
 #endif // GROUP_POINTS_CUDA_KERNEL_CUH
...@@ -50,21 +50,17 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2, ...@@ -50,21 +50,17 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2,
} }
__device__ inline int check_in_box2d(const float *box, const Point &p) { __device__ inline int check_in_box2d(const float *box, const Point &p) {
// params: box (5) [x1, y1, x2, y2, angle] // params: box (7) [x, y, z, dx, dy, dz, heading]
const float MARGIN = 1e-5; const float MARGIN = 1e-2;
float center_x = (box[0] + box[2]) / 2; float center_x = box[0], center_y = box[1];
float center_y = (box[1] + box[3]) / 2; // rotate the point in the opposite direction of box
float angle_cos = cos(-box[4]), float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);
angle_sin = float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
sin(-box[4]); // rotate the point in the opposite direction of box float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
float rot_x =
(p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x; return (fabs(rot_x) < box[3] / 2 + MARGIN &&
float rot_y = fabs(rot_y) < box[4] / 2 + MARGIN);
(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
} }
__device__ inline int intersection(const Point &p1, const Point &p0, __device__ inline int intersection(const Point &p1, const Point &p0,
...@@ -116,16 +112,19 @@ __device__ inline int point_cmp(const Point &a, const Point &b, ...@@ -116,16 +112,19 @@ __device__ inline int point_cmp(const Point &a, const Point &b,
} }
__device__ inline float box_overlap(const float *box_a, const float *box_b) { __device__ inline float box_overlap(const float *box_a, const float *box_b) {
// params: box_a (5) [x1, y1, x2, y2, angle] // params box_a: [x, y, z, dx, dy, dz, heading]
// params: box_b (5) [x1, y1, x2, y2, angle] // params box_b: [x, y, z, dx, dy, dz, heading]
float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], float a_angle = box_a[6], b_angle = box_b[6];
a_angle = box_a[4]; float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
b_angle = box_b[4]; float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); Point center_a(box_a[0], box_a[1]);
Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); Point center_b(box_b[0], box_b[1]);
Point box_a_corners[5]; Point box_a_corners[5];
box_a_corners[0].set(a_x1, a_y1); box_a_corners[0].set(a_x1, a_y1);
...@@ -209,10 +208,10 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) { ...@@ -209,10 +208,10 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
} }
__device__ inline float iou_bev(const float *box_a, const float *box_b) { __device__ inline float iou_bev(const float *box_a, const float *box_b) {
// params: box_a (5) [x1, y1, x2, y2, angle] // params box_a: [x, y, z, dx, dy, dz, heading]
// params: box_b (5) [x1, y1, x2, y2, angle] // params box_b: [x, y, z, dx, dy, dz, heading]
float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]); float sa = box_a[3] * box_a[4];
float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); float sb = box_b[3] * box_b[4];
float s_overlap = box_overlap(box_a, box_b); float s_overlap = box_overlap(box_a, box_b);
return s_overlap / fmaxf(sa + sb - s_overlap, EPS); return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
} }
...@@ -220,149 +219,148 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) { ...@@ -220,149 +219,148 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) {
__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( __global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(
const int num_a, const float *boxes_a, const int num_b, const int num_a, const float *boxes_a, const int num_b,
const float *boxes_b, float *ans_overlap) { const float *boxes_b, float *ans_overlap) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
if (a_idx >= num_a || b_idx >= num_b) { if (a_idx >= num_a || b_idx >= num_b) {
return; return;
} }
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float s_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = s_overlap;
}
__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a,
const float *boxes_a,
const int num_b,
const float *boxes_b,
float *ans_iou) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
if (a_idx >= num_a || b_idx >= num_b) { const float *cur_box_a = boxes_a + a_idx * 7;
return; const float *cur_box_b = boxes_b + b_idx * 7;
float cur_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = cur_overlap;
} }
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
} }
__global__ void nms_forward_cuda_kernel(const int boxes_num, __global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num,
const float nms_overlap_thresh, const float nms_overlap_thresh,
const float *boxes, const float *boxes,
unsigned long long *mask) { unsigned long long *mask) {
// params: boxes (N, 5) [x1, y1, x2, y2, ry] // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS) // params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 7 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
block_boxes[threadIdx.x * 7 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
block_boxes[threadIdx.x * 7 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
block_boxes[threadIdx.x * 7 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
block_boxes[threadIdx.x * 7 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
block_boxes[threadIdx.x * 7 + 5] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
block_boxes[threadIdx.x * 7 + 6] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
}
__syncthreads();
const int row_start = blockIdx.y; if (threadIdx.x < row_size) {
const int col_start = blockIdx.x; const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 7;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 5;
int i = 0; int i = 0;
unsigned long long t = 0; unsigned long long t = 0;
int start = 0; int start = 0;
if (row_start == col_start) { if (row_start == col_start) {
start = threadIdx.x + 1; start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
} }
for (i = start; i < col_size; i++) {
if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
mask[cur_box_idx * col_blocks + col_start] = t;
} }
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
} }
} }
__device__ inline float iou_normal(float const *const a, float const *const b) { __device__ inline float iou_normal(float const *const a, float const *const b) {
float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); // params: a: [x, y, z, dx, dy, dz, heading]
float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); // params: b: [x, y, z, dx, dy, dz, heading]
float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),
right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),
bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
float interS = width * height; float interS = width * height;
float Sa = (a[2] - a[0]) * (a[3] - a[1]); float Sa = a[3] * a[4];
float Sb = (b[2] - b[0]) * (b[3] - b[1]); float Sb = b[3] * b[4];
return interS / fmaxf(Sa + Sb - interS, EPS); return interS / fmaxf(Sa + Sb - interS, EPS);
} }
__global__ void iou3d_nms3d_normal_forward_cuda_kernel(
    const int boxes_num, const float nms_overlap_thresh, const float *boxes,
    unsigned long long *mask) {
  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
  const int blocks =
      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    // if (row_start > col_start) return;
    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);
    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
                               THREADS_PER_BLOCK_NMS);
    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
    if (threadIdx.x < col_size) {
      block_boxes[threadIdx.x * 7 + 0] =
          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
      block_boxes[threadIdx.x * 7 + 1] =
          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
      block_boxes[threadIdx.x * 7 + 2] =
          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
      block_boxes[threadIdx.x * 7 + 3] =
          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
      block_boxes[threadIdx.x * 7 + 4] =
          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
      block_boxes[threadIdx.x * 7 + 5] =
          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
      block_boxes[threadIdx.x * 7 + 6] =
          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
    }
    __syncthreads();
    if (threadIdx.x < row_size) {
      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
      const float *cur_box = boxes + cur_box_idx * 7;
      int i = 0;
      unsigned long long t = 0;
      int start = 0;
      if (row_start == col_start) {
        start = threadIdx.x + 1;
      }
      for (i = start; i < col_size; i++) {
        if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
          t |= 1ULL << i;
        }
      }
      const int col_blocks =
          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
      mask[cur_box_idx * col_blocks + col_start] = t;
    }
  }
}
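The kernel above only fills a pairwise suppression bitmask: box i owns bit inblock of word i * col_blocks + nblock. Selecting the surviving boxes is a separate greedy reduction over that mask. A minimal host-side sketch of that reduction, assuming the mask has been copied back to the host; the names (select_kept_boxes, mask_host) are illustrative and not part of mmcv's API:

#include <cstdint>
#include <vector>

std::vector<int> select_kept_boxes(const std::vector<uint64_t> &mask_host,
                                   int boxes_num, int threads_per_block) {
  const int col_blocks =
      (boxes_num + threads_per_block - 1) / threads_per_block;
  std::vector<uint64_t> removed(col_blocks, 0);  // suppression bitset
  std::vector<int> keep;
  for (int i = 0; i < boxes_num; i++) {
    const int nblock = i / threads_per_block;
    const int inblock = i % threads_per_block;
    if (!(removed[nblock] & (1ULL << inblock))) {
      keep.push_back(i);  // box i survives; suppress everything it overlaps
      const uint64_t *p = mask_host.data() + (size_t)i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) removed[j] |= p[j];
    }
  }
  return keep;
}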
...
@@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
                                        const T *xyz, const T *new_xyz,
                                        int *__restrict__ idx, T *dist2) {
  int bs_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
    if (bs_idx >= b) return;
    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;
    dist2 += bs_idx * m * nsample + pt_idx * nsample;
    T new_x = new_xyz[0];
    T new_y = new_xyz[1];
    T new_z = new_xyz[2];
    float best_dist[100];
    int best_idx[100];
    for (int i = 0; i < nsample; i++) {
      best_dist[i] = 1e10;
      best_idx[i] = 0;
    }
    for (int i = 0; i < n; i++) {
      T x = xyz[i * 3 + 0];
      T y = xyz[i * 3 + 1];
      T z = xyz[i * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      if (d2 < best_dist[0]) {
        best_dist[0] = d2;
        best_idx[0] = i;
        reheap(best_dist, best_idx, nsample);
      }
    }
    heap_sort(best_dist, best_idx, nsample);
    for (int i = 0; i < nsample; i++) {
      idx[i] = best_idx[i];
      dist2[i] = best_dist[i];
    }
  }
}
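Inside this kernel, best_dist is maintained as a max-heap keyed on distance, so best_dist[0] is always the worst of the current nsample candidates and is the slot a closer point evicts. reheap and heap_sort are mmcv helpers defined elsewhere; a minimal sketch of the sift-down that reheap is assumed to perform, under a hypothetical name:

__device__ void reheap_sketch(float *dist, int *idx, int k) {
  int root = 0;
  while (true) {
    int largest = root;
    int l = 2 * root + 1, r = 2 * root + 2;
    if (l < k && dist[l] > dist[largest]) largest = l;
    if (r < k && dist[r] > dist[largest]) largest = r;
    if (largest == root) break;
    // swap the root with its larger child and keep sifting down
    float td = dist[root];
    dist[root] = dist[largest];
    dist[largest] = td;
    int ti = idx[root];
    idx[root] = idx[largest];
    idx[largest] = ti;
    root = largest;
  }
}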
...
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAXN 20
__device__ const float PI = 3.1415926;
struct Point {
float x, y;
__device__ Point() {}
__device__ Point(float x, float y) : x(x), y(y) {}
};
__device__ inline void swap1(Point *a, Point *b) {
Point temp;
temp.x = a->x;
temp.y = a->y;
a->x = b->x;
a->y = b->y;
b->x = temp.x;
b->y = temp.y;
}
__device__ inline float cross(Point o, Point a, Point b) {
return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
}
__device__ inline float dis(Point a, Point b) {
return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
}
__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {
float convex_points[2][MAXN];
for (int j = 0; j < n_points; j++) {
convex_points[0][j] = ps[j].x;
}
for (int j = 0; j < n_points; j++) {
convex_points[1][j] = ps[j].y;
}
Point edges[MAXN];
float edges_angles[MAXN];
float unique_angles[MAXN];
int n_edges = n_points - 1;
int n_unique = 0;
int unique_flag = 0;
for (int i = 0; i < n_edges; i++) {
edges[i].x = ps[i + 1].x - ps[i].x;
edges[i].y = ps[i + 1].y - ps[i].y;
}
for (int i = 0; i < n_edges; i++) {
edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);
if (edges_angles[i] >= 0) {
edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);
} else {
edges_angles[i] =
edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);
}
}
unique_angles[0] = edges_angles[0];
n_unique += 1;
for (int i = 1; i < n_edges; i++) {
for (int j = 0; j < n_unique; j++) {
if (edges_angles[i] == unique_angles[j]) {
unique_flag += 1;
}
}
if (unique_flag == 0) {
unique_angles[n_unique] = edges_angles[i];
n_unique += 1;
unique_flag = 0;
} else {
unique_flag = 0;
}
}
float minarea = 1e12;
for (int i = 0; i < n_unique; i++) {
float R[2][2];
float rot_points[2][MAXN];
R[0][0] = cos(unique_angles[i]);
R[0][1] = sin(unique_angles[i]);
R[1][0] = -sin(unique_angles[i]);
R[1][1] = cos(unique_angles[i]);
// R x Points
for (int m = 0; m < 2; m++) {
for (int n = 0; n < n_points; n++) {
float sum = 0.0;
for (int k = 0; k < 2; k++) {
sum = sum + R[m][k] * convex_points[k][n];
}
rot_points[m][n] = sum;
}
}
    // xmin
float xmin, ymin, xmax, ymax;
xmin = 1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
continue;
} else {
if (rot_points[0][j] < xmin) {
xmin = rot_points[0][j];
}
}
}
// ymin
ymin = 1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
continue;
} else {
if (rot_points[1][j] < ymin) {
ymin = rot_points[1][j];
}
}
}
// xmax
xmax = -1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
continue;
} else {
if (rot_points[0][j] > xmax) {
xmax = rot_points[0][j];
}
}
}
// ymax
ymax = -1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
continue;
} else {
if (rot_points[1][j] > ymax) {
ymax = rot_points[1][j];
}
}
}
float area = (xmax - xmin) * (ymax - ymin);
if (area < minarea) {
minarea = area;
minbox[0] = unique_angles[i];
minbox[1] = xmin;
minbox[2] = ymin;
minbox[3] = xmax;
minbox[4] = ymax;
}
}
}
// convex_find
__device__ inline void Jarvis(Point *in_poly, int &n_poly) {
int n_input = n_poly;
Point input_poly[20];
for (int i = 0; i < n_input; i++) {
input_poly[i].x = in_poly[i].x;
input_poly[i].y = in_poly[i].y;
}
Point p_max, p_k;
int max_index, k_index;
int Stack[20], top1, top2;
// float sign;
double sign;
Point right_point[10], left_point[10];
for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        (in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x)) {
Point *j = &(in_poly[0]);
Point *k = &(in_poly[i]);
swap1(j, k);
}
if (i == 0) {
p_max = in_poly[0];
max_index = 0;
}
    if (in_poly[i].y > p_max.y ||
        (in_poly[i].y == p_max.y && in_poly[i].x > p_max.x)) {
p_max = in_poly[i];
max_index = i;
}
}
if (max_index == 0) {
max_index = 1;
p_max = in_poly[max_index];
}
k_index = 0, Stack[0] = 0, top1 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
dis(in_poly[Stack[top1]], p_k)))) {
p_k = in_poly[i];
k_index = i;
}
}
top1++;
Stack[top1] = k_index;
}
for (int i = 0; i <= top1; i++) {
right_point[i] = in_poly[Stack[i]];
}
k_index = 0, Stack[0] = 0, top2 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || ((sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                         dis(in_poly[Stack[top2]], p_k)))) {
p_k = in_poly[i];
k_index = i;
}
}
top2++;
Stack[top2] = k_index;
}
for (int i = top2 - 1; i >= 0; i--) {
left_point[i] = in_poly[Stack[i]];
}
for (int i = 0; i < top1 + top2; i++) {
if (i <= top1) {
in_poly[i] = right_point[i];
} else {
in_poly[i] = left_point[top2 - (i - top1)];
}
}
n_poly = top1 + top2;
}
template <typename T>
__device__ inline void Findminbox(T const *const p, T *minpoints) {
Point ps1[MAXN];
Point convex[MAXN];
for (int i = 0; i < 9; i++) {
convex[i].x = p[i * 2];
convex[i].y = p[i * 2 + 1];
}
int n_convex = 9;
Jarvis(convex, n_convex);
int n1 = n_convex;
for (int i = 0; i < n1; i++) {
ps1[i].x = convex[i].x;
ps1[i].y = convex[i].y;
}
ps1[n1].x = convex[0].x;
ps1[n1].y = convex[0].y;
float minbbox[5] = {0};
minBoundingRect(ps1, n1 + 1, minbbox);
float angle = minbbox[0];
float xmin = minbbox[1];
float ymin = minbbox[2];
float xmax = minbbox[3];
float ymax = minbbox[4];
float R[2][2];
R[0][0] = cos(angle);
R[0][1] = sin(angle);
R[1][0] = -sin(angle);
R[1][1] = cos(angle);
minpoints[0] = xmax * R[0][0] + ymin * R[1][0];
minpoints[1] = xmax * R[0][1] + ymin * R[1][1];
minpoints[2] = xmin * R[0][0] + ymin * R[1][0];
minpoints[3] = xmin * R[0][1] + ymin * R[1][1];
minpoints[4] = xmin * R[0][0] + ymax * R[1][0];
minpoints[5] = xmin * R[0][1] + ymax * R[1][1];
minpoints[6] = xmax * R[0][0] + ymax * R[1][0];
minpoints[7] = xmax * R[0][1] + ymax * R[1][1];
}
template <typename T>
__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes,
const T *ex_boxes, T *minbox) {
CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
const T *cur_box = ex_boxes + index * 18;
T *cur_min_box = minbox + index * 8;
Findminbox(cur_box, cur_min_box);
}
}
#endif // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
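A hypothetical launch of the kernel above: each polygon supplies 18 floats (9 points) in ex_boxes and yields 8 floats (4 rectangle corners) in minbox. The 256-thread block size is illustrative only; mmcv actually dispatches through its own launch helpers.

void min_area_polygons_launch(const float *ex_boxes, float *minbox,
                              int ex_n_boxes, cudaStream_t stream) {
  const int threads = 256;  // illustrative block size
  const int blocks = (ex_n_boxes + threads - 1) / threads;
  min_area_polygons_cuda_kernel<float>
      <<<blocks, threads, 0, stream>>>(ex_n_boxes, ex_boxes, minbox);
}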
@@ -14,11 +14,6 @@
 #include "common_cuda_helper.hpp"
 #include "pytorch_cuda_helper.hpp"
-const int CUDA_NUM_THREADS = 1024;
-inline int GET_BLOCKS(const int N, const int num_threads) {
-  return (N + num_threads - 1) / num_threads;
-}
 template <typename scalar_t>
 __device__ scalar_t ms_deform_attn_im2col_bilinear(
     const scalar_t *&bottom_data, const int &height, const int &width,
@@ -267,10 +262,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
@@ -285,11 +281,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
    for (int l_col = 0; l_col < num_levels; ++l_col) {
@@ -326,23 +322,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
            _grad_h = cache_grad_sampling_loc[1],
            _grad_a = cache_grad_attn_weight[0];
        int sid = 2;
        for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {
          _grad_w += cache_grad_sampling_loc[sid];
          _grad_h += cache_grad_sampling_loc[sid + 1];
          _grad_a += cache_grad_attn_weight[_tid];
          sid += 2;
        }
        *grad_sampling_loc_out = _grad_w;
        *(grad_sampling_loc_out + 1) = _grad_h;
        *grad_attn_weight_out = _grad_a;
      }
      __syncthreads();
      data_weight_ptr += 1;
      data_loc_w_ptr += 2;
      grad_attn_weight_out += grad_weight_stride;
      grad_sampling_loc_out += grad_loc_stride;
    }
  }
}
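The change repeated across these col2im hunks is that the kernels no longer advance grad_sampling_loc and grad_attn_weight (the kernel arguments) as running cursors; with CUDA_1D_KERNEL_LOOP a thread can execute the body for several index values, so each iteration must rebuild its output pointers from the untouched base arguments. A minimal sketch of the hazard, with hypothetical names:

__global__ void cursor_hazard(float *out, int n) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n;
       index += blockDim.x * gridDim.x) {
    // BUGGY variant: out += index;  // the next grid-stride iteration would
    //                               // start from a stale, mutated base
    float *out_cursor = out + index;  // safe: rebuilt from the base pointer
    *out_cursor = 0.f;
  }
}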
@@ -357,10 +353,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
@@ -375,8 +371,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
@@ -425,16 +422,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
      }
      if (tid == 0) {
        *grad_sampling_loc_out = cache_grad_sampling_loc[0];
        *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
        *grad_attn_weight_out = cache_grad_attn_weight[0];
      }
      __syncthreads();
      data_weight_ptr += 1;
      data_loc_w_ptr += 2;
      grad_attn_weight_out += grad_weight_stride;
      grad_sampling_loc_out += grad_loc_stride;
    }
  }
}
@@ -449,11 +446,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    extern __shared__ int _s[];
    scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
    scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
@@ -468,8 +465,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
@@ -509,23 +507,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
            _grad_h = cache_grad_sampling_loc[1],
            _grad_a = cache_grad_attn_weight[0];
        int sid = 2;
        for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {
          _grad_w += cache_grad_sampling_loc[sid];
          _grad_h += cache_grad_sampling_loc[sid + 1];
          _grad_a += cache_grad_attn_weight[_tid];
          sid += 2;
        }
        *grad_sampling_loc_out = _grad_w;
        *(grad_sampling_loc_out + 1) = _grad_h;
        *grad_attn_weight_out = _grad_a;
      }
      __syncthreads();
      data_weight_ptr += 1;
      data_loc_w_ptr += 2;
      grad_attn_weight_out += grad_weight_stride;
      grad_sampling_loc_out += grad_loc_stride;
    }
  }
}
@@ -540,11 +538,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    extern __shared__ int _s[];
    scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
    scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
@@ -559,8 +557,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
@@ -618,16 +617,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
      }
      if (tid == 0) {
        *grad_sampling_loc_out = cache_grad_sampling_loc[0];
        *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
        *grad_attn_weight_out = cache_grad_attn_weight[0];
      }
      __syncthreads();
      data_weight_ptr += 1;
      data_loc_w_ptr += 2;
      grad_attn_weight_out += grad_weight_stride;
      grad_sampling_loc_out += grad_loc_stride;
    }
  }
}
@@ -642,11 +641,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
    const int channels, const int num_levels, const int num_query,
    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
    scalar_t *grad_attn_weight) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    extern __shared__ int _s[];
    scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
    scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    unsigned int tid = threadIdx.x;
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
@@ -661,8 +660,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
@@ -720,16 +720,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
      }
      if (tid == 0) {
        atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);
        atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);
        atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);
      }
      __syncthreads();
      data_weight_ptr += 1;
      data_loc_w_ptr += 2;
      grad_attn_weight_out += grad_weight_stride;
      grad_sampling_loc_out += grad_loc_stride;
    }
  }
}
@@ -759,8 +759,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int grad_sampling_ptr = data_weight_ptr;
    scalar_t *grad_sampling_loc_out =
        grad_sampling_loc + (grad_sampling_ptr << 1);
    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
@@ -787,12 +788,12 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
        ms_deform_attn_col2im_bilinear_gm(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
            w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
            grad_sampling_loc_out, grad_attn_weight_out);
      }
      data_weight_ptr += 1;
      data_loc_w_ptr += 2;
      grad_attn_weight_out += grad_weight_stride;
      grad_sampling_loc_out += grad_loc_stride;
    }
  }
}
...
@@ -30,45 +30,88 @@ __device__ inline bool devIoU(float const *const a, float const *const b,
__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
                         const int offset, const float *dev_boxes,
                         unsigned long long *dev_mask) {
  int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
    const int tid = threadIdx.x;
    if (row_start > col_start) return;
    const int row_size =
        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
    const int col_size =
        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
    __shared__ float block_boxes[threadsPerBlock * 4];
    if (tid < col_size) {
      block_boxes[tid * 4 + 0] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
      block_boxes[tid * 4 + 1] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
      block_boxes[tid * 4 + 2] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
      block_boxes[tid * 4 + 3] =
          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
    }
    __syncthreads();
    if (tid < row_size) {
      const int cur_box_idx = threadsPerBlock * row_start + tid;
      const float *cur_box = dev_boxes + cur_box_idx * 4;
      int i = 0;
      unsigned long long int t = 0;
      int start = 0;
      if (row_start == col_start) {
        start = tid + 1;
      }
      for (i = start; i < col_size; i++) {
        if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
          t |= 1ULL << i;
        }
      }
      dev_mask[cur_box_idx * gridDim.y + col_start] = t;
    }
  }
}

__global__ void gather_keep_from_mask(bool *keep,
                                      const unsigned long long *dev_mask,
                                      const int n_boxes) {
  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  const int tid = threadIdx.x;
  // mark the bboxes which have been removed.
  extern __shared__ unsigned long long removed[];
  // initialize removed.
  for (int i = tid; i < col_blocks; i += blockDim.x) {
    removed[i] = 0;
  }
  __syncthreads();
  for (int nblock = 0; nblock < col_blocks; ++nblock) {
    auto removed_val = removed[nblock];
    __syncthreads();
    const int i_offset = nblock * threadsPerBlock;
#pragma unroll
    for (int inblock = 0; inblock < threadsPerBlock; ++inblock) {
      const int i = i_offset + inblock;
      if (i >= n_boxes) break;
      // select a candidate and check whether it should be kept.
      if (!(removed_val & (1ULL << inblock))) {
        if (tid == 0) {
          // mark the output.
          keep[i] = true;
        }
        auto p = dev_mask + i * col_blocks;
        // remove all bboxes which overlap the candidate.
        for (int j = tid; j < col_blocks; j += blockDim.x) {
          if (j >= nblock) removed[j] |= p[j];
        }
        __syncthreads();
        removed_val = removed[nblock];
      }
    }
  }
}
#endif  // NMS_CUDA_KERNEL_CUH
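gather_keep_from_mask is launched as a single cooperative block that walks the candidates in their presorted score order. A hypothetical launch, under two stated assumptions: keep must be zero-initialized beforehand (the kernel only ever sets entries to true), and the dynamic shared-memory size must cover one col_blocks-word removal bitset:

void gather_keep_launch(bool *keep_dev, const unsigned long long *mask_dev,
                        int n_boxes, cudaStream_t stream) {
  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
  // keep_dev is assumed zeroed already, e.g. with cudaMemsetAsync.
  gather_keep_from_mask<<<1, threadsPerBlock,
                          col_blocks * sizeof(unsigned long long), stream>>>(
      keep_dev, mask_dev, n_boxes);
}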
@@ -43,18 +43,16 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes,
  // (x_center, y_center, width, height, angle_degrees) here.
  __shared__ T block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
  }
  __syncthreads();
@@ -71,7 +69,7 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes,
      // Instead of devIoU used by original horizontal nms, here
      // we use the single_box_iou_rotated function from
      // box_iou_rotated_utils.h
      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >
          iou_threshold) {
        t |= 1ULL << i;
      }
...
@@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel(
  // (B, npoints), default -1
  int bs_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (bs_idx >= batch_size) return;
    boxes += bs_idx * boxes_num * 7;
    pts += bs_idx * pts_num * 3 + pt_idx * 3;
    box_idx_of_points += bs_idx * pts_num + pt_idx;
    T local_x = 0, local_y = 0;
    int cur_in_flag = 0;
    for (int k = 0; k < boxes_num; k++) {
      cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        box_idx_of_points[0] = k;
        break;
      }
    }
  }
}
@@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel(
  // (B, npoints), default -1
  int bs_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (bs_idx >= batch_size) return;
    boxes += bs_idx * boxes_num * 7;
    pts += bs_idx * pts_num * 3 + pt_idx * 3;
    box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
    T local_x = 0, local_y = 0;
    for (int k = 0; k < boxes_num; k++) {
      const int cur_in_flag =
          check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        box_idx_of_points[k] = 1;
      }
    }
  }
}
...
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
struct point {
float x, y;
};
template <typename scalar_t>
__global__ void points_in_polygons_forward_cuda_kernel(
const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,
const int rows, const int cols, scalar_t *inside_flag) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int row = index / cols;
int col = index % cols;
const scalar_t *offset_vertex1 = vertex1 + row * 2;
const scalar_t *offset_vertex2 = vertex2 + col * 8;
point point_[1];
point polygon[4];
point_[0].x = offset_vertex1[0];
point_[0].y = offset_vertex1[1];
polygon[0].x = offset_vertex2[0];
polygon[0].y = offset_vertex2[1];
polygon[1].x = offset_vertex2[2];
polygon[1].y = offset_vertex2[3];
polygon[2].x = offset_vertex2[4];
polygon[2].y = offset_vertex2[5];
polygon[3].x = offset_vertex2[6];
polygon[3].y = offset_vertex2[7];
int nCross = 0;
int i, j;
float sx, sy, tx, ty, px, py, x;
for (i = 0, j = 3; i < 4; j = i, i++) {
sx = polygon[i].x;
sy = polygon[i].y;
tx = polygon[j].x;
ty = polygon[j].y;
px = point_[0].x;
py = point_[0].y;
if (py < min(sy, ty)) continue;
if (py > max(sy, ty)) continue;
if ((sx == px && sy == py) || (tx == px && ty == py)) {
break;
} else {
if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
x = sx + (py - sy) * (tx - sx) / (ty - sy);
if (x == px) {
break;
}
if (x > px) {
nCross++;
}
}
}
}
if (nCross % 2 == 1) {
inside_flag[index] = 1.0;
} else {
inside_flag[index] = 0.0;
}
return;
}
}
#endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
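The loop above implements the even-odd (crossing number) rule: cast a horizontal ray from the query point and count the polygon edges crossing it strictly to the right; an odd count means the point is inside. A host-side restatement for one point against one quad, omitting the kernel's on-boundary early exits; the names are illustrative:

bool point_in_quad(float px, float py, const float *quad /* 8 floats */) {
  int crossings = 0;
  for (int i = 0, j = 3; i < 4; j = i++) {  // edge from vertex j to vertex i
    float sx = quad[i * 2], sy = quad[i * 2 + 1];
    float tx = quad[j * 2], ty = quad[j * 2 + 1];
    if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
      float x = sx + (py - sy) * (tx - sx) / (ty - sy);
      if (x > px) crossings++;  // crossing strictly right of the point
    }
  }
  return (crossings % 2) == 1;  // odd crossings => inside
}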
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
// Distributed under terms of the MIT license.
#ifndef PRROI_POOL_CUDA_KERNEL_CUH
#define PRROI_POOL_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
const int h,
const int w,
const int height,
const int width) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
T retVal = overflow ? 0.0f : data[h * width + w];
return retVal;
}
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
return (1.0f - abs(dh)) * (1.0f - abs(dw));
}
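// Closed-form integral of a linearly interpolated value over [s, t] within a
// unit cell: with samples c1 at u = 0 and c2 at u = 1,
//   int_s^t (c1 + u * (c2 - c1)) du
//     = (t - s) * c1 + 0.5 * (t * t - s * s) * (c2 - c1),
// which is exactly the expression returned below.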
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
T c1, T c2) {
return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;
}
template <typename T>
__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
const T w, const int height,
const int width) {
T retVal = 0.0f;
int h1 = floorf(h);
int w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h);
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
return retVal;
}
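// Exact integral of the bilinear interpolant over the sub-rectangle
// [x0, x1] x [y0, y1] of the integer cell spanned by (s_h, s_w) and
// (e_h, e_w): each of the four corner samples contributes its analytically
// integrated hat-function weight.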
template <typename T>
__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
const int s_h, const int s_w,
const int e_h, const int e_w,
const T y0, const T x0,
const T y1, const T x1,
const int h0, const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
T sum_out = 0;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;
return sum_out;
}
template <typename T>
__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
const int h, const int w,
const int height,
const int width,
const T coeff) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);
}
template <typename T>
__device__ static void PrRoIPoolingMatDistributeDiff(
T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
}
template <typename T>
__global__ void prroi_pool_forward_cuda_kernel(
const int nthreads, const T *input, const T *rois, T *output,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
T roi_x1 = offset_rois[1] * spatial_scale;
T roi_y1 = offset_rois[2] * spatial_scale;
T roi_x2 = offset_rois[3] * spatial_scale;
T roi_y2 = offset_rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, ((T)0.0));
T roi_height = max(roi_y2 - roi_y1, ((T)0.0));
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_data =
input + (roi_batch_ind * channels + c) * height * width;
T *this_out = output + index;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
if (bin_size == 0) {
*this_out = 0;
continue;
}
T sum_out = 0;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
sum_out += PrRoIPoolingMatCalculation(
this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
*this_out = sum_out / bin_size;
}
}
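A hypothetical launch of the forward kernel, one thread per pooled output element; the 512-thread block size is illustrative, since mmcv dispatches through its own launch helpers:

void prroi_pool_forward_launch(const float *input, const float *rois,
                               float *output, int nthreads, int pooled_h,
                               int pooled_w, float spatial_scale, int channels,
                               int height, int width, cudaStream_t stream) {
  const int threads = 512;
  const int blocks = (nthreads + threads - 1) / threads;
  prroi_pool_forward_cuda_kernel<float><<<blocks, threads, 0, stream>>>(
      nthreads, input, rois, output, pooled_h, pooled_w, spatial_scale,
      channels, height, width);
}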
template <typename T>
__global__ void prroi_pool_backward_cuda_kernel(
const int nthreads, const T *grad_output, const T *rois, T *grad_input,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_out_grad = grad_output + index;
T *this_data_grad =
grad_input + (roi_batch_ind * channels + c) * height * width;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
PrRoIPoolingMatDistributeDiff(
this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
}
}
template <typename T>
__global__ void prroi_pool_coor_backward_cuda_kernel(
const int nthreads, const T *output, const T *grad_output, const T *input,
const T *rois, T *grad_rois, const int pooled_height,
const int pooled_width, const T spatial_scale, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T output_grad_val = grad_output[index];
const T *this_input_data =
input + (roi_batch_ind * channels + c) * height * width;
const T output_val = output[index];
T *this_rois_grad = grad_rois + n * 5;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;
// WARNING: to be discussed
if (sum_out == 0) return;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
grad_x1_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,
height, width));
grad_x2_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,
height, width));
}
for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),
height, width));
grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),
height, width));
}
T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;
partial_x1 = partial_x1 / bin_size * spatial_scale;
partial_x2 = partial_x2 / bin_size * spatial_scale;
partial_y1 = partial_y1 / bin_size * spatial_scale;
partial_y2 = partial_y2 / bin_size * spatial_scale;
// (index, x1, y1, x2, y2)
this_rois_grad[0] = 0;
atomicAdd(this_rois_grad + 1,
(partial_x1 * (1.0f - T(pw) / pooled_width) +
partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
output_grad_val);
atomicAdd(this_rois_grad + 2,
(partial_y1 * (1.0f - T(ph) / pooled_height) +
partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
output_grad_val);
atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +
partial_x1 * T(pw) / pooled_width) *
output_grad_val);
atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +
partial_y1 * T(ph) / pooled_height) *
output_grad_val);
}
}
#endif  // PRROI_POOL_CUDA_KERNEL_CUH
// Modified from
// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu
#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#include <float.h>
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif // MMCV_USE_PARROTS
/*** Forward ***/
template <typename scalar_t>
__global__ void riroi_align_rotated_forward_cuda_kernel(
const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_rois, const scalar_t spatial_scale,
const int num_samples, const bool clockwise, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int num_orientations, scalar_t *top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int o = (index / pooled_width / pooled_height) % num_orientations;
int c =
(index / pooled_width / pooled_height / num_orientations) % channels;
int n = index / pooled_width / pooled_height / num_orientations / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
int roi_batch_ind = offset_bottom_rois[0];
    // Do not use rounding; this implementation detail is critical
scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
// scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
scalar_t theta = offset_bottom_rois[5];
// Force malformed ROIs to be 1x1
roi_width = max(roi_width, (scalar_t)1.);
roi_height = max(roi_height, (scalar_t)1.);
scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
static_cast<scalar_t>(pooled_height);
scalar_t bin_size_w =
static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);
// find aligned index
scalar_t ind_float = theta * num_orientations / (2 * M_PI);
int ind = floorf(ind_float);
scalar_t l_var = ind_float - (scalar_t)ind;
scalar_t r_var = 1.0 - l_var;
// correct start channel
ind = (ind + num_orientations) % num_orientations;
// rotated channel
int ind_rot = (o - ind + num_orientations) % num_orientations;
int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
const scalar_t *offset_bottom_data =
bottom_data + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot) *
height * width;
const scalar_t *offset_bottom_data_plus =
bottom_data + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot_plus) *
height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (num_samples > 0)
? num_samples
: ceilf(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
scalar_t roi_start_h = -roi_height / 2.0;
scalar_t roi_start_w = -roi_width / 2.0;
scalar_t cosscalar_theta = cos(theta);
scalar_t sinscalar_theta = sin(theta);
// We do average (integral) pooling inside a bin
const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
scalar_t output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const scalar_t yy =
roi_start_h + ph * bin_size_h +
static_cast<scalar_t>(iy + .5f) * bin_size_h /
static_cast<scalar_t>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const scalar_t xx = roi_start_w + pw * bin_size_w +
static_cast<scalar_t>(ix + .5f) * bin_size_w /
static_cast<scalar_t>(roi_bin_grid_w);
// Rotate by theta (counterclockwise) around the center and translate
scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;
scalar_t val = bilinear_interpolate<scalar_t>(
offset_bottom_data, height, width, y, x, index);
scalar_t val_plus = bilinear_interpolate<scalar_t>(
offset_bottom_data_plus, height, width, y, x, index);
output_val += r_var * val + l_var * val_plus;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
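Each output of the forward kernel blends two orientation channels: theta maps to a fractional orientation index, and the feature is interpolated between channels ind_rot and ind_rot_plus with weights r_var and l_var. A host-side restatement of that indexing, useful for sanity checks; the function name and out-parameters are illustrative:

#include <cmath>

void orientation_blend(float theta, int num_orientations, int o,
                       int *ind_rot, int *ind_rot_plus, float *l_var) {
  float ind_float = theta * num_orientations / (2.0f * (float)M_PI);
  int ind = (int)floorf(ind_float);
  *l_var = ind_float - (float)ind;  // weight of the ind_rot_plus channel
  ind = ((ind % num_orientations) + num_orientations) % num_orientations;
  *ind_rot = (o - ind + num_orientations) % num_orientations;
  *ind_rot_plus = (*ind_rot + 1) % num_orientations;
}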
/*** Backward ***/
template <typename scalar_t>
__global__ void riroi_align_rotated_backward_cuda_kernel(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
const scalar_t spatial_scale, const int num_samples, const bool clockwise,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int num_orientations,
scalar_t *bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int o = (index / pooled_width / pooled_height) % num_orientations;
int c =
(index / pooled_width / pooled_height / num_orientations) % channels;
int n = index / pooled_width / pooled_height / num_orientations / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
int roi_batch_ind = offset_bottom_rois[0];
// Do not round
scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
// scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
scalar_t theta = offset_bottom_rois[5];
// Force malformed ROIs to be 1x1
roi_width = max(roi_width, (scalar_t)1.);
roi_height = max(roi_height, (scalar_t)1.);
scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
static_cast<scalar_t>(pooled_height);
scalar_t bin_size_w =
static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);
// find aligned index
scalar_t ind_float = theta * num_orientations / (2 * M_PI);
int ind = floorf(ind_float);
scalar_t l_var = ind_float - (scalar_t)ind;
scalar_t r_var = 1.0 - l_var;
// correct start channel
ind = (ind + num_orientations) % num_orientations;
// rotated channel
int ind_rot = (o - ind + num_orientations) % num_orientations;
int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
scalar_t *offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot) *
height * width;
scalar_t *offset_bottom_diff_plus =
bottom_diff + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot_plus) *
height * width;
int top_offset =
(n * channels * num_orientations + c * num_orientations + o) *
pooled_height * pooled_width;
const scalar_t *offset_top_diff = top_diff + top_offset;
const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (num_samples > 0)
? num_samples
: ceilf(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
scalar_t roi_start_h = -roi_height / 2.0;
scalar_t roi_start_w = -roi_width / 2.0;
scalar_t cosTheta = cos(theta);
scalar_t sinTheta = sin(theta);
// We do average (integral) pooling inside a bin
const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const scalar_t yy =
roi_start_h + ph * bin_size_h +
static_cast<scalar_t>(iy + .5f) * bin_size_h /
static_cast<scalar_t>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const scalar_t xx = roi_start_w + pw * bin_size_w +
static_cast<scalar_t>(ix + .5f) * bin_size_w /
static_cast<scalar_t>(roi_bin_grid_w);
// Rotate by theta around the center and translate
scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;
scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;
scalar_t w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
w4, x_low, x_high, y_low,
y_high, index);
scalar_t g1 = top_diff_this_bin * w1 / count;
scalar_t g2 = top_diff_this_bin * w2 / count;
scalar_t g3 = top_diff_this_bin * w3 / count;
scalar_t g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);
atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);
atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);
atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);
atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,
g1 * l_var);
atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,
g2 * l_var);
atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,
g3 * l_var);
atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,
g4 * l_var);
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // riroi_align_rotated_backward_cuda_kernel
#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
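The (ind, l_var, r_var) bookkeeping in the kernels above is easy to misread, so here is a minimal host-side sketch (plain C++, not mmcv code) of the orientation-channel interpolation it implements: the RoI angle selects two neighboring orientation channels and blends them linearly. num_orientations and theta are example values.

// Sketch only: mirrors the index/weight arithmetic of the kernels above.
#include <cmath>
#include <cstdio>

int main() {
  const int num_orientations = 8;  // example channel granularity
  const double kPi = 3.14159265358979323846;
  const double theta = 1.9;  // example RoI angle in radians
  double ind_float = theta * num_orientations / (2 * kPi);
  int ind = static_cast<int>(std::floor(ind_float));
  double l_var = ind_float - ind;  // weight of the "plus" channel
  double r_var = 1.0 - l_var;      // weight of the base channel
  ind = (ind + num_orientations) % num_orientations;  // correct start channel
  for (int o = 0; o < num_orientations; o++) {
    int ind_rot = (o - ind + num_orientations) % num_orientations;
    int ind_rot_plus = (ind_rot + 1) % num_orientations;
    std::printf("out channel %d = %.3f * in[%d] + %.3f * in[%d]\n", o, r_var,
                ind_rot, l_var, ind_rot_plus);
  }
  return 0;
}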
@@ -20,7 +20,7 @@ template <typename scalar_t>
 __global__ void roi_align_rotated_forward_cuda_kernel(
     const int nthreads, const scalar_t *bottom_data,
     const scalar_t *bottom_rois, const scalar_t spatial_scale,
-    const int sample_num, const bool aligned, const bool clockwise,
+    const int sampling_ratio, const bool aligned, const bool clockwise,
     const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, scalar_t *top_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
@@ -58,11 +58,11 @@ __global__ void roi_align_rotated_forward_cuda_kernel(
         bottom_data + (roi_batch_ind * channels + c) * height * width;
     // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h = (sample_num > 0)
-                             ? sample_num
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
                              : ceilf(roi_height / pooled_height);  // e.g., = 2
     int roi_bin_grid_w =
-        (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width);
+        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);
     // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
     // Appropriate translation needs to be applied after.
@@ -104,7 +104,7 @@ __global__ void roi_align_rotated_forward_cuda_kernel(
 template <typename scalar_t>
 __global__ void roi_align_rotated_backward_cuda_kernel(
     const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
-    const scalar_t spatial_scale, const int sample_num, const bool aligned,
+    const scalar_t spatial_scale, const int sampling_ratio, const bool aligned,
     const bool clockwise, const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, scalar_t *bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
@@ -146,11 +146,11 @@ __global__ void roi_align_rotated_backward_cuda_kernel(
     const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
     // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h = (sample_num > 0)
-                             ? sample_num
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
                              : ceilf(roi_height / pooled_height);  // e.g., = 2
     int roi_bin_grid_w =
-        (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width);
+        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);
     // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
     // Appropriate translation needs to be applied after.
...
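The roiaware_pool3d hunks below replace hand-rolled `blockIdx.x * blockDim.x + threadIdx.x` indexing with `CUDA_1D_KERNEL_LOOP`. The macro itself is defined outside this diff; the sketch below assumes the usual grid-stride definition from mmcv's common CUDA helpers and shows why the restructured kernels stay correct for any launch grid size.

// Assumed definition (grid-stride loop), shown for readability only.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

// Each thread handles indices i, i + stride, i + 2 * stride, ..., so the
// kernel covers all n elements even when the grid is smaller than n.
__global__ void example_kernel(const float *in, float *out, int n) {
  CUDA_1D_KERNEL_LOOP(i, n) { out[i] = 2.f * in[i]; }
}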
@@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
   // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N,
   // npoints): -1 means point does not in this box, otherwise: encode (x_idxs,
   // y_idxs, z_idxs) by binary bit
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
-  if (pt_idx >= pts_num || box_idx >= boxes_num) return;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (box_idx >= boxes_num) return;
     pts += pt_idx * 3;
     rois += box_idx * 7;
     pts_mask += box_idx * pts_num + pt_idx;
     T local_x = 0, local_y = 0;
     int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
     pts_mask[0] = -1;
     if (cur_in_flag > 0) {
       T local_z = pts[2] - rois[2];
       T x_size = rois[3], y_size = rois[4], z_size = rois[5];
       T x_res = x_size / out_x;
       T y_res = y_size / out_y;
       T z_res = z_size / out_z;
       unsigned int x_idx = int((local_x + x_size / 2) / x_res);
       unsigned int y_idx = int((local_y + y_size / 2) / y_res);
       unsigned int z_idx = int(local_z / z_res);
       x_idx = min(max(x_idx, 0), out_x - 1);
       y_idx = min(max(y_idx, 0), out_y - 1);
       z_idx = min(max(z_idx, 0), out_z - 1);
       unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
       pts_mask[0] = idx_encoding;
     }
+  }
 }
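The mask value written above packs the voxel coordinate into a single integer: x in the high bits, y and z in one byte each. A small standalone round-trip sketch (plain C++, not mmcv code); note that the decode in the next hunk masks every field with 0xFF, which implicitly assumes each output dimension fits in 8 bits.

#include <cstdio>

// Mirrors the kernel's encoding: (x << 16) + (y << 8) + z.
unsigned int encode(unsigned int x, unsigned int y, unsigned int z) {
  return (x << 16) + (y << 8) + z;
}

int main() {
  unsigned int code = encode(3, 12, 7);
  // Decode exactly as collect_inside_pts_for_box3d does below.
  std::printf("x=%u y=%u z=%u\n", (code >> 16) & 0xFF, (code >> 8) & 0xFF,
              code & 0xFF);  // prints x=3 y=12 z=7
  return 0;
}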
@@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
                                              T *pts_idx_of_voxels) {
   // params pts_mask: (N, npoints) 0 or 1
   // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
-  int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (box_idx >= boxes_num) return;
+  CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) {
     int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
     pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
     for (int k = 0; k < pts_num; k++) {
       if (pts_mask[box_idx * pts_num + k] != -1) {
         unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
         unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
         unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
         unsigned int z_idx = idx_encoding & 0xFF;
         unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
                                    y_idx * out_z * max_pts_each_voxel +
                                    z_idx * max_pts_each_voxel;
         unsigned int cnt = pts_idx_of_voxels[base_offset];
         if (cnt < max_num_pts) {
           pts_idx_of_voxels[base_offset + cnt + 1] = k;
           pts_idx_of_voxels[base_offset]++;
         }
       }
     }
+  }
 }
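The per-voxel list filled above uses slot 0 as a running counter: each voxel row of length max_pts_each_voxel stores the element count first, then up to max_pts_each_voxel - 1 point indices. That is why the pooling kernels below loop `k = 1..total_pts`. A minimal sketch of the layout (plain C++, not mmcv code, with example values):

#include <cstdio>
#include <initializer_list>

int main() {
  const int max_pts_each_voxel = 5;         // example: 1 counter + 4 slots
  int voxel_row[max_pts_each_voxel] = {0};  // one voxel's row, initially empty
  for (int pt : {42, 7, 13}) {              // append three point indices
    int cnt = voxel_row[0];
    if (cnt < max_pts_each_voxel - 1) {
      voxel_row[cnt + 1] = pt;  // data lives in slots 1..cnt+1
      voxel_row[0]++;           // slot 0 counts valid entries
    }
  }
  for (int k = 1; k <= voxel_row[0]; k++) std::printf("%d ", voxel_row[k]);
  return 0;  // prints: 42 7 13
}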
@@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
     int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
     pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                          offset_base * max_pts_each_voxel;
     pooled_features += box_idx * out_x * out_y * out_z * channels +
                        offset_base * channels + channel_idx;
     argmax += box_idx * out_x * out_y * out_z * channels +
               offset_base * channels + channel_idx;
     int argmax_idx = -1;
     float max_val = -1e50;
     int total_pts = pts_idx_of_voxels[0];
     for (int k = 1; k <= total_pts; k++) {
       if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] >
           max_val) {
         max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
         argmax_idx = pts_idx_of_voxels[k];
       }
     }
     if (argmax_idx != -1) {
       pooled_features[0] = max_val;
     }
     argmax[0] = argmax_idx;
+  }
 }
 template <typename T>
@@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
     int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
     pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                          offset_base * max_pts_each_voxel;
     pooled_features += box_idx * out_x * out_y * out_z * channels +
                        offset_base * channels + channel_idx;
     float sum_val = 0;
     int total_pts = pts_idx_of_voxels[0];
     for (int k = 1; k <= total_pts; k++) {
       sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
     }
     if (total_pts > 0) {
       pooled_features[0] = sum_val / total_pts;
     }
+  }
 }
@@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
     int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
     argmax += box_idx * out_x * out_y * out_z * channels +
              offset_base * channels + channel_idx;
     grad_out += box_idx * out_x * out_y * out_z * channels +
                 offset_base * channels + channel_idx;
     if (argmax[0] == -1) return;
     atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+  }
 }
 template <typename T>
@@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
     int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
     pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                          offset_base * max_pts_each_voxel;
     grad_out += box_idx * out_x * out_y * out_z * channels +
                 offset_base * channels + channel_idx;
     int total_pts = pts_idx_of_voxels[0];
     float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
     for (int k = 1; k <= total_pts; k++) {
       atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
                 grad_out[0] * cur_grad);
     }
+  }
 }
...
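For readers wondering how the voxel kernels above map onto a launch: blockIdx.z indexes boxes, blockIdx.y indexes channels, and blockIdx.x strides over the flattened out_x * out_y * out_z voxels. The sketch below is a hedged illustration only; the kernels' full parameter lists are cut off in this diff, so the argument order, the hypothetical launch_maxpool3d helper, and the thread count are all assumptions, not the actual mmcv launcher.

#include <cuda_runtime.h>

constexpr int kThreadsPerBlock = 512;  // assumed block size

inline int DivUp(int n, int d) { return (n + d - 1) / d; }

// Hypothetical host-side launcher; argument order is a guess for illustration.
void launch_maxpool3d(int boxes_num, int pts_num, int channels,
                      int max_pts_each_voxel, int out_x, int out_y, int out_z,
                      const float *pts_feature, const int *pts_idx_of_voxels,
                      float *pooled_features, int *argmax) {
  // x: grid-stride over voxels; y: one block row per channel; z: per box.
  dim3 blocks(DivUp(out_x * out_y * out_z, kThreadsPerBlock), channels,
              boxes_num);
  dim3 threads(kThreadsPerBlock);
  roiaware_maxpool3d<float><<<blocks, threads>>>(
      boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z,
      pts_feature, pts_idx_of_voxels, pooled_features, argmax);
}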