Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
@@ -32,12 +32,12 @@ __device__ inline int Loc2Index(const int n, const int c, const int h,
#ifndef HIP_DIFF
/* TODO: move this to a common place */
template <typename scalar_t>
__device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) {
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
return a < b ? a : b;
}
template <typename scalar_t>
__device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) {
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
return a > b ? a : b;
}
#endif
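// --- Illustrative sketch, not part of this commit ---
// The templates above give device code a min/max that resolves for any
// scalar_t (float, double, at::Half) instead of the integer built-ins.
// A hypothetical helper built on them:
template <typename scalar_t>
__device__ inline scalar_t clamp_value(scalar_t v, scalar_t lo, scalar_t hi) {
  return max(lo, min(v, hi));  // picks the scalar_t overloads defined above
}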
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH
#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
template <typename scalar_t>
__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,
const scalar_t* xyz, int m,
const scalar_t* xyz2,
scalar_t* result,
int* result_i) {
__shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
for (int i = blockIdx.x; i < b; i += gridDim.x) {
for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {
int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;
for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {
buf[j] = xyz2[(i * m + k2) * 2 + j];
}
__syncthreads();
for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
scalar_t x1 = xyz[(i * n + j) * 2 + 0];
scalar_t y1 = xyz[(i * n + j) * 2 + 1];
int best_i = 0;
scalar_t best = 1e10;
int end_ka = end_k & (~3);  // round end_k down to a multiple of 4 for the unrolled loop
if (end_ka == THREADS_PER_BLOCK) {
for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
#pragma unroll
for (int j = 0; j < 4; ++j) {
scalar_t x2 = buf[(k + j) * 2] - x1;
scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
scalar_t d = x2 * x2 + y2 * y2;
if (d < best) {
best = d;
best_i = k + k2 + j;
}
}
}
} else {
for (int k = 0; k < end_ka; k += 4) {
#pragma unroll
for (int j = 0; j < 4; ++j) {
scalar_t x2 = buf[(k + j) * 2] - x1;
scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
scalar_t d = x2 * x2 + y2 * y2;
if (d < best) {
best = d;
best_i = k + k2 + j;
}
}
}
}
for (int k = end_ka; k < end_k; k++) {
scalar_t x2 = buf[k * 2 + 0] - x1;
scalar_t y2 = buf[k * 2 + 1] - y1;
scalar_t d = x2 * x2 + y2 * y2;
if (k == 0 || d < best) {
best = d;
best_i = k + k2;
}
}
if (k2 == 0 || result[(i * n + j)] > best) {
result[(i * n + j)] = best;
result_i[(i * n + j)] = best_i;
}
}
__syncthreads();
}
}
}
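// --- Illustrative sketch, not part of this commit ---
// A plausible host-side launch for the forward kernel above; the grid shape
// and stream handling are assumptions, not mmcv's actual dispatch code:
//   chamfer_distance_forward_cuda_kernel<float>
//       <<<GET_BLOCKS(b), THREADS_PER_BLOCK, 0, stream>>>(
//           b, n, xyz1_ptr, m, xyz2_ptr, dist1_ptr, idx1_ptr);
// The kernel is typically launched a second time with the two point sets
// swapped to obtain the other direction of the chamfer distance.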
template <typename scalar_t>
__global__ void chamfer_distance_backward_cuda_kernel(
int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
scalar_t* grad_xyz2) {
for (int i = blockIdx.x; i < b; i += gridDim.x) {
for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
scalar_t x1 = xyz1[(i * n + j) * 2 + 0];
scalar_t y1 = xyz1[(i * n + j) * 2 + 1];
int j2 = idx1[i * n + j];
scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];
scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];
scalar_t g = grad_dist1[i * n + j] * 2;
atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));
atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));
atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));
atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));
}
}
}
#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH
@@ -7,12 +7,20 @@
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 512
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
j += blockDim.y * gridDim.y)
#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define THREADS_PER_BLOCK 512
inline int GET_BLOCKS(const int N) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
int optimal_block_num = (N + num_threads - 1) / num_threads;
int max_block_num = 4096;
return min(optimal_block_num, max_block_num);
}
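// --- Illustrative sketch, not part of this commit ---
// Typical pairing of GET_BLOCKS with CUDA_1D_KERNEL_LOOP; `scale_kernel`
// and its arguments are hypothetical.
__global__ void scale_kernel(int n, float alpha, float *data) {
  CUDA_1D_KERNEL_LOOP(i, n) { data[i] *= alpha; }
}
// host side:
//   scale_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK>>>(n, 2.f, d_data);
// GET_BLOCKS caps the grid at 4096 blocks; the grid-stride loop inside
// CUDA_1D_KERNEL_LOOP then covers any elements beyond gridDim.x * blockDim.x.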
// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
#define CONVEX_IOU_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAXN 100
#define NMAX 512
__device__ const double EPS = 1E-8;
__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }
struct Point {
double x, y;
__device__ Point() {}
__device__ Point(double x, double y) : x(x), y(y) {}
};
__device__ inline bool point_same(Point& a, Point& b) {
return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;
}
__device__ inline void swap1(Point* a, Point* b) {
Point temp;
temp.x = a->x;
temp.y = a->y;
a->x = b->x;
a->y = b->y;
b->x = temp.x;
b->y = temp.y;
}
__device__ inline void reverse1(Point* a, const int n) {
for (int i = 0; i < (n - 1) / 2.0; i++) {
Point* j = &(a[i]);
Point* k = &(a[n - 1 - i]);
swap1(j, k);
}
}
__device__ inline double cross(Point o, Point a, Point b) {
return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
}
__device__ inline double dis(Point a, Point b) {
return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
}
__device__ inline double area(Point* ps, int n) {
ps[n] = ps[0];
double res = 0;
for (int i = 0; i < n; i++) {
res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
}
return res / 2.0;
}
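// --- Note, not part of this commit ---
// area() is the shoelace formula: for vertices (x_i, y_i) in order,
//   A = 1/2 * sum_i (x_i * y_{i+1} - y_i * x_{i+1}),
// positive for counter-clockwise winding. Example: the unit square
// {(0,0), (1,0), (1,1), (0,1)} gives +1.0; the reversed order gives -1.0.
// area() writes ps[n] = ps[0], so callers must size the array for n + 1 points.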
__device__ inline double polygon_area_grad(Point* ps, int n,
int* polygon_to_pred_index,
int n_pred, double* grad_C) {
ps[n] = ps[0];
double partion_grad[4 * 30 + 2];
double res = 0;
for (int i = 0; i < n; i++) {
res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
partion_grad[i * 4 + 2] = ps[i + 1].y;
partion_grad[i * 4 + 3] = -ps[i + 1].x;
if (i != n - 1) {
partion_grad[i * 4 + 4] = -ps[i].y;
partion_grad[i * 4 + 5] = ps[i].x;
} else {
partion_grad[0] = -ps[i].y;
partion_grad[1] = ps[i].x;
}
}
for (int i = 0; i < n; i++) {
for (int j = 0; j < n_pred; j++) {
if (i == polygon_to_pred_index[j]) {
grad_C[2 * polygon_to_pred_index[j + n_pred]] =
(partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
break;
}
}
for (int j = 0; j < n_pred; j++) {
if (i == polygon_to_pred_index[j]) {
grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
(partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
break;
}
}
}
return res / 2.0;
}
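// --- Note, not part of this commit ---
// Differentiating the shoelace sum gives the per-vertex gradients
//   dA/dx_i = (y_{i+1} - y_{i-1}) / 2,   dA/dy_i = (x_{i-1} - x_{i+1}) / 2
// (indices modulo n). partion_grad stores the two shoelace terms touching
// each vertex, and the halving above realizes exactly this formula for the
// vertices mapped back to predictions via polygon_to_pred_index.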
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
double* cut_grad, int m, int n, int i) {
double s1, s2;
double s2_s1_2;
double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;
s1 = cross(a, b, c);
s2 = cross(a, b, d);
ds1_dxc = -(b.y - a.y);
ds1_dyc = b.x - a.x;
ds2_dxd = ds1_dxc;
ds2_dyd = ds1_dyc;
s2_s1_2 = (s2 - s1) * (s2 - s1);
if (sig(s1) == 0 && sig(s2) == 0) return 2;
if (sig(s2 - s1) == 0) return 0;
dxp_dxc =
((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
(s2_s1_2);
dxp_dyc =
((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
(s2_s1_2);
dxp_dxd =
((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
(s2_s1_2);
dxp_dyd =
((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
(s2_s1_2);
dyp_dxc =
((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
(s2_s1_2);
dyp_dyc =
((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
(s2_s1_2);
dyp_dxd =
((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
(s2_s1_2);
dyp_dyd =
((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
(s2_s1_2);
p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
if (i == n - 1) {
cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd;
cut_grad[4 * n * m + 1] = dyp_dxd;
cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd;
cut_grad[4 * n * m + 3] = dyp_dyd;
} else {
cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd;
cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd;
cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
}
return 1;
}
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
double* cut_grad) {
Point pp[MAXN];
double ccur_grad[MAXN] = {};
int m = 0;
p[n] = p[0];
int k = n;
for (int i = 0; i < n; i++) {
if (sig(cross(a, b, p[i])) > 0) {
pp[m] = p[i];
ccur_grad[4 * n * m + 4 * i] = 1.0;
ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
m++;
}
if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
m++;
}
}
n = 0;
for (int i = 0; i < m; i++) {
if (!i || !(point_same(pp[i], pp[i - 1]))) {
p[n] = pp[i];
for (int j = 0; j < 4 * k; j++) {
cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
}
n++;
}
}
while (n > 1 && point_same(p[n - 1], p[0])) n--;
}
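// --- Note, not part of this commit ---
// polygon_cut is one step of Sutherland-Hodgman clipping: it keeps the part
// of polygon p lying strictly to the left of the directed line a->b,
// inserting edge/line intersection points from lineCross, and, beyond the
// plain algorithm, records in cut_grad the Jacobian of every surviving
// vertex with respect to the input vertices.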
__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
double* grad_AB, int order,
int convex_n) {
Point o(0, 0);
int res_flag = 0;
int s1 = sig(cross(o, a, b));
int s2 = sig(cross(o, c, d));
if (s1 == 0 || s2 == 0) return 0.0;
if (s1 == -1) {
Point* i = &a;
Point* j = &b;
swap1(i, j);
res_flag = 1;
}
if (s2 == -1) {
Point* i = &c;
Point* j = &d;
swap1(i, j);
}
Point p[10] = {o, a, b};
int n = 3, n0 = 3, n1, n2, n3;
double cut_grad1[MAXN] = {};
double cut_grad2[MAXN] = {};
double cut_grad3[MAXN] = {};
double p1_p_grad[10][10] = {};
double p2_p1_grad[10][10] = {};
double p3_p2_grad[10][10] = {};
double p3_p1_grad[10][10] = {};
double p3_p_grad[10][10] = {};
// cut 1: clip the triangle (o, a, b) by edge (o, c)
polygon_cut(p, n, o, c, cut_grad1);
n1 = n;
for (int i = 0; i < n; i++) {
for (int j = 0; j < 4 * n0; j++) {
if (!(j % 2)) {
p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
} else {
p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
}
}
}
// cut 2: clip by edge (c, d)
polygon_cut(p, n, c, d, cut_grad2);
n2 = n;
for (int i = 0; i < n; i++) {
for (int j = 0; j < 4 * n1; j++) {
if (!(j % 2)) {
p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
} else {
p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
}
}
}
// cut 3: clip by edge (d, o)
polygon_cut(p, n, d, o, cut_grad3);
n3 = n;
for (int i = 0; i < n; i++) {
for (int j = 0; j < 4 * n2; j++) {
if (!(j % 2)) {
p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
} else {
p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
}
}
}
// chain rule: compose the per-cut Jacobians
// p3_p2 (n3 x n2) * p2_p1 (n2 x n1) = p3_p1 (n3 x n1)
for (int i = 0; i < 2 * n3; i++) {
for (int j = 0; j < 2 * n1; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n2; m++) {
sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
}
p3_p1_grad[i][j] = sum;
}
}
// p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
for (int i = 0; i < 2 * n3; i++) {
for (int j = 0; j < 2 * n0; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n1; m++) {
sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
}
p3_p_grad[i][j] = sum;
}
}
// calculate S_grad
int polygon_index_box_index[20];
double grad_polygon[20];
double S_grad[6];
for (int i = 0; i < n3; i++) {
polygon_index_box_index[i] = i;
polygon_index_box_index[i + n3] = i;
}
double res =
polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);
if (s1 * s2 == -1) {
for (int j = 0; j < 2 * 3; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n3; m++) {
sum = sum - grad_polygon[m] * p3_p_grad[m][j];
}
S_grad[j] = sum;
}
if (order != convex_n - 1) {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[2 * order + 2] += S_grad[2];
grad_AB[2 * order + 3] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[2 * order + 2] += S_grad[4];
grad_AB[2 * order + 3] += S_grad[5];
}
} else {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[0] += S_grad[2];
grad_AB[1] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[0] += S_grad[4];
grad_AB[1] += S_grad[5];
}
}
res = -res;
} else {
for (int j = 0; j < 2 * 3; j++) {
double sum = 0.0;
for (int m = 0; m < 2 * n3; m++) {
sum = sum + grad_polygon[m] * p3_p_grad[m][j];
}
S_grad[j] = sum;
}
if (order != convex_n - 1) {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[2 * order + 2] += S_grad[2];
grad_AB[2 * order + 3] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[2 * order + 2] += S_grad[4];
grad_AB[2 * order + 3] += S_grad[5];
}
} else {
if (res_flag) {
grad_AB[2 * order] += S_grad[4];
grad_AB[2 * order + 1] += S_grad[5];
grad_AB[0] += S_grad[2];
grad_AB[1] += S_grad[3];
} else {
grad_AB[2 * order] += S_grad[2];
grad_AB[2 * order + 1] += S_grad[3];
grad_AB[0] += S_grad[4];
grad_AB[1] += S_grad[5];
}
}
}
return res;
}
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
double* grad_AB) {
if (area(ps1, n1) < 0) reverse1(ps1, n1);
if (area(ps2, n2) < 0) reverse1(ps2, n2);
ps1[n1] = ps1[0];
ps2[n2] = ps2[0];
double res = 0;
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
res +=
intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);
}
}
return res;
}
__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
Point p_max, p_k;
int max_index, k_index;
int Stack[NMAX] = {}, top1, top2;
double sign;
Point right_point[10], left_point[10];
for (int i = 0; i < n_poly; i++) {
if (in_poly[i].y < in_poly[0].y ||
in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
Point* j = &(in_poly[0]);
Point* k = &(in_poly[i]);
swap1(j, k);
}
if (i == 0) {
p_max = in_poly[0];
max_index = 0;
}
if (in_poly[i].y > p_max.y ||
in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
p_max = in_poly[i];
max_index = i;
}
}
if (max_index == 0) {
max_index = 1;
p_max = in_poly[max_index];
}
k_index = 0, Stack[0] = 0, top1 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
dis(in_poly[Stack[top1]], p_k)))) {
p_k = in_poly[i];
k_index = i;
}
}
top1++;
Stack[top1] = k_index;
}
for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];
k_index = 0, Stack[0] = 0, top2 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
dis(in_poly[Stack[top2]], p_k))) {
p_k = in_poly[i];
k_index = i;
}
}
top2++;
Stack[top2] = k_index;
}
for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];
for (int i = 0; i < top1 + top2; i++) {
if (i <= top1) {
in_poly[i] = right_point[i];
} else {
in_poly[i] = left_point[top2 - (i - top1)];
}
}
n_poly = top1 + top2;
}
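// --- Note, not part of this commit ---
// Jarvis march (gift wrapping): starting from the lowest point, each step
// scans all points for the one that keeps the rest on one side of the
// current hull edge, building the right chain up to the highest point and
// the left chain back down, in O(n * h) time for h hull vertices. On ties
// (sign == 0) the farther point is taken, so collinear inputs do not
// truncate the hull.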
__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
int n2, double* grad_C) {
Point polygon[MAXN];
int n = n1 + n2, n_poly = 0;
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n - n1; j++) {
if (point_same(ps1[i], ps2[j])) {
for (int k = j; k < n - n1 - 1; k++) {
ps2[k] = ps2[k + 1];
}
n2--;
break;
}
}
}
n_poly = n1 + n2;
for (int i = 0; i < n_poly; i++) {
if (i < n1) {
polygon[i] = ps1[i];
} else {
polygon[i] = ps2[i - n1];
}
}
Jarvis(polygon, n_poly);
int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1};
int n_pred = 0;
for (int i = 0; i < n_poly; i++) {
for (int j = 0; j < n1; j++) {
if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
polygon_to_pred_index[n_pred] = i;
polygon_to_pred_index[n_pred + n1] = j;
n_pred += 1;
break;
}
}
}
if (n_pred == 0) {
double polygon_area = fabs(area(polygon, n_poly));
for (int i = 0; i < 18; i++) {
grad_C[i] = 0.0;
}
return polygon_area;
} else {
double polygon_area =
polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
if (polygon_area < 0) {
for (int i = 0; i < 18; i++) {
grad_C[i] = -grad_C[i];
}
}
return fabs(polygon_area);
}
}
// convex_find and get the polygon_index_box_index
__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
int* points_to_convex_ind) {
int n_input = n_poly;
Point input_poly[20];
for (int i = 0; i < n_input; i++) {
input_poly[i].x = in_poly[i].x;
input_poly[i].y = in_poly[i].y;
}
Point p_max, p_k;
int max_index, k_index;
int Stack[20], top1, top2;
double sign;
Point right_point[10], left_point[10];
for (int i = 0; i < n_poly; i++) {
if (in_poly[i].y < in_poly[0].y ||
in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
Point* j = &(in_poly[0]);
Point* k = &(in_poly[i]);
swap1(j, k);
}
if (i == 0) {
p_max = in_poly[0];
max_index = 0;
}
if (in_poly[i].y > p_max.y ||
in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
p_max = in_poly[i];
max_index = i;
}
}
if (max_index == 0) {
max_index = 1;
p_max = in_poly[max_index];
}
k_index = 0, Stack[0] = 0, top1 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
dis(in_poly[Stack[top1]], p_k)))) {
p_k = in_poly[i];
k_index = i;
}
}
top1++;
Stack[top1] = k_index;
}
for (int i = 0; i <= top1; i++) {
right_point[i] = in_poly[Stack[i]];
}
k_index = 0, Stack[0] = 0, top2 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
dis(in_poly[Stack[top2]], p_k))) {
p_k = in_poly[i];
k_index = i;
}
}
top2++;
Stack[top2] = k_index;
}
for (int i = top2 - 1; i >= 0; i--) {
left_point[i] = in_poly[Stack[i]];
}
for (int i = 0; i < top1 + top2; i++) {
if (i <= top1) {
in_poly[i] = right_point[i];
} else {
in_poly[i] = left_point[top2 - (i - top1)];
}
}
n_poly = top1 + top2;
for (int i = 0; i < n_poly; i++) {
for (int j = 0; j < n_input; j++) {
if (point_same(in_poly[i], input_poly[j])) {
points_to_convex_ind[i] = j;
break;
}
}
}
}
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q,
T* point_grad, const int idx) {
Point ps1[MAXN], ps2[MAXN];
Point convex[MAXN];
for (int i = 0; i < 9; i++) {
convex[i].x = (double)p[i * 2];
convex[i].y = (double)p[i * 2 + 1];
}
int n_convex = 9;
int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
Jarvis_and_index(convex, n_convex, points_to_convex_ind);
int n1 = n_convex;
int n2 = 4;
for (int i = 0; i < n1; i++) {
ps1[i].x = (double)convex[i].x;
ps1[i].y = (double)convex[i].y;
}
for (int i = 0; i < n2; i++) {
ps2[i].x = (double)q[i * 2];
ps2[i].y = (double)q[i * 2 + 1];
}
int polygon_index_box_index[18];
for (int i = 0; i < n1; i++) {
polygon_index_box_index[i] = i;
polygon_index_box_index[i + n1] = i;
}
double grad_A[18] = {};
double grad_AB[18] = {};
double grad_C[18] = {};
double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
double S_pred =
polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
if (S_pred < 0) {
for (int i = 0; i < n_convex * 2; i++) {
grad_A[i] = -grad_A[i];
}
}
double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
double iou = inter_area / union_area;
double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);
// printf("%d:live\n", idx);
double rot_giou = iou - (polygon_area - union_area) / polygon_area;
float grad_point_temp[18] = {};
for (int i = 0; i < n_convex; i++) {
int grad_point = points_to_convex_ind[i];
grad_point_temp[2 * grad_point] =
(float)((union_area + inter_area) / (union_area * union_area) *
grad_AB[2 * i] -
iou / union_area * grad_A[2 * i] -
1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
(union_area) / polygon_area / polygon_area * grad_C[2 * i]);
grad_point_temp[2 * grad_point + 1] =
(float)((union_area + inter_area) / (union_area * union_area) *
grad_AB[2 * i + 1] -
iou / union_area * grad_A[2 * i + 1] -
1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
(union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
}
for (int i = 0; i < 9; i++) {
point_grad[2 * i] = grad_point_temp[2 * i];
point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
}
return (float)rot_giou;
}
template <typename T>
__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
const int gt_n_boxes, const T* ex_boxes,
const T* gt_boxes, T* point_grad) {
CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
const T* cur_box = ex_boxes + index * 18;
const T* cur_gt_box = gt_boxes + index * 8;
T* cur_grad = point_grad + index * 19;
T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);
cur_grad[18] = giou;
}
}
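// --- Illustrative sketch, not part of this commit ---
// A plausible launch for the GIoU kernel above (an assumption, not mmcv's
// actual dispatch): one thread per predicted box, with the 18 gradient
// entries plus the scalar GIoU packed as 19 floats per box.
//   convex_giou_cuda_kernel<float>
//       <<<GET_BLOCKS(ex_n_boxes), THREADS_PER_BLOCK>>>(
//           ex_n_boxes, gt_n_boxes, ex_boxes_ptr, gt_boxes_ptr, grad_ptr);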
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
double s1, s2;
s1 = cross(a, b, c);
s2 = cross(a, b, d);
if (sig(s1) == 0 && sig(s2) == 0) return 2;
if (sig(s2 - s1) == 0) return 0;
p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
return 1;
}
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
Point pp[MAXN];
int m = 0;
p[n] = p[0];
for (int i = 0; i < n; i++) {
if (sig(cross(a, b, p[i])) > 0) {
pp[m] = p[i];
m++;
}
if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
lineCross(a, b, p[i], p[i + 1], pp[m]);
m++;
}
}
n = 0;
for (int i = 0; i < m; i++) {
if (!i || !(point_same(pp[i], pp[i - 1]))) {
p[n] = pp[i];
n++;
}
}
while (n > 1 && point_same(p[n - 1], p[0])) n--;
}
__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
Point o(0, 0);
int s1 = sig(cross(o, a, b));
int s2 = sig(cross(o, c, d));
if (s1 == 0 || s2 == 0) return 0.0;
if (s1 == -1) {
Point* i = &a;
Point* j = &b;
swap1(i, j);
}
if (s2 == -1) {
Point* i = &c;
Point* j = &d;
swap1(i, j);
}
Point p[10] = {o, a, b};
int n = 3;
polygon_cut(p, n, o, c);
polygon_cut(p, n, c, d);
polygon_cut(p, n, d, o);
double res = area(p, n);
if (s1 * s2 == -1) res = -res;
return res;
}
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
int n2) {
if (area(ps1, n1) < 0) reverse1(ps1, n1);
if (area(ps2, n2) < 0) reverse1(ps2, n2);
ps1[n1] = ps1[0];
ps2[n2] = ps2[0];
double res = 0;
for (int i = 0; i < n1; i++) {
for (int j = 0; j < n2; j++) {
res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
}
}
return res;
}
template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q) {
Point ps1[MAXN], ps2[MAXN];
Point convex[MAXN];
for (int i = 0; i < 9; i++) {
convex[i].x = (double)p[i * 2];
convex[i].y = (double)p[i * 2 + 1];
}
int n_convex = 9;
int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
Jarvis_and_index(convex, n_convex, points_to_convex_ind);
int n1 = n_convex;
for (int i = 0; i < n1; i++) {
ps1[i].x = (double)convex[i].x;
ps1[i].y = (double)convex[i].y;
}
int n2 = 4;
for (int i = 0; i < n2; i++) {
ps2[i].x = (double)q[i * 2];
ps2[i].y = (double)q[i * 2 + 1];
}
double inter_area = intersectAreaO(ps1, n1, ps2, n2);
double S_pred = area(ps1, n1);
double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
double iou = inter_area / union_area;
return (float)iou;
}
template <typename T>
__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
const int gt_n_boxes, const T* ex_boxes,
const T* gt_boxes, T* iou) {
CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
const T* cur_box = ex_boxes + index * 18;
for (int i = 0; i < gt_n_boxes; i++) {
iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);
}
}
}
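// --- Illustrative sketch, not part of this commit ---
// Each thread handles one predicted convex (9 points = 18 floats) and loops
// over all gt boxes (4 corners = 8 floats), filling an
// (ex_n_boxes x gt_n_boxes) IoU matrix. A plausible launch (assumption):
//   convex_iou_cuda_kernel<float>
//       <<<GET_BLOCKS(ex_n_boxes), THREADS_PER_BLOCK>>>(
//           ex_n_boxes, gt_n_boxes, ex_boxes_ptr, gt_boxes_ptr, iou_ptr);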
#endif // CONVEX_IOU_CUDA_KERNEL_CUH
@@ -29,8 +29,8 @@ using namespace torch;
#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
#define THREADS_FORWARD 32
#define THREADS_BACKWARD 16
#define WARP_SIZE 32
#define FULL_MASK 0xffffffff
template <typename scalar_t>
__global__ void correlation_forward_cuda_kernel(
@@ -42,8 +42,8 @@ __global__ void correlation_forward_cuda_kernel(
const int C = rInput1.size(3);
const int n = blockIdx.x;
const int h = blockIdx.y;
const int w = blockIdx.z;
const int h = blockIdx.y * blockDim.y + threadIdx.y;
const int w = blockIdx.z * blockDim.z + threadIdx.z;
const int thread = threadIdx.x;
const int start_i = -padH + h * dH;
@@ -52,13 +52,11 @@ __global__ void correlation_forward_cuda_kernel(
const int patchRadH = dilation_patchH * (patchH - 1) / 2;
const int patchRadW = dilation_patchW * (patchW - 1) / 2;
__shared__ scalar_t prod_sum[THREADS_FORWARD];
for (int ph = 0; ph < patchH; ++ph) {
int ph_dilated = ph * dilation_patchH - patchRadH;
for (int pw = 0; pw < patchW; ++pw) {
int pw_dilated = pw * dilation_patchW - patchRadW;
prod_sum[thread] = 0;
scalar_t prod_sum = 0.0f;
for (int i = 0; i < kH; ++i) {
int i1 = start_i + i * dilationH;
int i2 = i1 + ph_dilated;
@@ -69,23 +67,20 @@ __global__ void correlation_forward_cuda_kernel(
int j2 = j1 + pw_dilated;
if
WITHIN_BOUNDS(j1, j2, iW, iW) {
for (int c = thread; c < C; c += THREADS_FORWARD) {
for (int c = thread; c < C; c += WARP_SIZE) {
scalar_t v1 = rInput1[n][i1][j1][c];
scalar_t v2 = rInput2[n][i2][j2][c];
prod_sum[thread] += v1 * v2;
prod_sum += v1 * v2;
}
}
}
}
}
// accumulate
__syncthreads();
for (int offset = 16; offset > 0; offset /= 2)
prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
if (thread == 0) {
scalar_t reduce_sum = 0;
for (int index = 0; index < THREADS_FORWARD; ++index) {
reduce_sum += prod_sum[index];
}
output[n][ph][pw][h][w] = reduce_sum;
output[n][ph][pw][h][w] = prod_sum;
}
}
}
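// --- Note, not part of this commit ---
// The rewritten reduction: each of the 32 lanes accumulates a partial dot
// product over the channels, then __shfl_down_sync with offsets 16, 8, 4,
// 2, 1 folds the warp's partials into lane 0 in log2(32) = 5 steps,
// replacing the old THREADS_FORWARD shared-memory buffer and its
// __syncthreads() round trip.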
@@ -97,9 +92,10 @@ __global__ void correlation_backward_cuda_kernel_input1(
TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
const int patchW, const int padH, const int padW, const int dilationH,
const int dilationW, const int dilation_patchH, const int dilation_patchW,
const int dH, const int dW, const int batch) {
const int iH = input2.size(2);
const int iW = input2.size(3);
const int dH, const int dW) {
const int iH = input2.size(1);
const int iW = input2.size(2);
const int C = input2.size(3);
const int H = grad_output.size(3);
const int W = grad_output.size(4);
@@ -107,54 +103,53 @@ __global__ void correlation_backward_cuda_kernel_input1(
const int patchRadH = (patchH - 1) / 2;
const int patchRadW = (patchW - 1) / 2;
const int n = batch;
const int c = blockIdx.x;
const int n = blockIdx.x;
const int h = blockIdx.y;
const int w = blockIdx.z;
const int ph_off = threadIdx.x;
const int pw_off = threadIdx.y;
const int h_2 = h + padH;
const int w_2 = w + padW;
const int min_h = h_2 - kH * dilationH;
const int min_w = w_2 - kW * dilationW;
__shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
prod_sum[ph_off][pw_off] = 0;
for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
const int ph = i / patchW;
const int pw = i % patchW;
int i1 = h + dilation_patchH * (ph - patchRadH);
for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
int j1 = w + dilation_patchW * (pw - patchRadW);
if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
scalar_t val = input2[n][c][i1][j1];
for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
int i2 = (h_3) / dH;
if (i2 * dH != h_3) continue;
for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
int j2 = (w_3) / dW;
if (j2 * dW != w_3) continue;
if
WITHIN_BOUNDS(i2, j2, H, W) {
prod_sum[ph_off][pw_off] +=
grad_output[n][ph][pw][i2][j2] * val;
}
int j1 = w + dilation_patchW * (pw - patchRadW);
if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
scalar_t grad_val = 0.0f;
for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
int i2 = (h_3) / dH;
if (i2 * dH != h_3) continue;
for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
int j2 = (w_3) / dW;
if (j2 * dW != w_3) continue;
if (WITHIN_BOUNDS(i2, j2, H, W)) {
grad_val += grad_output[n][ph][pw][i2][j2];
}
}
}
grad_cache[i] = grad_val;
}
}
__syncthreads();
if (ph_off == 0 && pw_off == 0) {
scalar_t reduce_sum = 0;
for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
reduce_sum += prod_sum[ph][pw];
for (int c = threadIdx.x; c < C; c += blockDim.x) {
scalar_t grad_input_val = 0.0f;
for (int ph = 0; ph < patchH; ++ph) {
int i1 = h + dilation_patchH * (ph - patchRadH);
for (int pw = 0; pw < patchW; ++pw) {
int j1 = w + dilation_patchW * (pw - patchRadW);
if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
}
}
}
grad_input1[n][c][h][w] = reduce_sum;
grad_input1[n][c][h][w] = grad_input_val;
}
}
@@ -163,9 +158,10 @@ __global__ void correlation_backward_cuda_kernel_input2(
const TensorAcc5R grad_output, const TensorAcc4R input1,
TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
int padW, int dilationH, int dilationW, int dilation_patchH,
int dilation_patchW, int dH, int dW, int batch) {
const int iH = input1.size(2);
const int iW = input1.size(3);
int dilation_patchW, int dH, int dW) {
const int iH = input1.size(1);
const int iW = input1.size(2);
const int C = input1.size(3);
const int patchRadH = (patchH - 1) / 2;
const int patchRadW = (patchW - 1) / 2;
@@ -176,56 +172,54 @@ __global__ void correlation_backward_cuda_kernel_input2(
const int dilatedKH = kH * dilationH;
const int dilatedKW = kW * dilationW;
const int n = batch;
const int c = blockIdx.x;
const int n = blockIdx.x;
const int h = blockIdx.y;
const int w = blockIdx.z;
const int ph_off = threadIdx.x;
const int pw_off = threadIdx.y;
__shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
prod_sum[ph_off][pw_off] = 0;
for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
const int ph = i / patchW;
const int pw = i % patchW;
int i1 = h - dilation_patchH * (ph - patchRadH);
for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
int j1 = w - dilation_patchW * (pw - patchRadW);
if
WITHIN_BOUNDS(i1, j1, iH, iW) {
scalar_t val = input1[n][c][i1][j1];
const int h_2 = i1 + padH;
const int w_2 = j1 + padW;
const int min_h = h_2 - dilatedKH;
const int min_w = w_2 - dilatedKW;
for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
int i2 = (h_3) / dH;
if (i2 * dH != h_3) continue;
for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
int j2 = (w_3) / dW;
if (j2 * dW != w_3) continue;
if
WITHIN_BOUNDS(i2, j2, H, W) {
prod_sum[ph_off][pw_off] +=
grad_output[n][ph][pw][i2][j2] * val;
}
}
int j1 = w - dilation_patchW * (pw - patchRadW);
if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
scalar_t grad_val = 0.0f;
const int h_2 = i1 + padH;
const int w_2 = j1 + padW;
const int min_h = h_2 - dilatedKH;
const int min_w = w_2 - dilatedKW;
for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
int i2 = (h_3) / dH;
if (i2 * dH != h_3) continue;
for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
int j2 = (w_3) / dW;
if (j2 * dW != w_3) continue;
if (WITHIN_BOUNDS(i2, j2, H, W)) {
grad_val += grad_output[n][ph][pw][i2][j2];
}
}
}
grad_cache[i] = grad_val;
}
}
__syncthreads();
if (ph_off == 0 && pw_off == 0) {
scalar_t reduce_sum = 0;
for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
reduce_sum += prod_sum[ph][pw];
for (int c = threadIdx.x; c < C; c += blockDim.x) {
scalar_t grad_input_val = 0.0f;
for (int ph = 0; ph < patchH; ++ph) {
int i1 = h - dilation_patchH * (ph - patchRadH);
for (int pw = 0; pw < patchW; ++pw) {
int j1 = w - dilation_patchW * (pw - patchRadW);
if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
}
}
}
grad_input2[n][c][h][w] = reduce_sum;
grad_input2[n][c][h][w] = grad_input_val;
}
}
#endif
// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAX_NUM_VERT_IDX 9
#define INTERSECTION_OFFSET 8
#define EPSILON 1e-8
inline int opt_n_thread(int work_size) {
const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
return max(min(1 << pow_2, THREADS_PER_BLOCK), 1);
}
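// --- Note, not part of this commit ---
// opt_n_thread rounds work_size down to a power of two and clamps the
// result to [1, THREADS_PER_BLOCK]: e.g. 100 -> 64, 600 -> 512, 1 -> 1.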
/*
compare normalized vertices (vertices around (0,0))
if vertex1 < vertex2 return true.
order: smallest along the positive x-axis, increasing counter-clockwise
*/
__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {
if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON)
return false; // if equal, return false
if (y1 > 0 && y2 < 0) return true;
if (y1 < 0 && y2 > 0) return false;
float n1 = x1 * x1 + y1 * y1 + EPSILON;
float n2 = x2 * x2 + y2 * y2 + EPSILON;
float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2;
if (y1 > 0 && y2 > 0) {
if (diff > EPSILON)
return true;
else
return false;
}
if (y1 < 0 && y2 < 0) {
if (diff < EPSILON)
return true;
else
return false;
}
  return false;  // y1 or y2 is zero: give all control paths a defined return
}
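// --- Note, not part of this commit ---
// Ordering trick: within each half-plane, |x| * x / (x^2 + y^2), i.e.
// sign(cos t) * cos^2(t) for polar angle t, is monotone in t (decreasing
// for y > 0, increasing for y < 0), so comparing it orders vertices by
// angle counter-clockwise from the positive x-axis without calling atan2;
// the EPSILON in the denominator guards points at the origin.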
__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel(
int b, int n, int m, const float *__restrict__ vertices,
const bool *__restrict__ mask, const int *__restrict__ num_valid,
int *__restrict__ idx) {
int batch_idx = blockIdx.x;
vertices += batch_idx * n * m * 2;
mask += batch_idx * n * m;
num_valid += batch_idx * n;
idx += batch_idx * n * MAX_NUM_VERT_IDX;
int index = threadIdx.x; // index of polygon
int stride = blockDim.x;
for (int i = index; i < n; i += stride) {
int pad; // index of arbitrary invalid intersection point (not box corner!)
for (int j = INTERSECTION_OFFSET; j < m; ++j) {
if (!mask[i * m + j]) {
pad = j;
break;
}
}
if (num_valid[i] < 3) {
// not enough vertices, take an invalid intersection point
// (zero padding)
for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {
idx[i * MAX_NUM_VERT_IDX + j] = pad;
}
} else {
// sort the valid vertices
// note the number of valid vertices is known
// note: check that num_valid[i] < MAX_NUM_VERT_IDX
for (int j = 0; j < num_valid[i]; ++j) {
// initialize with a "big" value
float x_min = 1;
float y_min = -EPSILON;
int i_take = 0;
int i2;
float x2, y2;
if (j != 0) {
i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];
x2 = vertices[i * m * 2 + i2 * 2 + 0];
y2 = vertices[i * m * 2 + i2 * 2 + 1];
}
for (int k = 0; k < m; ++k) {
float x = vertices[i * m * 2 + k * 2 + 0];
float y = vertices[i * m * 2 + k * 2 + 1];
if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {
if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) {
x_min = x;
y_min = y;
i_take = k;
}
}
}
idx[i * MAX_NUM_VERT_IDX + j] = i_take;
}
// duplicate the first idx
idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0];
// pad zeros
for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) {
idx[i * MAX_NUM_VERT_IDX + j] = pad;
}
// for corner case: the two boxes are exactly the same.
// in this case, idx would have duplicate elements, which makes the
// shoelace formula broken because of the definition, the duplicate
// elements only appear in the first 8 positions (they are "corners in
// box", not "intersection of edges")
if (num_valid[i] == 8) {
int counter = 0;
for (int j = 0; j < 4; ++j) {
int check = idx[i * MAX_NUM_VERT_IDX + j];
for (int k = 4; k < INTERSECTION_OFFSET; ++k) {
if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;
}
}
if (counter == 4) {
idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];
for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {
idx[i * MAX_NUM_VERT_IDX + j] = pad;
}
}
}
// TODO: still might need to cover some other corner cases :(
}
}
}
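// --- Illustrative sketch, not part of this commit ---
// A plausible launch (assumption): one block per batch element, threads
// striding over the n box pairs, with m = 24 candidate vertices per pair
// (8 box corners + up to 16 edge intersections, hence INTERSECTION_OFFSET)
// and idx sized b * n * MAX_NUM_VERT_IDX.
//   diff_iou_rotated_sort_vertices_forward_cuda_kernel<<<b, opt_n_thread(n)>>>(
//       b, n, m, vertices_ptr, mask_ptr, num_valid_ptr, idx_ptr);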
@@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
points += bs_idx * c * n + c_idx * n;
out[0] = points[idx[0]];
CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b || c_idx >= c) return;
out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
points += bs_idx * c * n + c_idx * n;
out[0] = points[idx[0]];
}
}
template <typename T>
@@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b || c_idx >= c) return;
grad_out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
grad_points += bs_idx * c * n + c_idx * n;
grad_out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
grad_points += bs_idx * c * n + c_idx * n;
atomicAdd(grad_points + idx[0], grad_out[0]);
atomicAdd(grad_points + idx[0], grad_out[0]);
}
}
#endif // GATHER_POINTS_CUDA_KERNEL_CUH
@@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n,
// out: (B, C, npoints, nsample)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
if (bs_idx >= b || c_idx >= c) return;
int sample_idx = index % nsample;
int pt_idx = index / nsample;
int sample_idx = index % nsample;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int in_idx = bs_idx * c * n + c_idx * n + idx[0];
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int in_idx = bs_idx * c * n + c_idx * n + idx[0];
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
out[out_idx] = points[in_idx];
out[out_idx] = points[in_idx];
}
}
template <typename T>
@@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n,
// grad_points: (B, C, N)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c) return;
int sample_idx = index % nsample;
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int sample_idx = index % nsample;
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}
}
#endif // GROUP_POINTS_CUDA_KERNEL_CUH
@@ -50,21 +50,17 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2,
}
__device__ inline int check_in_box2d(const float *box, const Point &p) {
// params: box (5) [x1, y1, x2, y2, angle]
const float MARGIN = 1e-5;
float center_x = (box[0] + box[2]) / 2;
float center_y = (box[1] + box[3]) / 2;
float angle_cos = cos(-box[4]),
angle_sin =
sin(-box[4]); // rotate the point in the opposite direction of box
float rot_x =
(p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x;
float rot_y =
(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
// params: box (7) [x, y, z, dx, dy, dz, heading]
const float MARGIN = 1e-2;
float center_x = box[0], center_y = box[1];
// rotate the point in the opposite direction of box
float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);
float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
return (fabs(rot_x) < box[3] / 2 + MARGIN &&
fabs(rot_y) < box[4] / 2 + MARGIN);
}
__device__ inline int intersection(const Point &p1, const Point &p0,
@@ -116,16 +112,19 @@ __device__ inline int point_cmp(const Point &a, const Point &b,
}
__device__ inline float box_overlap(const float *box_a, const float *box_b) {
// params: box_a (5) [x1, y1, x2, y2, angle]
// params: box_b (5) [x1, y1, x2, y2, angle]
// params box_a: [x, y, z, dx, dy, dz, heading]
// params box_b: [x, y, z, dx, dy, dz, heading]
float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3],
a_angle = box_a[4];
float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3],
b_angle = box_b[4];
float a_angle = box_a[6], b_angle = box_b[6];
float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2);
Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2);
Point center_a(box_a[0], box_a[1]);
Point center_b(box_b[0], box_b[1]);
Point box_a_corners[5];
box_a_corners[0].set(a_x1, a_y1);
@@ -209,10 +208,10 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
}
__device__ inline float iou_bev(const float *box_a, const float *box_b) {
// params: box_a (5) [x1, y1, x2, y2, angle]
// params: box_b (5) [x1, y1, x2, y2, angle]
float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
// params box_a: [x, y, z, dx, dy, dz, heading]
// params box_b: [x, y, z, dx, dy, dz, heading]
float sa = box_a[3] * box_a[4];
float sb = box_b[3] * box_b[4];
float s_overlap = box_overlap(box_a, box_b);
return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
}
@@ -220,149 +219,148 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) {
__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel(
const int num_a, const float *boxes_a, const int num_b,
const float *boxes_b, float *ans_overlap) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
if (a_idx >= num_a || b_idx >= num_b) {
return;
}
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float s_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = s_overlap;
}
__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a,
const float *boxes_a,
const int num_b,
const float *boxes_b,
float *ans_iou) {
const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
// params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) {
if (a_idx >= num_a || b_idx >= num_b) {
return;
}
if (a_idx >= num_a || b_idx >= num_b) {
return;
const float *cur_box_a = boxes_a + a_idx * 7;
const float *cur_box_b = boxes_b + b_idx * 7;
float cur_overlap = box_overlap(cur_box_a, cur_box_b);
ans_overlap[a_idx * num_b + b_idx] = cur_overlap;
}
const float *cur_box_a = boxes_a + a_idx * 5;
const float *cur_box_b = boxes_b + b_idx * 5;
float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
}
__global__ void nms_forward_cuda_kernel(const int boxes_num,
const float nms_overlap_thresh,
const float *boxes,
unsigned long long *mask) {
// params: boxes (N, 5) [x1, y1, x2, y2, ry]
__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num,
const float nms_overlap_thresh,
const float *boxes,
unsigned long long *mask) {
// params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 7 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
block_boxes[threadIdx.x * 7 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
block_boxes[threadIdx.x * 7 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
block_boxes[threadIdx.x * 7 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
block_boxes[threadIdx.x * 7 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
block_boxes[threadIdx.x * 7 + 5] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
block_boxes[threadIdx.x * 7 + 6] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
}
__syncthreads();
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 5;
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 7;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
mask[cur_box_idx * col_blocks + col_start] = t;
}
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
}
}
__device__ inline float iou_normal(float const *const a, float const *const b) {
float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
// params: a: [x, y, z, dx, dy, dz, heading]
// params: b: [x, y, z, dx, dy, dz, heading]
float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),
right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),
bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0]) * (a[3] - a[1]);
float Sb = (b[2] - b[0]) * (b[3] - b[1]);
float Sa = a[3] * a[4];
float Sb = b[3] * b[4];
return interS / fmaxf(Sa + Sb - interS, EPS);
}
__global__ void nms_normal_forward_cuda_kernel(const int boxes_num,
const float nms_overlap_thresh,
const float *boxes,
unsigned long long *mask) {
// params: boxes (N, 5) [x1, y1, x2, y2, ry]
__global__ void iou3d_nms3d_normal_forward_cuda_kernel(
const int boxes_num, const float nms_overlap_thresh, const float *boxes,
unsigned long long *mask) {
// params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
const int blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
// if (row_start > col_start) return;
const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
THREADS_PER_BLOCK_NMS);
__shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 7 + 0] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
block_boxes[threadIdx.x * 7 + 1] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
block_boxes[threadIdx.x * 7 + 2] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
block_boxes[threadIdx.x * 7 + 3] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
block_boxes[threadIdx.x * 7 + 4] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
block_boxes[threadIdx.x * 7 + 5] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
block_boxes[threadIdx.x * 7 + 6] =
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 5;
if (threadIdx.x < row_size) {
const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
const float *cur_box = boxes + cur_box_idx * 7;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks =
(boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
mask[cur_box_idx * col_blocks + col_start] = t;
}
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
mask[cur_box_idx * col_blocks + col_start] = t;
}
}
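// --- Illustrative sketch, not part of this commit ---
// How the host side typically consumes the 64-bit masks written above
// (standard CUDA NMS post-processing; variable names are hypothetical):
//   const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
//   std::vector<unsigned long long> remv(col_blocks, 0);
//   std::vector<int> keep;
//   for (int i = 0; i < boxes_num; ++i) {
//     int block = i / THREADS_PER_BLOCK_NMS, bit = i % THREADS_PER_BLOCK_NMS;
//     if (!(remv[block] & (1ULL << bit))) {
//       keep.push_back(i);  // box i survives; suppress everything it covers
//       for (int j = block; j < col_blocks; ++j)
//         remv[j] |= mask[i * col_blocks + j];
//     }
//   }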
@@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
const T *xyz, const T *new_xyz,
int *__restrict__ idx, T *dist2) {
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= m) return;
CUDA_1D_KERNEL_LOOP(pt_idx, m) {
if (bs_idx >= b) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
dist2 += bs_idx * m * nsample + pt_idx * nsample;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
dist2 += bs_idx * m * nsample + pt_idx * nsample;
T new_x = new_xyz[0];
T new_y = new_xyz[1];
T new_z = new_xyz[2];
T new_x = new_xyz[0];
T new_y = new_xyz[1];
T new_z = new_xyz[2];
float best_dist[100];
int best_idx[100];
for (int i = 0; i < nsample; i++) {
best_dist[i] = 1e10;
best_idx[i] = 0;
}
for (int i = 0; i < n; i++) {
T x = xyz[i * 3 + 0];
T y = xyz[i * 3 + 1];
T z = xyz[i * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 < best_dist[0]) {
best_dist[0] = d2;
best_idx[0] = i;
reheap(best_dist, best_idx, nsample);
float best_dist[100];
int best_idx[100];
for (int i = 0; i < nsample; i++) {
best_dist[i] = 1e10;
best_idx[i] = 0;
}
for (int i = 0; i < n; i++) {
T x = xyz[i * 3 + 0];
T y = xyz[i * 3 + 1];
T z = xyz[i * 3 + 2];
T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 < best_dist[0]) {
best_dist[0] = d2;
best_idx[0] = i;
reheap(best_dist, best_idx, nsample);
}
}
heap_sort(best_dist, best_idx, nsample);
for (int i = 0; i < nsample; i++) {
idx[i] = best_idx[i];
dist2[i] = best_dist[i];
}
}
heap_sort(best_dist, best_idx, nsample);
for (int i = 0; i < nsample; i++) {
idx[i] = best_idx[i];
dist2[i] = best_dist[i];
}
}
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#define MAXN 20
__device__ const float PI = 3.1415926;
struct Point {
float x, y;
__device__ Point() {}
__device__ Point(float x, float y) : x(x), y(y) {}
};
__device__ inline void swap1(Point *a, Point *b) {
Point temp;
temp.x = a->x;
temp.y = a->y;
a->x = b->x;
a->y = b->y;
b->x = temp.x;
b->y = temp.y;
}
__device__ inline float cross(Point o, Point a, Point b) {
return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
}
__device__ inline float dis(Point a, Point b) {
return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
}
__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) {
float convex_points[2][MAXN];
for (int j = 0; j < n_points; j++) {
convex_points[0][j] = ps[j].x;
}
for (int j = 0; j < n_points; j++) {
convex_points[1][j] = ps[j].y;
}
Point edges[MAXN];
float edges_angles[MAXN];
float unique_angles[MAXN];
int n_edges = n_points - 1;
int n_unique = 0;
int unique_flag = 0;
for (int i = 0; i < n_edges; i++) {
edges[i].x = ps[i + 1].x - ps[i].x;
edges[i].y = ps[i + 1].y - ps[i].y;
}
for (int i = 0; i < n_edges; i++) {
edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);
if (edges_angles[i] >= 0) {
edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);
} else {
edges_angles[i] =
edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);
}
}
unique_angles[0] = edges_angles[0];
n_unique += 1;
for (int i = 1; i < n_edges; i++) {
for (int j = 0; j < n_unique; j++) {
if (edges_angles[i] == unique_angles[j]) {
unique_flag += 1;
}
}
if (unique_flag == 0) {
unique_angles[n_unique] = edges_angles[i];
n_unique += 1;
unique_flag = 0;
} else {
unique_flag = 0;
}
}
float minarea = 1e12;
for (int i = 0; i < n_unique; i++) {
float R[2][2];
float rot_points[2][MAXN];
R[0][0] = cos(unique_angles[i]);
R[0][1] = sin(unique_angles[i]);
R[1][0] = -sin(unique_angles[i]);
R[1][1] = cos(unique_angles[i]);
// R x Points
for (int m = 0; m < 2; m++) {
for (int n = 0; n < n_points; n++) {
float sum = 0.0;
for (int k = 0; k < 2; k++) {
sum = sum + R[m][k] * convex_points[k][n];
}
rot_points[m][n] = sum;
}
}
// xmin;
float xmin, ymin, xmax, ymax;
xmin = 1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
continue;
} else {
if (rot_points[0][j] < xmin) {
xmin = rot_points[0][j];
}
}
}
// ymin
ymin = 1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
continue;
} else {
if (rot_points[1][j] < ymin) {
ymin = rot_points[1][j];
}
}
}
// xmax
xmax = -1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
continue;
} else {
if (rot_points[0][j] > xmax) {
xmax = rot_points[0][j];
}
}
}
// ymax
ymax = -1e12;
for (int j = 0; j < n_points; j++) {
if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
continue;
} else {
if (rot_points[1][j] > ymax) {
ymax = rot_points[1][j];
}
}
}
float area = (xmax - xmin) * (ymax - ymin);
if (area < minarea) {
minarea = area;
minbox[0] = unique_angles[i];
minbox[1] = xmin;
minbox[2] = ymin;
minbox[3] = xmax;
minbox[4] = ymax;
}
}
}
// convex_find
__device__ inline void Jarvis(Point *in_poly, int &n_poly) {
int n_input = n_poly;
Point input_poly[20];
for (int i = 0; i < n_input; i++) {
input_poly[i].x = in_poly[i].x;
input_poly[i].y = in_poly[i].y;
}
Point p_max, p_k;
int max_index, k_index;
int Stack[20], top1, top2;
// float sign;
double sign;
Point right_point[10], left_point[10];
for (int i = 0; i < n_poly; i++) {
if (in_poly[i].y < in_poly[0].y ||
in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
Point *j = &(in_poly[0]);
Point *k = &(in_poly[i]);
swap1(j, k);
}
if (i == 0) {
p_max = in_poly[0];
max_index = 0;
}
if (in_poly[i].y > p_max.y ||
in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
p_max = in_poly[i];
max_index = i;
}
}
if (max_index == 0) {
max_index = 1;
p_max = in_poly[max_index];
}
k_index = 0, Stack[0] = 0, top1 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
dis(in_poly[Stack[top1]], p_k)))) {
p_k = in_poly[i];
k_index = i;
}
}
top1++;
Stack[top1] = k_index;
}
for (int i = 0; i <= top1; i++) {
right_point[i] = in_poly[Stack[i]];
}
k_index = 0, Stack[0] = 0, top2 = 0;
while (k_index != max_index) {
p_k = p_max;
k_index = max_index;
for (int i = 1; i < n_poly; i++) {
sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
dis(in_poly[Stack[top2]], p_k))) {
p_k = in_poly[i];
k_index = i;
}
}
top2++;
Stack[top2] = k_index;
}
for (int i = top2 - 1; i >= 0; i--) {
left_point[i] = in_poly[Stack[i]];
}
for (int i = 0; i < top1 + top2; i++) {
if (i <= top1) {
in_poly[i] = right_point[i];
} else {
in_poly[i] = left_point[top2 - (i - top1)];
}
}
n_poly = top1 + top2;
}
template <typename T>
__device__ inline void Findminbox(T const *const p, T *minpoints) {
Point ps1[MAXN];
Point convex[MAXN];
for (int i = 0; i < 9; i++) {
convex[i].x = p[i * 2];
convex[i].y = p[i * 2 + 1];
}
int n_convex = 9;
Jarvis(convex, n_convex);
int n1 = n_convex;
for (int i = 0; i < n1; i++) {
ps1[i].x = convex[i].x;
ps1[i].y = convex[i].y;
}
ps1[n1].x = convex[0].x;
ps1[n1].y = convex[0].y;
float minbbox[5] = {0};
minBoundingRect(ps1, n1 + 1, minbbox);
float angle = minbbox[0];
float xmin = minbbox[1];
float ymin = minbbox[2];
float xmax = minbbox[3];
float ymax = minbbox[4];
float R[2][2];
R[0][0] = cos(angle);
R[0][1] = sin(angle);
R[1][0] = -sin(angle);
R[1][1] = cos(angle);
minpoints[0] = xmax * R[0][0] + ymin * R[1][0];
minpoints[1] = xmax * R[0][1] + ymin * R[1][1];
minpoints[2] = xmin * R[0][0] + ymin * R[1][0];
minpoints[3] = xmin * R[0][1] + ymin * R[1][1];
minpoints[4] = xmin * R[0][0] + ymax * R[1][0];
minpoints[5] = xmin * R[0][1] + ymax * R[1][1];
minpoints[6] = xmax * R[0][0] + ymax * R[1][0];
minpoints[7] = xmax * R[0][1] + ymax * R[1][1];
}
template <typename T>
__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes,
const T *ex_boxes, T *minbox) {
CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
const T *cur_box = ex_boxes + index * 18;
T *cur_min_box = minbox + index * 8;
Findminbox(cur_box, cur_min_box);
}
}
#endif // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
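// A minimal host-side launch sketch for the kernel above, assuming `pointsets`
// and `polygons` are device-resident float buffers of shape (N, 18) and
// (N, 8), and that GET_BLOCKS/THREADS_PER_BLOCK come from the common CUDA
// helpers (launcher name and stream handling are illustrative):
//
// void min_area_polygons_cuda_launch(const float *pointsets, float *polygons,
//                                    int num_pointsets, cudaStream_t stream) {
//   min_area_polygons_cuda_kernel<float>
//       <<<GET_BLOCKS(num_pointsets, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
//          stream>>>(num_pointsets, pointsets, polygons);
// }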
......@@ -14,11 +14,6 @@
#include "common_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(
const scalar_t *&bottom_data, const int &height, const int &width,
......@@ -267,10 +262,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
const int channels, const int num_levels, const int num_query,
const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight) {
CUDA_1D_KERNEL_LOOP(index, n) {
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
......@@ -285,11 +281,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
scalar_t *grad_sampling_loc_out =
grad_sampling_loc + (grad_sampling_ptr << 1);
scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col = 0; l_col < num_levels; ++l_col) {
......@@ -326,23 +322,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
_grad_h = cache_grad_sampling_loc[1],
_grad_a = cache_grad_attn_weight[0];
int sid = 2;
for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[_tid];
sid += 2;
}
*grad_sampling_loc_out = _grad_w;
*(grad_sampling_loc_out + 1) = _grad_h;
*grad_attn_weight_out = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight_out += grad_weight_stride;
grad_sampling_loc_out += grad_loc_stride;
}
}
}
......@@ -357,10 +353,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
const int channels, const int num_levels, const int num_query,
const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight) {
CUDA_1D_KERNEL_LOOP(index, n) {
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
......@@ -375,8 +371,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
scalar_t *grad_sampling_loc_out =
grad_sampling_loc + (grad_sampling_ptr << 1);
scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
......@@ -425,16 +422,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
}
if (tid == 0) {
*grad_sampling_loc_out = cache_grad_sampling_loc[0];
*(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight_out = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight_out += grad_weight_stride;
grad_sampling_loc_out += grad_loc_stride;
}
}
}
......@@ -449,11 +446,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
const int channels, const int num_levels, const int num_query,
const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight) {
CUDA_1D_KERNEL_LOOP(index, n) {
extern __shared__ int _s[];
scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
......@@ -468,8 +465,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
scalar_t *grad_sampling_loc_out =
grad_sampling_loc + (grad_sampling_ptr << 1);
scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
......@@ -509,23 +507,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
_grad_h = cache_grad_sampling_loc[1],
_grad_a = cache_grad_attn_weight[0];
int sid = 2;
for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[_tid];
sid += 2;
}
*grad_sampling_loc_out = _grad_w;
*(grad_sampling_loc_out + 1) = _grad_h;
*grad_attn_weight_out = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight_out += grad_weight_stride;
grad_sampling_loc_out += grad_loc_stride;
}
}
}
......@@ -540,11 +538,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
const int channels, const int num_levels, const int num_query,
const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight) {
CUDA_1D_KERNEL_LOOP(index, n) {
extern __shared__ int _s[];
scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
......@@ -559,8 +557,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
scalar_t *grad_sampling_loc_out =
grad_sampling_loc + (grad_sampling_ptr << 1);
scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
......@@ -618,16 +617,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
}
if (tid == 0) {
*grad_sampling_loc_out = cache_grad_sampling_loc[0];
*(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight_out = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight_out += grad_weight_stride;
grad_sampling_loc_out += grad_loc_stride;
}
}
}
......@@ -642,11 +641,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
const int channels, const int num_levels, const int num_query,
const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight) {
CUDA_1D_KERNEL_LOOP(index, n) {
extern __shared__ int _s[];
scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
......@@ -661,8 +660,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
scalar_t *grad_sampling_loc_out =
grad_sampling_loc + (grad_sampling_ptr << 1);
scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
......@@ -720,16 +720,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
}
if (tid == 0) {
atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);
atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);
atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight_out += grad_weight_stride;
grad_sampling_loc_out += grad_loc_stride;
}
}
}
......@@ -759,8 +759,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
scalar_t *grad_sampling_loc_out =
grad_sampling_loc + (grad_sampling_ptr << 1);
scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
......@@ -787,12 +788,12 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
ms_deform_attn_col2im_bilinear_gm(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
grad_sampling_loc_out, grad_attn_weight_out);
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight_out += grad_weight_stride;
grad_sampling_loc_out += grad_loc_stride;
}
}
}
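// All of the col2im variants above compute the same gradients and differ only
// in how the per-block reduction over sampling points is performed. A sketch
// of how a host dispatcher might choose between them (assumed logic, not the
// exact mmcv dispatch): the shared-memory kernels need
// 3 * blockDim.x * sizeof(scalar_t) bytes of cache, so they are only usable
// while that fits in the device's shared-memory budget.
//
// const size_t shm_bytes = 3 * num_threads * sizeof(scalar_t);
// if (shm_bytes <= max_shm_per_block) {
//   ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
//       <<<grid, num_threads, shm_bytes, stream>>>(/* ... */);
// } else {
//   ms_deformable_col2im_gpu_kernel_gm<scalar_t>
//       <<<grid, num_threads, 0, stream>>>(/* ... */);  // atomics, no cache
// }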
......
......@@ -30,45 +30,88 @@ __device__ inline bool devIoU(float const *const a, float const *const b,
__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
const int offset, const float *dev_boxes,
unsigned long long *dev_mask) {
int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
const int tid = threadIdx.x;
if (row_start > col_start) return;
const int row_size =
fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 4];
if (tid < col_size) {
block_boxes[tid * 4 + 0] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
block_boxes[tid * 4 + 1] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
block_boxes[tid * 4 + 2] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
block_boxes[tid * 4 + 3] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
}
__syncthreads();
if (tid < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + tid;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
unsigned long long int t = 0;
int start = 0;
if (row_start == col_start) {
start = tid + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
t |= 1ULL << i;
}
}
dev_mask[cur_box_idx * gridDim.y + col_start] = t;
}
}
}
__global__ void gather_keep_from_mask(bool *keep,
const unsigned long long *dev_mask,
const int n_boxes) {
const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
const int tid = threadIdx.x;
// mark the bboxes which have been removed.
extern __shared__ unsigned long long removed[];
// initialize removed.
for (int i = tid; i < col_blocks; i += blockDim.x) {
removed[i] = 0;
}
__syncthreads();
for (int nblock = 0; nblock < col_blocks; ++nblock) {
auto removed_val = removed[nblock];
__syncthreads();
const int i_offset = nblock * threadsPerBlock;
#pragma unroll
for (int inblock = 0; inblock < threadsPerBlock; ++inblock) {
const int i = i_offset + inblock;
if (i >= n_boxes) break;
      // select a candidate, check if it should be kept.
if (!(removed_val & (1ULL << inblock))) {
if (tid == 0) {
// mark the output.
keep[i] = true;
}
auto p = dev_mask + i * col_blocks;
// remove all bboxes which overlap the candidate.
for (int j = tid; j < col_blocks; j += blockDim.x) {
if (j >= nblock) removed[j] |= p[j];
}
__syncthreads();
removed_val = removed[nblock];
}
}
}
}
#endif // NMS_CUDA_KERNEL_CUH
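// A minimal sketch of how the two kernels above chain together, assuming
// score-sorted boxes in `boxes_dev`, an N x col_blocks bitmask in `mask_dev`,
// and an N-element `keep_dev` output (names and launch shapes illustrative,
// not the exact mmcv launcher):
//
// const int col_blocks = (num_boxes + threadsPerBlock - 1) / threadsPerBlock;
// dim3 blocks(col_blocks, col_blocks);
// nms_cuda<<<blocks, threadsPerBlock, 0, stream>>>(
//     num_boxes, iou_threshold, offset, boxes_dev, mask_dev);
// // A single block then walks the bitmask; dynamic shared memory holds the
// // running "removed" bitmap, one 64-bit word per column block.
// gather_keep_from_mask<<<1, threadsPerBlock,
//                         col_blocks * sizeof(unsigned long long), stream>>>(
//     keep_dev, mask_dev, num_boxes);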
......@@ -43,18 +43,16 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes,
// (x_center, y_center, width, height, angle_degrees) here.
__shared__ T block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
}
__syncthreads();
......@@ -71,7 +69,7 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes,
      // Instead of the devIoU used by the original horizontal nms, here
      // we use the single_box_iou_rotated function from
      // box_iou_rotated_utils.h
if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >
iou_threshold) {
t |= 1ULL << i;
}
......
......@@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel(
// (B, npoints), default -1
int bs_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (bs_idx >= batch_size) return;

    boxes += bs_idx * boxes_num * 7;
    pts += bs_idx * pts_num * 3 + pt_idx * 3;
    box_idx_of_points += bs_idx * pts_num + pt_idx;

    T local_x = 0, local_y = 0;
    int cur_in_flag = 0;
    for (int k = 0; k < boxes_num; k++) {
      cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        box_idx_of_points[0] = k;
        break;
      }
    }
  }
}
......@@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel(
// (B, npoints), default -1
int bs_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (bs_idx >= batch_size) return;

    boxes += bs_idx * boxes_num * 7;
    pts += bs_idx * pts_num * 3 + pt_idx * 3;
    box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;

    T local_x = 0, local_y = 0;
    for (int k = 0; k < boxes_num; k++) {
      const int cur_in_flag =
          check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
      if (cur_in_flag) {
        box_idx_of_points[k] = 1;
      }
    }
  }
}
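// Both kernels above keep the batch index on blockIdx.y and grid-stride over
// points on blockIdx.x, so a plausible launch (helper names and argument
// order assumed for illustration, not the exact mmcv launcher) is:
//
// dim3 blocks(GET_BLOCKS(pts_num, THREADS_PER_BLOCK), batch_size);
// dim3 threads(THREADS_PER_BLOCK);
// points_in_boxes_part_forward_cuda_kernel<float><<<blocks, threads>>>(
//     batch_size, boxes_num, pts_num, boxes_dev, pts_dev, box_idx_dev);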
......
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
struct point {
float x, y;
};
template <typename scalar_t>
__global__ void points_in_polygons_forward_cuda_kernel(
const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,
const int rows, const int cols, scalar_t *inside_flag) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int row = index / cols;
int col = index % cols;
const scalar_t *offset_vertex1 = vertex1 + row * 2;
const scalar_t *offset_vertex2 = vertex2 + col * 8;
point point_[1];
point polygon[4];
point_[0].x = offset_vertex1[0];
point_[0].y = offset_vertex1[1];
polygon[0].x = offset_vertex2[0];
polygon[0].y = offset_vertex2[1];
polygon[1].x = offset_vertex2[2];
polygon[1].y = offset_vertex2[3];
polygon[2].x = offset_vertex2[4];
polygon[2].y = offset_vertex2[5];
polygon[3].x = offset_vertex2[6];
polygon[3].y = offset_vertex2[7];
int nCross = 0;
int i, j;
float sx, sy, tx, ty, px, py, x;
for (i = 0, j = 3; i < 4; j = i, i++) {
sx = polygon[i].x;
sy = polygon[i].y;
tx = polygon[j].x;
ty = polygon[j].y;
px = point_[0].x;
py = point_[0].y;
if (py < min(sy, ty)) continue;
if (py > max(sy, ty)) continue;
if ((sx == px && sy == py) || (tx == px && ty == py)) {
break;
} else {
if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
x = sx + (py - sy) * (tx - sx) / (ty - sy);
if (x == px) {
break;
}
if (x > px) {
nCross++;
}
}
}
}
if (nCross % 2 == 1) {
inside_flag[index] = 1.0;
} else {
inside_flag[index] = 0.0;
}
return;
}
}
#endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
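// The kernel above is a crossing-number (ray-casting) test: cast a horizontal
// ray from the query point toward +x and count how many polygon edges it
// crosses; an odd count means the point is inside. A host-side reference with
// the same core logic, handy for spot-checking one point against one quad
// (illustrative code, not part of the extension; the kernel additionally
// treats points exactly on an edge or vertex as inside):
//
// bool point_in_quad(const float px, const float py, const float *quad) {
//   int n_cross = 0;
//   for (int i = 0, j = 3; i < 4; j = i, i++) {
//     float sx = quad[i * 2], sy = quad[i * 2 + 1];
//     float tx = quad[j * 2], ty = quad[j * 2 + 1];
//     if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
//       float x = sx + (py - sy) * (tx - sx) / (ty - sy);
//       if (x > px) n_cross++;
//     }
//   }
//   return n_cross % 2 == 1;
// }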
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
// Distributed under terms of the MIT license.
#ifndef PRROI_POOL_CUDA_KERNEL_CUH
#define PRROI_POOL_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
const int h,
const int w,
const int height,
const int width) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
T retVal = overflow ? 0.0f : data[h * width + w];
return retVal;
}
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
return (1.0f - abs(dh)) * (1.0f - abs(dw));
}
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
T c1, T c2) {
return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;
}
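// PrRoIPoolingSingleCoorIntegral evaluates, in closed form, the integral of
// the linear interpolant between c1 (at u = 0) and c2 (at u = 1) over [s, t]:
//   int_s^t (c1 + u * (c2 - c1)) du
//     = (t - s) * c1 + 0.5 * (t * t - s * s) * (c2 - c1),
// which is exactly the expression returned above. This closed-form integral
// is what lets Precise RoI Pooling integrate the bilinearly interpolated
// feature map over each bin instead of sampling it at a few discrete points.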
template <typename T>
__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
const T w, const int height,
const int width) {
T retVal = 0.0f;
int h1 = floorf(h);
int w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h);
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
return retVal;
}
template <typename T>
__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
const int s_h, const int s_w,
const int e_h, const int e_w,
const T y0, const T x0,
const T y1, const T x1,
const int h0, const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
T sum_out = 0;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;
return sum_out;
}
template <typename T>
__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
const int h, const int w,
const int height,
const int width,
const T coeff) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);
}
template <typename T>
__device__ static void PrRoIPoolingMatDistributeDiff(
T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
}
template <typename T>
__global__ void prroi_pool_forward_cuda_kernel(
const int nthreads, const T *input, const T *rois, T *output,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
T roi_x1 = offset_rois[1] * spatial_scale;
T roi_y1 = offset_rois[2] * spatial_scale;
T roi_x2 = offset_rois[3] * spatial_scale;
T roi_y2 = offset_rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, ((T)0.0));
T roi_height = max(roi_y2 - roi_y1, ((T)0.0));
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_data =
input + (roi_batch_ind * channels + c) * height * width;
T *this_out = output + index;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
if (bin_size == 0) {
*this_out = 0;
continue;
}
T sum_out = 0;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
sum_out += PrRoIPoolingMatCalculation(
this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
*this_out = sum_out / bin_size;
}
}
template <typename T>
__global__ void prroi_pool_backward_cuda_kernel(
const int nthreads, const T *grad_output, const T *rois, T *grad_input,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_out_grad = grad_output + index;
T *this_data_grad =
grad_input + (roi_batch_ind * channels + c) * height * width;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
PrRoIPoolingMatDistributeDiff(
this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
}
}
template <typename T>
__global__ void prroi_pool_coor_backward_cuda_kernel(
const int nthreads, const T *output, const T *grad_output, const T *input,
const T *rois, T *grad_rois, const int pooled_height,
const int pooled_width, const T spatial_scale, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T output_grad_val = grad_output[index];
const T *this_input_data =
input + (roi_batch_ind * channels + c) * height * width;
const T output_val = output[index];
T *this_rois_grad = grad_rois + n * 5;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;
// WARNING: to be discussed
if (sum_out == 0) return;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
grad_x1_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,
height, width));
grad_x2_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,
height, width));
}
for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),
height, width));
grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),
height, width));
}
T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;
partial_x1 = partial_x1 / bin_size * spatial_scale;
partial_x2 = partial_x2 / bin_size * spatial_scale;
partial_y1 = partial_y1 / bin_size * spatial_scale;
partial_y2 = partial_y2 / bin_size * spatial_scale;
// (index, x1, y1, x2, y2)
this_rois_grad[0] = 0;
atomicAdd(this_rois_grad + 1,
(partial_x1 * (1.0f - T(pw) / pooled_width) +
partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
output_grad_val);
atomicAdd(this_rois_grad + 2,
(partial_y1 * (1.0f - T(ph) / pooled_height) +
partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
output_grad_val);
atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +
partial_x1 * T(pw) / pooled_width) *
output_grad_val);
atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +
partial_y1 * T(ph) / pooled_height) *
output_grad_val);
}
}
#endif  // PRROI_POOL_CUDA_KERNEL_CUH
// Modified from
// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu
#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#include <float.h>
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif // MMCV_USE_PARROTS
/*** Forward ***/
template <typename scalar_t>
__global__ void riroi_align_rotated_forward_cuda_kernel(
const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_rois, const scalar_t spatial_scale,
const int num_samples, const bool clockwise, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int num_orientations, scalar_t *top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int o = (index / pooled_width / pooled_height) % num_orientations;
int c =
(index / pooled_width / pooled_height / num_orientations) % channels;
int n = index / pooled_width / pooled_height / num_orientations / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
int roi_batch_ind = offset_bottom_rois[0];
    // Do not use rounding; this implementation detail is critical
scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
// scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
scalar_t theta = offset_bottom_rois[5];
// Force malformed ROIs to be 1x1
roi_width = max(roi_width, (scalar_t)1.);
roi_height = max(roi_height, (scalar_t)1.);
scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
static_cast<scalar_t>(pooled_height);
scalar_t bin_size_w =
static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);
// find aligned index
scalar_t ind_float = theta * num_orientations / (2 * M_PI);
int ind = floorf(ind_float);
scalar_t l_var = ind_float - (scalar_t)ind;
scalar_t r_var = 1.0 - l_var;
// correct start channel
ind = (ind + num_orientations) % num_orientations;
// rotated channel
int ind_rot = (o - ind + num_orientations) % num_orientations;
int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
const scalar_t *offset_bottom_data =
bottom_data + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot) *
height * width;
const scalar_t *offset_bottom_data_plus =
bottom_data + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot_plus) *
height * width;
    // We use roi_bin_grid to sample the grid and mimic the integral
int roi_bin_grid_h = (num_samples > 0)
? num_samples
: ceilf(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
scalar_t roi_start_h = -roi_height / 2.0;
scalar_t roi_start_w = -roi_width / 2.0;
scalar_t cosscalar_theta = cos(theta);
scalar_t sinscalar_theta = sin(theta);
// We do average (integral) pooling inside a bin
const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
scalar_t output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const scalar_t yy =
roi_start_h + ph * bin_size_h +
static_cast<scalar_t>(iy + .5f) * bin_size_h /
static_cast<scalar_t>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const scalar_t xx = roi_start_w + pw * bin_size_w +
static_cast<scalar_t>(ix + .5f) * bin_size_w /
static_cast<scalar_t>(roi_bin_grid_w);
// Rotate by theta (counterclockwise) around the center and translate
scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;
scalar_t val = bilinear_interpolate<scalar_t>(
offset_bottom_data, height, width, y, x, index);
scalar_t val_plus = bilinear_interpolate<scalar_t>(
offset_bottom_data_plus, height, width, y, x, index);
output_val += r_var * val + l_var * val_plus;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
/*** Backward ***/
template <typename scalar_t>
__global__ void riroi_align_rotated_backward_cuda_kernel(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
const scalar_t spatial_scale, const int num_samples, const bool clockwise,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int num_orientations,
scalar_t *bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int o = (index / pooled_width / pooled_height) % num_orientations;
int c =
(index / pooled_width / pooled_height / num_orientations) % channels;
int n = index / pooled_width / pooled_height / num_orientations / channels;
const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
int roi_batch_ind = offset_bottom_rois[0];
// Do not round
scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
// scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
scalar_t theta = offset_bottom_rois[5];
// Force malformed ROIs to be 1x1
roi_width = max(roi_width, (scalar_t)1.);
roi_height = max(roi_height, (scalar_t)1.);
scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
static_cast<scalar_t>(pooled_height);
scalar_t bin_size_w =
static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);
// find aligned index
scalar_t ind_float = theta * num_orientations / (2 * M_PI);
int ind = floorf(ind_float);
scalar_t l_var = ind_float - (scalar_t)ind;
scalar_t r_var = 1.0 - l_var;
// correct start channel
ind = (ind + num_orientations) % num_orientations;
// rotated channel
int ind_rot = (o - ind + num_orientations) % num_orientations;
int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
scalar_t *offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot) *
height * width;
scalar_t *offset_bottom_diff_plus =
bottom_diff + (roi_batch_ind * channels * num_orientations +
c * num_orientations + ind_rot_plus) *
height * width;
int top_offset =
(n * channels * num_orientations + c * num_orientations + o) *
pooled_height * pooled_width;
const scalar_t *offset_top_diff = top_diff + top_offset;
const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
    // We use roi_bin_grid to sample the grid and mimic the integral
int roi_bin_grid_h = (num_samples > 0)
? num_samples
: ceilf(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
if (clockwise) {
theta = -theta; // If clockwise, the angle needs to be reversed.
}
scalar_t roi_start_h = -roi_height / 2.0;
scalar_t roi_start_w = -roi_width / 2.0;
scalar_t cosTheta = cos(theta);
scalar_t sinTheta = sin(theta);
// We do average (integral) pooling inside a bin
const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const scalar_t yy =
roi_start_h + ph * bin_size_h +
static_cast<scalar_t>(iy + .5f) * bin_size_h /
static_cast<scalar_t>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const scalar_t xx = roi_start_w + pw * bin_size_w +
static_cast<scalar_t>(ix + .5f) * bin_size_w /
static_cast<scalar_t>(roi_bin_grid_w);
// Rotate by theta around the center and translate
scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;
scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;
scalar_t w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
w4, x_low, x_high, y_low,
y_high, index);
scalar_t g1 = top_diff_this_bin * w1 / count;
scalar_t g2 = top_diff_this_bin * w2 / count;
scalar_t g3 = top_diff_this_bin * w3 / count;
scalar_t g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);
atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);
atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);
atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);
atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,
g1 * l_var);
atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,
g2 * l_var);
atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,
g3 * l_var);
atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,
g4 * l_var);
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RiRoIAlignBackward
#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
......@@ -20,7 +20,7 @@ template <typename scalar_t>
__global__ void roi_align_rotated_forward_cuda_kernel(
const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_rois, const scalar_t spatial_scale,
const int sampling_ratio, const bool aligned, const bool clockwise,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, scalar_t *top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
......@@ -58,11 +58,11 @@ __global__ void roi_align_rotated_forward_cuda_kernel(
bottom_data + (roi_batch_ind * channels + c) * height * width;
    // We use roi_bin_grid to sample the grid and mimic the integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceilf(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
......@@ -104,7 +104,7 @@ __global__ void roi_align_rotated_forward_cuda_kernel(
template <typename scalar_t>
__global__ void roi_align_rotated_backward_cuda_kernel(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
const scalar_t spatial_scale, const int sampling_ratio, const bool aligned,
const bool clockwise, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, scalar_t *bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
......@@ -146,11 +146,11 @@ __global__ void roi_align_rotated_backward_cuda_kernel(
const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
    // We use roi_bin_grid to sample the grid and mimic the integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceilf(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
......
......@@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
  // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N,
  // npoints): -1 means the point is not in this box; otherwise encode (x_idxs,
  // y_idxs, z_idxs) as binary bits
  int box_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
    if (box_idx >= boxes_num) return;

    pts += pt_idx * 3;
    rois += box_idx * 7;
    pts_mask += box_idx * pts_num + pt_idx;

    T local_x = 0, local_y = 0;
    int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);

    pts_mask[0] = -1;
    if (cur_in_flag > 0) {
      T local_z = pts[2] - rois[2];
      T x_size = rois[3], y_size = rois[4], z_size = rois[5];

      T x_res = x_size / out_x;
      T y_res = y_size / out_y;
      T z_res = z_size / out_z;

      unsigned int x_idx = int((local_x + x_size / 2) / x_res);
      unsigned int y_idx = int((local_y + y_size / 2) / y_res);
      unsigned int z_idx = int(local_z / z_res);

      x_idx = min(max(x_idx, 0), out_x - 1);
      y_idx = min(max(y_idx, 0), out_y - 1);
      z_idx = min(max(z_idx, 0), out_z - 1);

      unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;

      pts_mask[0] = idx_encoding;
    }
  }
}
......@@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
T *pts_idx_of_voxels) {
// params pts_mask: (N, npoints) 0 or 1
// params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) {
int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter
pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
for (int k = 0; k < pts_num; k++) {
if (pts_mask[box_idx * pts_num + k] != -1) {
unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
unsigned int z_idx = idx_encoding & 0xFF;
unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
y_idx * out_z * max_pts_each_voxel +
z_idx * max_pts_each_voxel;
unsigned int cnt = pts_idx_of_voxels[base_offset];
if (cnt < max_num_pts) {
pts_idx_of_voxels[base_offset + cnt + 1] = k;
pts_idx_of_voxels[base_offset]++;
}
}
}
}
......@@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                         offset_base * max_pts_each_voxel;
    pooled_features += box_idx * out_x * out_y * out_z * channels +
                       offset_base * channels + channel_idx;
    argmax += box_idx * out_x * out_y * out_z * channels +
              offset_base * channels + channel_idx;

    int argmax_idx = -1;
    float max_val = -1e50;
    int total_pts = pts_idx_of_voxels[0];

    for (int k = 1; k <= total_pts; k++) {
      if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] >
          max_val) {
        max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
        argmax_idx = pts_idx_of_voxels[k];
      }
    }

    if (argmax_idx != -1) {
      pooled_features[0] = max_val;
    }
    argmax[0] = argmax_idx;
  }
}
template <typename T>
......@@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                         offset_base * max_pts_each_voxel;
    pooled_features += box_idx * out_x * out_y * out_z * channels +
                       offset_base * channels + channel_idx;

    float sum_val = 0;
    int total_pts = pts_idx_of_voxels[0];

    for (int k = 1; k <= total_pts; k++) {
      sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
    }

    if (total_pts > 0) {
      pooled_features[0] = sum_val / total_pts;
    }
  }
}
......@@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
    argmax += box_idx * out_x * out_y * out_z * channels +
              offset_base * channels + channel_idx;
    grad_out += box_idx * out_x * out_y * out_z * channels +
                offset_base * channels + channel_idx;

    if (argmax[0] == -1) return;

    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
  }
}
template <typename T>
......@@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
int box_idx = blockIdx.z;
int channel_idx = blockIdx.y;
  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
    int x_idx = voxel_idx_flat / (out_y * out_z);
    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
    int z_idx = voxel_idx_flat % out_z;
    if (box_idx >= boxes_num || channel_idx >= channels) return;

    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
                         offset_base * max_pts_each_voxel;
    grad_out += box_idx * out_x * out_y * out_z * channels +
                offset_base * channels + channel_idx;

    int total_pts = pts_idx_of_voxels[0];
    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
    for (int k = 1; k <= total_pts; k++) {
      atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
                grad_out[0] * cur_grad);
    }
  }
}