OpenDAS / MMCV · Commits

Commit fdeee889
authored May 25, 2025 by limm

    release v1.6.1 of mmcv

Parent: df465820

Changes: 457 files. Showing 20 changed files with 2680 additions and 569 deletions (+2680 −569).
mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh                 +2    −2
mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh       +101  −0
mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp                 +12   −4
mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh             +831  −0
mmcv/ops/csrc/common/cuda/correlation_cuda.cuh                   +83   −89
mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh       +136  −0
mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh          +15   −13
mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh           +19   −17
mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh                  +144  −146
mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh                    +31   −30
mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh             +300  −0
mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh         +69   −68
mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh                    +74   −31
mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh                   +6    −8
mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh        +25   −23
mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh     +79   −0
mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh             +381  −0
mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh    +242  −0
mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh      +8    −8
mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh        +122  −130
mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh

@@ -32,12 +32,12 @@ __device__ inline int Loc2Index(const int n, const int c, const int h,
 #ifndef HIP_DIFF
 /* TODO: move this to a common place */
 template <typename scalar_t>
-__device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) {
+__device__ inline scalar_t min(scalar_t a, scalar_t b) {
   return a < b ? a : b;
 }
 
 template <typename scalar_t>
-__device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) {
+__device__ inline scalar_t max(scalar_t a, scalar_t b) {
   return a > b ? a : b;
 }
 #endif
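These templated helpers exist because std::min/std::max are not callable from __device__ code. As a quick illustration, not part of this commit, a toy kernel would pick up the templated overloads like this (kernel name is invented; it assumes the plain min/max names on the new side of the hunk and the CUDA_1D_KERNEL_LOOP helper from common_cuda_helper.hpp):

// Illustrative sketch only: clamp each element with the header's
// templated device-side min/max overloads.
template <typename scalar_t>
__global__ void clamp_kernel(scalar_t *data, int n, scalar_t lo, scalar_t hi) {
  CUDA_1D_KERNEL_LOOP(i, n) { data[i] = min(max(data[i], lo), hi); }
}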
mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh (new file, 0 → 100644)

// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH
#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144

template <typename scalar_t>
__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,
                                                     const scalar_t* xyz, int m,
                                                     const scalar_t* xyz2,
                                                     scalar_t* result,
                                                     int* result_i) {
  __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
  for (int i = blockIdx.x; i < b; i += gridDim.x) {
    for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {
      int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;
      for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {
        buf[j] = xyz2[(i * m + k2) * 2 + j];
      }
      __syncthreads();
      for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
        scalar_t x1 = xyz[(i * n + j) * 2 + 0];
        scalar_t y1 = xyz[(i * n + j) * 2 + 1];
        int best_i = 0;
        scalar_t best = 1e10;
        int end_ka = end_k & (~2);
        if (end_ka == THREADS_PER_BLOCK) {
          for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
#pragma unroll
            for (int j = 0; j < 4; ++j) {
              scalar_t x2 = buf[(k + j) * 2] - x1;
              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
              scalar_t d = x2 * x2 + y2 * y2;
              if (d < best) {
                best = d;
                best_i = k + k2 + j;
              }
            }
          }
        } else {
          for (int k = 0; k < end_ka; k += 4) {
#pragma unroll
            for (int j = 0; j < 4; ++j) {
              scalar_t x2 = buf[(k + j) * 2] - x1;
              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
              scalar_t d = x2 * x2 + y2 * y2;
              if (d < best) {
                best = d;
                best_i = k + k2 + j;
              }
            }
          }
        }
        for (int k = end_ka; k < end_k; k++) {
          scalar_t x2 = buf[k * 2 + 0] - x1;
          scalar_t y2 = buf[k * 2 + 1] - y1;
          scalar_t d = x2 * x2 + y2 * y2;
          if (k == 0 || d < best) {
            best = d;
            best_i = k + k2;
          }
        }
        if (k2 == 0 || result[(i * n + j)] > best) {
          result[(i * n + j)] = best;
          result_i[(i * n + j)] = best_i;
        }
      }
      __syncthreads();
    }
  }
}

template <typename scalar_t>
__global__ void chamfer_distance_backward_cuda_kernel(
    int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
    const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
    scalar_t* grad_xyz2) {
  for (int i = blockIdx.x; i < b; i += gridDim.x) {
    for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
      scalar_t x1 = xyz1[(i * n + j) * 2 + 0];
      scalar_t y1 = xyz1[(i * n + j) * 2 + 1];
      int j2 = idx1[i * n + j];
      scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];
      scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];
      scalar_t g = grad_dist1[i * n + j] * 2;
      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));
      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));
      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));
      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));
    }
  }
}
#endif  // CHAMFER_DISTANCE_CUDA_KERNEL_CUH
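The forward kernel strides batches over gridDim.x and points over blockDim.x, so one block per batch element is enough to cover the whole input. A hypothetical host-side launch sketch (launcher name and argument names are mine, not the dispatch code mmcv actually ships):

// Assumed shapes: xyz1 (b, n, 2), xyz2 (b, m, 2), dist/idx (b, n).
template <typename scalar_t>
void chamfer_distance_forward(int b, int n, const scalar_t *xyz1, int m,
                              const scalar_t *xyz2, scalar_t *dist, int *idx,
                              cudaStream_t stream) {
  // one block per batch element; THREADS_PER_BLOCK threads scan the points
  chamfer_distance_forward_cuda_kernel<scalar_t>
      <<<b, THREADS_PER_BLOCK, 0, stream>>>(b, n, xyz1, m, xyz2, dist, idx);
}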
mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp

@@ -7,12 +7,20 @@
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
        i += blockDim.x * gridDim.x)
 
+#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
+       i += blockDim.x * gridDim.x)                                 \
+    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
+         j += blockDim.y * gridDim.y)
+
+#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
+  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
+    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
+
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 #define THREADS_PER_BLOCK 512
 
-inline int GET_BLOCKS(const int N) {
-  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
+  int optimal_block_num = (N + num_threads - 1) / num_threads;
   int max_block_num = 4096;
   return min(optimal_block_num, max_block_num);
 }
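GET_BLOCKS now takes an optional thread count instead of hard-coding THREADS_PER_BLOCK, which the 2D loop macros need since a 2D block rarely uses 512 threads per axis. A minimal usage sketch (kernel and launcher are invented for illustration, not from the commit):

// A 2D grid-stride kernel paired with the extended GET_BLOCKS; the 16x16
// block gives 256 threads and overrides the 512 default per axis.
__global__ void scale2d_kernel(float *data, int n, int m, float s) {
  CUDA_2D_KERNEL_LOOP(i, n, j, m) { data[i * m + j] *= s; }
}

void scale2d(float *data, int n, int m, float s, cudaStream_t stream) {
  dim3 threads(16, 16);
  dim3 blocks(GET_BLOCKS(n, threads.x), GET_BLOCKS(m, threads.y));
  scale2d_kernel<<<blocks, threads, 0, stream>>>(data, n, m, s);
}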
mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh (new file, 0 → 100644)

// Copyright (c) OpenMMLab. All rights reserved
#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
#define CONVEX_IOU_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#define MAXN 100
#define NMAX 512
__device__ const double EPS = 1E-8;

__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }

struct Point {
  double x, y;
  __device__ Point() {}
  __device__ Point(double x, double y) : x(x), y(y) {}
};

__device__ inline bool point_same(Point& a, Point& b) {
  return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;
}

__device__ inline void swap1(Point* a, Point* b) {
  Point temp;
  temp.x = a->x;
  temp.y = a->y;

  a->x = b->x;
  a->y = b->y;

  b->x = temp.x;
  b->y = temp.y;
}

__device__ inline void reverse1(Point* a, const int n) {
  for (int i = 0; i < (n - 1) / 2.0; i++) {
    Point* j = &(a[i]);
    Point* k = &(a[n - 1 - i]);
    swap1(j, k);
  }
}

__device__ inline double cross(Point o, Point a, Point b) {
  return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
}

__device__ inline double dis(Point a, Point b) {
  return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
}

__device__ inline double area(Point* ps, int n) {
  ps[n] = ps[0];
  double res = 0;
  for (int i = 0; i < n; i++) {
    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
  }
  return res / 2.0;
}

__device__ inline double polygon_area_grad(Point* ps, int n,
                                           int* polygon_to_pred_index,
                                           int n_pred, double* grad_C) {
  ps[n] = ps[0];
  double partion_grad[4 * 30 + 2];
  double res = 0;
  for (int i = 0; i < n; i++) {
    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
    partion_grad[i * 4 + 2] = ps[i + 1].y;
    partion_grad[i * 4 + 3] = -ps[i + 1].x;
    if (i != n - 1) {
      partion_grad[i * 4 + 4] = -ps[i].y;
      partion_grad[i * 4 + 5] = ps[i].x;
    } else {
      partion_grad[0] = -ps[i].y;
      partion_grad[1] = ps[i].x;
    }
  }
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n_pred; j++) {
      if (i == polygon_to_pred_index[j]) {
        grad_C[2 * polygon_to_pred_index[j + n_pred]] =
            (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
        break;
      }
    }
    for (int j = 0; j < n_pred; j++) {
      if (i == polygon_to_pred_index[j]) {
        grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
            (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
        break;
      }
    }
  }
  return res / 2.0;
}

__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
                                double* cut_grad, int m, int n, int i) {
  double s1, s2;
  double s2_s1_2;
  double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
  double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd,
      dyp_dyd;
  s1 = cross(a, b, c);
  s2 = cross(a, b, d);

  ds1_dxc = -(b.y - a.y);
  ds1_dyc = b.x - a.x;
  ds2_dxd = ds1_dxc;
  ds2_dyd = ds1_dyc;
  s2_s1_2 = (s2 - s1) * (s2 - s1);

  if (sig(s1) == 0 && sig(s2) == 0) return 2;
  if (sig(s2 - s1) == 0) return 0;

  dxp_dxc = ((s2 - d.x * ds1_dxc) * (s2 - s1) -
             (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
            (s2_s1_2);
  dxp_dyc = ((0 - d.x * ds1_dyc) * (s2 - s1) -
             (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
            (s2_s1_2);
  dxp_dxd = ((c.x * ds2_dxd - s1) * (s2 - s1) -
             (c.x * s2 - d.x * s1) * (ds2_dxd)) /
            (s2_s1_2);
  dxp_dyd = ((c.x * ds2_dyd - 0) * (s2 - s1) -
             (c.x * s2 - d.x * s1) * (ds2_dyd)) /
            (s2_s1_2);

  dyp_dxc = ((0 - d.y * ds1_dxc) * (s2 - s1) -
             (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
            (s2_s1_2);
  dyp_dyc = ((s2 - d.y * ds1_dyc) * (s2 - s1) -
             (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
            (s2_s1_2);
  dyp_dxd = ((c.y * ds2_dxd - 0) * (s2 - s1) -
             (c.y * s2 - d.y * s1) * (ds2_dxd)) /
            (s2_s1_2);
  dyp_dyd = ((c.y * ds2_dyd - s1) * (s2 - s1) -
             (c.y * s2 - d.y * s1) * (ds2_dyd)) /
            (s2_s1_2);

  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
  if (i == n - 1) {
    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
    cut_grad[4 * n * m + 0] = dxp_dxd;  // + dyp_dxd;
    cut_grad[4 * n * m + 1] = dyp_dxd;
    cut_grad[4 * n * m + 2] = dxp_dyd;  // + dyp_dyd;
    cut_grad[4 * n * m + 3] = dyp_dyd;
  } else {
    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
    cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd;  // + dyp_dxd;
    cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
    cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd;  // + dyp_dyd;
    cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
  }

  return 1;
}

__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
                                   double* cut_grad) {
  Point pp[MAXN];
  double ccur_grad[MAXN] = {};
  int m = 0;
  p[n] = p[0];
  int k = n;
  for (int i = 0; i < n; i++) {
    if (sig(cross(a, b, p[i])) > 0) {
      pp[m] = p[i];
      ccur_grad[4 * n * m + 4 * i] = 1.0;
      ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
      m++;
    }
    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
      lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
      m++;
    }
  }

  n = 0;
  for (int i = 0; i < m; i++) {
    if (!i || !(point_same(pp[i], pp[i - 1]))) {
      p[n] = pp[i];
      for (int j = 0; j < 4 * k; j++) {
        cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
      }
      n++;
    }
  }

  while (n > 1 && point_same(p[n - 1], p[0])) n--;
}

__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
                                       double* grad_AB, int order,
                                       int convex_n) {
  Point o(0, 0);
  int res_flag = 0;
  int s1 = sig(cross(o, a, b));
  int s2 = sig(cross(o, c, d));
  if (s1 == 0 || s2 == 0) return 0.0;
  if (s1 == -1) {
    Point* i = &a;
    Point* j = &b;
    swap1(i, j);
    res_flag = 1;
  }
  if (s2 == -1) {
    Point* i = &c;
    Point* j = &d;
    swap1(i, j);
  }
  Point p[10] = {o, a, b};
  int n = 3, n0 = 3, n1, n2, n3;
  double cut_grad1[MAXN] = {};
  double cut_grad2[MAXN] = {};
  double cut_grad3[MAXN] = {};
  double p1_p_grad[10][10] = {};
  double p2_p1_grad[10][10] = {};
  double p3_p2_grad[10][10] = {};

  double p3_p1_grad[10][10] = {};
  double p3_p_grad[10][10] = {};

  // 1
  polygon_cut(p, n, o, c, cut_grad1);
  n1 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n0; j++) {
      if (!(j % 2)) {
        p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
      } else {
        p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
      }
    }
  }

  // 2
  polygon_cut(p, n, c, d, cut_grad2);
  n2 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n1; j++) {
      if (!(j % 2)) {
        p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
      } else {
        p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
      }
    }
  }

  // 3
  polygon_cut(p, n, d, o, cut_grad3);
  n3 = n;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < 4 * n2; j++) {
      if (!(j % 2)) {
        p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
      } else {
        p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
      }
    }
  }

  // mul
  // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
  for (int i = 0; i < 2 * n3; i++) {
    for (int j = 0; j < 2 * n1; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n2; m++) {
        sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
      }
      p3_p1_grad[i][j] = sum;
    }
  }

  // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
  for (int i = 0; i < 2 * n3; i++) {
    for (int j = 0; j < 2 * n0; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n1; m++) {
        sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
      }
      p3_p_grad[i][j] = sum;
    }
  }

  // calculate S_grad
  int polygon_index_box_index[20];
  double grad_polygon[20];
  double S_grad[6];

  for (int i = 0; i < n3; i++) {
    polygon_index_box_index[i] = i;
    polygon_index_box_index[i + n3] = i;
  }

  double res =
      polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);

  if (s1 * s2 == -1) {
    for (int j = 0; j < 2 * 3; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n3; m++) {
        sum = sum - grad_polygon[m] * p3_p_grad[m][j];
      }
      S_grad[j] = sum;
    }

    if (order != convex_n - 1) {
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[2 * order + 2] += S_grad[2];
        grad_AB[2 * order + 3] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[2 * order + 2] += S_grad[4];
        grad_AB[2 * order + 3] += S_grad[5];
      }
    } else {
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[0] += S_grad[2];
        grad_AB[1] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[0] += S_grad[4];
        grad_AB[1] += S_grad[5];
      }
    }
    res = -res;
  } else {
    for (int j = 0; j < 2 * 3; j++) {
      double sum = 0.0;
      for (int m = 0; m < 2 * n3; m++) {
        sum = sum + grad_polygon[m] * p3_p_grad[m][j];
      }
      S_grad[j] = sum;
    }

    if (order != convex_n - 1) {
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[2 * order + 2] += S_grad[2];
        grad_AB[2 * order + 3] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[2 * order + 2] += S_grad[4];
        grad_AB[2 * order + 3] += S_grad[5];
      }
    } else {
      if (res_flag) {
        grad_AB[2 * order] += S_grad[4];
        grad_AB[2 * order + 1] += S_grad[5];
        grad_AB[0] += S_grad[2];
        grad_AB[1] += S_grad[3];
      } else {
        grad_AB[2 * order] += S_grad[2];
        grad_AB[2 * order + 1] += S_grad[3];
        grad_AB[0] += S_grad[4];
        grad_AB[1] += S_grad[5];
      }
    }
  }
  return res;
}

__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
                                        double* grad_AB) {
  if (area(ps1, n1) < 0) reverse1(ps1, n1);
  if (area(ps2, n2) < 0) reverse1(ps2, n2);
  ps1[n1] = ps1[0];
  ps2[n2] = ps2[0];
  double res = 0;
  for (int i = 0; i < n1; i++) {
    for (int j = 0; j < n2; j++) {
      res +=
          intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);
    }
  }
  return res;
}

__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[NMAX] = {}, top1, top2;
  double sign;
  Point right_point[10], left_point[10];
  for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point* j = &(in_poly[0]);
      Point* k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }
  for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];

  k_index = 0, Stack[0] = 0, top2 = 0;

  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }
  for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];

  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
}

__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
                                           int n2, double* grad_C) {
  Point polygon[MAXN];
  int n = n1 + n2, n_poly = 0;
  for (int i = 0; i < n1; i++) {
    for (int j = 0; j < n - n1; j++) {
      if (point_same(ps1[i], ps2[j])) {
        for (int k = j; k < n - n1 - 1; k++) {
          ps2[k] = ps2[k + 1];
        }
        n2--;
        break;
      }
    }
  }
  n_poly = n1 + n2;
  for (int i = 0; i < n_poly; i++) {
    if (i < n1) {
      polygon[i] = ps1[i];
    } else {
      polygon[i] = ps2[i - n1];
    }
  }

  Jarvis(polygon, n_poly);

  int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                   -1, -1, -1, -1, -1, -1, -1, -1, -1};
  int n_pred = 0;
  for (int i = 0; i < n_poly; i++) {
    for (int j = 0; j < n1; j++) {
      if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
        polygon_to_pred_index[n_pred] = i;
        polygon_to_pred_index[n_pred + n1] = j;
        n_pred += 1;
        break;
      }
    }
  }
  if (n_pred == 0) {
    double polygon_area = fabs(area(polygon, n_poly));
    for (int i = 0; i < 18; i++) {
      grad_C[i] = 0.0;
    }
    return polygon_area;
  } else {
    double polygon_area =
        polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
    if (polygon_area < 0) {
      for (int i = 0; i < 18; i++) {
        grad_C[i] = -grad_C[i];
      }
    }
    return fabs(polygon_area);
  }
}

// convex_find and get the polygon_index_box_index
__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
                                        int* points_to_convex_ind) {
  int n_input = n_poly;
  Point input_poly[20];
  for (int i = 0; i < n_input; i++) {
    input_poly[i].x = in_poly[i].x;
    input_poly[i].y = in_poly[i].y;
  }
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[20], top1, top2;
  double sign;
  Point right_point[10], left_point[10];
  for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point* j = &(in_poly[0]);
      Point* k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }
  for (int i = 0; i <= top1; i++) {
    right_point[i] = in_poly[Stack[i]];
  }
  k_index = 0, Stack[0] = 0, top2 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }

  for (int i = top2 - 1; i >= 0; i--) {
    left_point[i] = in_poly[Stack[i]];
  }

  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
  for (int i = 0; i < n_poly; i++) {
    for (int j = 0; j < n_input; j++) {
      if (point_same(in_poly[i], input_poly[j])) {
        points_to_convex_ind[i] = j;
        break;
      }
    }
  }
}

template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q,
                                T* point_grad, const int idx) {
  Point ps1[MAXN], ps2[MAXN];
  Point convex[MAXN];
  for (int i = 0; i < 9; i++) {
    convex[i].x = (double)p[i * 2];
    convex[i].y = (double)p[i * 2 + 1];
  }
  int n_convex = 9;
  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  Jarvis_and_index(convex, n_convex, points_to_convex_ind);

  int n1 = n_convex;
  int n2 = 4;

  for (int i = 0; i < n1; i++) {
    ps1[i].x = (double)convex[i].x;
    ps1[i].y = (double)convex[i].y;
  }

  for (int i = 0; i < n2; i++) {
    ps2[i].x = (double)q[i * 2];
    ps2[i].y = (double)q[i * 2 + 1];
  }

  int polygon_index_box_index[18];
  for (int i = 0; i < n1; i++) {
    polygon_index_box_index[i] = i;
    polygon_index_box_index[i + n1] = i;
  }

  double grad_A[18] = {};
  double grad_AB[18] = {};
  double grad_C[18] = {};

  double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
  double S_pred =
      polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
  if (S_pred < 0) {
    for (int i = 0; i < n_convex * 2; i++) {
      grad_A[i] = -grad_A[i];
    }
  }
  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;

  double iou = inter_area / union_area;
  double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);

  //    printf("%d:live\n", idx);
  double rot_giou = iou - (polygon_area - union_area) / polygon_area;

  float grad_point_temp[18] = {};

  for (int i = 0; i < n_convex; i++) {
    int grad_point = points_to_convex_ind[i];
    grad_point_temp[2 * grad_point] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i] -
                iou / union_area * grad_A[2 * i] -
                1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
                (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
    grad_point_temp[2 * grad_point + 1] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i + 1] -
                iou / union_area * grad_A[2 * i + 1] -
                1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
                (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
  }

  for (int i = 0; i < 9; i++) {
    point_grad[2 * i] = grad_point_temp[2 * i];
    point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
  }
  return (float)rot_giou;
}

template <typename T>
__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
                                        const int gt_n_boxes, const T* ex_boxes,
                                        const T* gt_boxes, T* point_grad) {
  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T* cur_box = ex_boxes + index * 18;
    const T* cur_gt_box = gt_boxes + index * 8;
    T* cur_grad = point_grad + index * 19;
    T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);
    cur_grad[18] = giou;
  }
}

__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
  double s1, s2;
  s1 = cross(a, b, c);
  s2 = cross(a, b, d);
  if (sig(s1) == 0 && sig(s2) == 0) return 2;
  if (sig(s2 - s1) == 0) return 0;
  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
  return 1;
}

__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
  Point pp[MAXN];
  int m = 0;
  p[n] = p[0];
  for (int i = 0; i < n; i++) {
    if (sig(cross(a, b, p[i])) > 0) {
      pp[m] = p[i];
      m++;
    }
    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
      lineCross(a, b, p[i], p[i + 1], pp[m]);
      m++;
    }
  }
  n = 0;
  for (int i = 0; i < m; i++) {
    if (!i || !(point_same(pp[i], pp[i - 1]))) {
      p[n] = pp[i];
      n++;
    }
  }
  while (n > 1 && point_same(p[n - 1], p[0])) n--;
}

__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
  Point o(0, 0);
  int s1 = sig(cross(o, a, b));
  int s2 = sig(cross(o, c, d));
  if (s1 == 0 || s2 == 0) return 0.0;
  if (s1 == -1) {
    Point* i = &a;
    Point* j = &b;
    swap1(i, j);
  }
  if (s2 == -1) {
    Point* i = &c;
    Point* j = &d;
    swap1(i, j);
  }
  Point p[10] = {o, a, b};
  int n = 3;

  polygon_cut(p, n, o, c);
  polygon_cut(p, n, c, d);
  polygon_cut(p, n, d, o);
  double res = area(p, n);
  if (s1 * s2 == -1) res = -res;
  return res;
}

__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
                                        int n2) {
  if (area(ps1, n1) < 0) reverse1(ps1, n1);
  if (area(ps2, n2) < 0) reverse1(ps2, n2);
  ps1[n1] = ps1[0];
  ps2[n2] = ps2[0];
  double res = 0;
  for (int i = 0; i < n1; i++) {
    for (int j = 0; j < n2; j++) {
      res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
    }
  }
  return res;
}

template <typename T>
__device__ inline float devrIoU(T const* const p, T const* const q) {
  Point ps1[MAXN], ps2[MAXN];

  Point convex[MAXN];
  for (int i = 0; i < 9; i++) {
    convex[i].x = (double)p[i * 2];
    convex[i].y = (double)p[i * 2 + 1];
  }
  int n_convex = 9;
  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  Jarvis_and_index(convex, n_convex, points_to_convex_ind);
  int n1 = n_convex;
  for (int i = 0; i < n1; i++) {
    ps1[i].x = (double)convex[i].x;
    ps1[i].y = (double)convex[i].y;
  }
  int n2 = 4;
  for (int i = 0; i < n2; i++) {
    ps2[i].x = (double)q[i * 2];
    ps2[i].y = (double)q[i * 2 + 1];
  }
  double inter_area = intersectAreaO(ps1, n1, ps2, n2);
  double S_pred = area(ps1, n1);
  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
  double iou = inter_area / union_area;
  return (float)iou;
}

template <typename T>
__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
                                       const int gt_n_boxes, const T* ex_boxes,
                                       const T* gt_boxes, T* iou) {
  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T* cur_box = ex_boxes + index * 18;
    for (int i = 0; i < gt_n_boxes; i++) {
      iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);
    }
  }
}
#endif  // CONVEX_IOU_CUDA_KERNEL_CUH
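Each thread of convex_iou_cuda_kernel handles one predicted 9-point polygon (18 floats) against every 4-corner ground-truth box (8 floats). A hypothetical host-side launch sketch using the helpers from common_cuda_helper.hpp (the launcher name and float-only signature are assumptions; the real dispatch goes through the framework's type dispatch):

// Assumed shapes: ex_boxes (N, 18), gt_boxes (M, 8), iou (N, M).
void convex_iou(int ex_n, int gt_n, const float *ex_boxes,
                const float *gt_boxes, float *iou, cudaStream_t stream) {
  convex_iou_cuda_kernel<float>
      <<<GET_BLOCKS(ex_n), THREADS_PER_BLOCK, 0, stream>>>(
          ex_n, gt_n, ex_boxes, gt_boxes, iou);
}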
mmcv/ops/csrc/common/cuda/correlation_cuda.cuh

@@ -29,8 +29,8 @@ using namespace torch;
 #define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
 #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
 
-#define THREADS_FORWARD 32
-#define THREADS_BACKWARD 16
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
 
 template <typename scalar_t>
 __global__ void correlation_forward_cuda_kernel(
@@ -42,8 +42,8 @@ __global__ void correlation_forward_cuda_kernel(
   const int C = rInput1.size(3);
 
   const int n = blockIdx.x;
-  const int h = blockIdx.y;
-  const int w = blockIdx.z;
+  const int h = blockIdx.y * blockDim.y + threadIdx.y;
+  const int w = blockIdx.z * blockDim.z + threadIdx.z;
   const int thread = threadIdx.x;
 
   const int start_i = -padH + h * dH;
@@ -52,13 +52,11 @@ __global__ void correlation_forward_cuda_kernel(
   const int patchRadH = dilation_patchH * (patchH - 1) / 2;
   const int patchRadW = dilation_patchW * (patchW - 1) / 2;
 
-  __shared__ scalar_t prod_sum[THREADS_FORWARD];
-
   for (int ph = 0; ph < patchH; ++ph) {
     int ph_dilated = ph * dilation_patchH - patchRadH;
     for (int pw = 0; pw < patchW; ++pw) {
       int pw_dilated = pw * dilation_patchW - patchRadW;
-      prod_sum[thread] = 0;
+      scalar_t prod_sum = 0.0f;
       for (int i = 0; i < kH; ++i) {
         int i1 = start_i + i * dilationH;
         int i2 = i1 + ph_dilated;
@@ -69,23 +67,20 @@ __global__ void correlation_forward_cuda_kernel(
           int j2 = j1 + pw_dilated;
           if (WITHIN_BOUNDS(j1, j2, iW, iW)) {
-            for (int c = thread; c < C; c += THREADS_FORWARD) {
+            for (int c = thread; c < C; c += WARP_SIZE) {
               scalar_t v1 = rInput1[n][i1][j1][c];
               scalar_t v2 = rInput2[n][i2][j2][c];
-              prod_sum[thread] += v1 * v2;
+              prod_sum += v1 * v2;
             }
           }
         }
       }
       // accumulate
-      __syncthreads();
+      for (int offset = 16; offset > 0; offset /= 2)
+        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
       if (thread == 0) {
-        scalar_t reduce_sum = 0;
-        for (int index = 0; index < THREADS_FORWARD; ++index) {
-          reduce_sum += prod_sum[index];
-        }
-        output[n][ph][pw][h][w] = reduce_sum;
+        output[n][ph][pw][h][w] = prod_sum;
       }
     }
   }
 }
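The forward rewrite above drops the shared-memory buffer and reduces the per-channel partial sums within a single warp instead. A distilled sketch of that idiom, restated here for illustration only (helper name is mine; WARP_SIZE and FULL_MASK are the macros added in this hunk):

// Each lane of a 32-thread warp starts with a partial sum; halving the
// shuffle offset folds the warp down until lane 0 holds the total. No
// shared memory or __syncthreads() is needed inside one warp.
__device__ float warp_reduce_sum(float val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(FULL_MASK, val, offset);
  return val;  // valid in lane 0
}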
@@ -97,9 +92,10 @@ __global__ void correlation_backward_cuda_kernel_input1(
     TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
     const int patchW, const int padH, const int padW, const int dilationH,
     const int dilationW, const int dilation_patchH, const int dilation_patchW,
-    const int dH, const int dW, const int batch) {
-  const int iH = input2.size(2);
-  const int iW = input2.size(3);
+    const int dH, const int dW) {
+  const int iH = input2.size(1);
+  const int iW = input2.size(2);
+  const int C = input2.size(3);
 
   const int H = grad_output.size(3);
   const int W = grad_output.size(4);
@@ -107,54 +103,53 @@ __global__ void correlation_backward_cuda_kernel_input1(
   const int patchRadH = (patchH - 1) / 2;
   const int patchRadW = (patchW - 1) / 2;
 
-  const int n = batch;
-  const int c = blockIdx.x;
+  const int n = blockIdx.x;
   const int h = blockIdx.y;
   const int w = blockIdx.z;
-  const int ph_off = threadIdx.x;
-  const int pw_off = threadIdx.y;
 
   const int h_2 = h + padH;
   const int w_2 = w + padW;
   const int min_h = h_2 - kH * dilationH;
   const int min_w = w_2 - kW * dilationW;
 
-  __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
-  prod_sum[ph_off][pw_off] = 0;
-
-  for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+
+  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+    const int ph = i / patchW;
+    const int pw = i % patchW;
     int i1 = h + dilation_patchH * (ph - patchRadH);
-    for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
-      int j1 = w + dilation_patchW * (pw - patchRadW);
-      if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
-        scalar_t val = input2[n][c][i1][j1];
-        for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
-          int i2 = (h_3) / dH;
-          if (i2 * dH != h_3) continue;
-          for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
-            int j2 = (w_3) / dW;
-            if (j2 * dW != w_3) continue;
-            if (WITHIN_BOUNDS(i2, j2, H, W)) {
-              prod_sum[ph_off][pw_off] += grad_output[n][ph][pw][i2][j2] * val;
-            }
-          }
-        }
-      }
-    }
-  }
-
+    int j1 = w + dilation_patchW * (pw - patchRadW);
+    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+      scalar_t grad_val = 0.0f;
+      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+        int i2 = (h_3) / dH;
+        if (i2 * dH != h_3) continue;
+        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+          int j2 = (w_3) / dW;
+          if (j2 * dW != w_3) continue;
+          if (WITHIN_BOUNDS(i2, j2, H, W)) {
+            grad_val += grad_output[n][ph][pw][i2][j2];
+          }
+        }
+      }
+      grad_cache[i] = grad_val;
+    }
+  }
   __syncthreads();
 
-  if (ph_off == 0 && pw_off == 0) {
-    scalar_t reduce_sum = 0;
-    for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
-      for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
-        reduce_sum += prod_sum[ph][pw];
-      }
-    }
-    grad_input1[n][c][h][w] = reduce_sum;
+  for (int c = threadIdx.x; c < C; c += blockDim.x) {
+    scalar_t grad_input_val = 0.0f;
+    for (int ph = 0; ph < patchH; ++ph) {
+      int i1 = h + dilation_patchH * (ph - patchRadH);
+      for (int pw = 0; pw < patchW; ++pw) {
+        int j1 = w + dilation_patchW * (pw - patchRadW);
+        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+          grad_input_val +=
+              input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+        }
+      }
+    }
+    grad_input1[n][c][h][w] = grad_input_val;
   }
 }
@@ -163,9 +158,10 @@ __global__ void correlation_backward_cuda_kernel_input2(
     const TensorAcc5R grad_output, const TensorAcc4R input1,
     TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
     int padW, int dilationH, int dilationW, int dilation_patchH,
-    int dilation_patchW, int dH, int dW, int batch) {
-  const int iH = input1.size(2);
-  const int iW = input1.size(3);
+    int dilation_patchW, int dH, int dW) {
+  const int iH = input1.size(1);
+  const int iW = input1.size(2);
+  const int C = input1.size(3);
 
   const int patchRadH = (patchH - 1) / 2;
   const int patchRadW = (patchW - 1) / 2;
@@ -176,56 +172,54 @@ __global__ void correlation_backward_cuda_kernel_input2(
   const int dilatedKH = kH * dilationH;
   const int dilatedKW = kW * dilationW;
 
-  const int n = batch;
-  const int c = blockIdx.x;
+  const int n = blockIdx.x;
   const int h = blockIdx.y;
   const int w = blockIdx.z;
-  const int ph_off = threadIdx.x;
-  const int pw_off = threadIdx.y;
-
-  __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
-  prod_sum[ph_off][pw_off] = 0;
-
-  for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+
+  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+
+  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+    const int ph = i / patchW;
+    const int pw = i % patchW;
    int i1 = h - dilation_patchH * (ph - patchRadH);
-    for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
-      int j1 = w - dilation_patchW * (pw - patchRadW);
-      if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
-        scalar_t val = input1[n][c][i1][j1];
-
-        const int h_2 = i1 + padH;
-        const int w_2 = j1 + padW;
-        const int min_h = h_2 - dilatedKH;
-        const int min_w = w_2 - dilatedKW;
-
-        for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
-          int i2 = (h_3) / dH;
-          if (i2 * dH != h_3) continue;
-          for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
-            int j2 = (w_3) / dW;
-            if (j2 * dW != w_3) continue;
-            if (WITHIN_BOUNDS(i2, j2, H, W)) {
-              prod_sum[ph_off][pw_off] += grad_output[n][ph][pw][i2][j2] * val;
-            }
-          }
-        }
-      }
-    }
-  }
-
+    int j1 = w - dilation_patchW * (pw - patchRadW);
+    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+      scalar_t grad_val = 0.0f;
+
+      const int h_2 = i1 + padH;
+      const int w_2 = j1 + padW;
+      const int min_h = h_2 - dilatedKH;
+      const int min_w = w_2 - dilatedKW;
+
+      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+        int i2 = (h_3) / dH;
+        if (i2 * dH != h_3) continue;
+        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+          int j2 = (w_3) / dW;
+          if (j2 * dW != w_3) continue;
+          if (WITHIN_BOUNDS(i2, j2, H, W)) {
+            grad_val += grad_output[n][ph][pw][i2][j2];
+          }
+        }
+      }
+      grad_cache[i] = grad_val;
+    }
+  }
  __syncthreads();
 
-  if (ph_off == 0 && pw_off == 0) {
-    scalar_t reduce_sum = 0;
-    for (int ph = 0; ph < THREADS_BACKWARD; ++ph) {
-      for (int pw = 0; pw < THREADS_BACKWARD; ++pw) {
-        reduce_sum += prod_sum[ph][pw];
-      }
-    }
-    grad_input2[n][c][h][w] = reduce_sum;
+  for (int c = threadIdx.x; c < C; c += blockDim.x) {
+    scalar_t grad_input_val = 0.0f;
+    for (int ph = 0; ph < patchH; ++ph) {
+      int i1 = h - dilation_patchH * (ph - patchRadH);
+      for (int pw = 0; pw < patchW; ++pw) {
+        int j1 = w - dilation_patchW * (pw - patchRadW);
+        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+          grad_input_val +=
+              input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+        }
+      }
+    }
+    grad_input2[n][c][h][w] = grad_input_val;
  }
 }
 
 #endif
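Both rewritten backward kernels declare their scratch buffer as extern __shared__, so its size is no longer fixed at compile time and must be passed as the third launch parameter. A self-contained toy sketch of that pattern (kernel is invented for illustration; it is not the commit's launcher):

// The dynamic buffer is sized at launch to `count` elements.
template <typename scalar_t>
__global__ void fill_cache_kernel(scalar_t *out, int count) {
  extern __shared__ __align__(sizeof(4)) unsigned char cache_char[];
  scalar_t *cache = reinterpret_cast<scalar_t *>(cache_char);
  for (int i = threadIdx.x; i < count; i += blockDim.x) cache[i] = scalar_t(i);
  __syncthreads();
  for (int i = threadIdx.x; i < count; i += blockDim.x) out[i] = cache[i];
}

// launch, sizing the buffer to match:
// fill_cache_kernel<float><<<1, 64, count * sizeof(float)>>>(out, count);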
mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh (new file, 0 → 100644)

// Copyright (c) OpenMMLab. All rights reserved
// Adapted from
// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu  # noqa
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#define MAX_NUM_VERT_IDX 9
#define INTERSECTION_OFFSET 8
#define EPSILON 1e-8

inline int opt_n_thread(int work_size) {
  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
  return max(min(1 << pow_2, THREADS_PER_BLOCK), 1);
}

/*
compare normalized vertices (vertices around (0,0))
if vertex1 < vertex2 return true.
order: minimum at x-aixs, become larger in anti-clockwise direction
*/
__device__ bool compare_vertices(float x1, float y1, float x2, float y2) {
  if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON)
    return false;  // if equal, return false

  if (y1 > 0 && y2 < 0) return true;
  if (y1 < 0 && y2 > 0) return false;

  float n1 = x1 * x1 + y1 * y1 + EPSILON;
  float n2 = x2 * x2 + y2 * y2 + EPSILON;
  float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2;

  if (y1 > 0 && y2 > 0) {
    if (diff > EPSILON)
      return true;
    else
      return false;
  }
  if (y1 < 0 && y2 < 0) {
    if (diff < EPSILON)
      return true;
    else
      return false;
  }
}

__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel(
    int b, int n, int m, const float *__restrict__ vertices,
    const bool *__restrict__ mask, const int *__restrict__ num_valid,
    int *__restrict__ idx) {
  int batch_idx = blockIdx.x;
  vertices += batch_idx * n * m * 2;
  mask += batch_idx * n * m;
  num_valid += batch_idx * n;
  idx += batch_idx * n * MAX_NUM_VERT_IDX;

  int index = threadIdx.x;  // index of polygon
  int stride = blockDim.x;
  for (int i = index; i < n; i += stride) {
    int pad;  // index of arbitrary invalid intersection point (not box corner!)
    for (int j = INTERSECTION_OFFSET; j < m; ++j) {
      if (!mask[i * m + j]) {
        pad = j;
        break;
      }
    }
    if (num_valid[i] < 3) {
      // not enough vertices, take an invalid intersection point
      // (zero padding)
      for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) {
        idx[i * MAX_NUM_VERT_IDX + j] = pad;
      }
    } else {
      // sort the valid vertices
      // note the number of valid vertices is known
      // note: check that num_valid[i] < MAX_NUM_VERT_IDX
      for (int j = 0; j < num_valid[i]; ++j) {
        // initialize with a "big" value
        float x_min = 1;
        float y_min = -EPSILON;
        int i_take = 0;
        int i2;
        float x2, y2;
        if (j != 0) {
          i2 = idx[i * MAX_NUM_VERT_IDX + j - 1];
          x2 = vertices[i * m * 2 + i2 * 2 + 0];
          y2 = vertices[i * m * 2 + i2 * 2 + 1];
        }
        for (int k = 0; k < m; ++k) {
          float x = vertices[i * m * 2 + k * 2 + 0];
          float y = vertices[i * m * 2 + k * 2 + 1];
          if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) {
            if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) {
              x_min = x;
              y_min = y;
              i_take = k;
            }
          }
        }
        idx[i * MAX_NUM_VERT_IDX + j] = i_take;
      }
      // duplicate the first idx
      idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0];

      // pad zeros
      for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) {
        idx[i * MAX_NUM_VERT_IDX + j] = pad;
      }

      // for corner case: the two boxes are exactly the same.
      // in this case, idx would have duplicate elements, which makes the
      // shoelace formula broken because of the definition, the duplicate
      // elements only appear in the first 8 positions (they are "corners in
      // box", not "intersection of edges")
      if (num_valid[i] == 8) {
        int counter = 0;
        for (int j = 0; j < 4; ++j) {
          int check = idx[i * MAX_NUM_VERT_IDX + j];
          for (int k = 4; k < INTERSECTION_OFFSET; ++k) {
            if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++;
          }
        }
        if (counter == 4) {
          idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0];
          for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) {
            idx[i * MAX_NUM_VERT_IDX + j] = pad;
          }
        }
      }

      // TODO: still might need to cover some other corner cases :(
    }
  }
}
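opt_n_thread rounds the work size down to a power of two and clamps it between 1 and THREADS_PER_BLOCK, so a block never launches more threads than there are polygons. A hypothetical launch sketch built on that helper (launcher name is mine, not from the commit):

// One block per batch element; opt_n_thread(n) threads stride the n
// polygons inside the kernel's grid-stride loop.
void sort_vertices(int b, int n, int m, const float *vertices,
                   const bool *mask, const int *num_valid, int *idx) {
  diff_iou_rotated_sort_vertices_forward_cuda_kernel<<<b, opt_n_thread(n)>>>(
      b, n, m, vertices, mask, num_valid, idx);
}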
mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh

@@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
-
-  out += bs_idx * c * m + c_idx * m + pt_idx;
-  idx += bs_idx * m + pt_idx;
-  points += bs_idx * c * n + c_idx * n;
-  out[0] = points[idx[0]];
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    out += bs_idx * c * m + c_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
+    points += bs_idx * c * n + c_idx * n;
+    out[0] = points[idx[0]];
+  }
 }
 
 template <typename T>
@@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
-
-  grad_out += bs_idx * c * m + c_idx * m + pt_idx;
-  idx += bs_idx * m + pt_idx;
-  grad_points += bs_idx * c * n + c_idx * n;
-
-  atomicAdd(grad_points + idx[0], grad_out[0]);
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
+    grad_points += bs_idx * c * n + c_idx * n;
+
+    atomicAdd(grad_points + idx[0], grad_out[0]);
+  }
 }
 
 #endif  // GATHER_POINTS_CUDA_KERNEL_CUH
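The change above swaps a one-thread-per-element bounds check for a CUDA_1D_KERNEL_LOOP grid-stride loop, so correctness no longer depends on launching at least m threads along x. A minimal restatement of the pattern (toy kernel, illustration only):

// Works for any grid size: each thread advances by
// blockDim.x * gridDim.x until all n elements are covered.
__global__ void copy_kernel(const float *src, float *dst, int n) {
  CUDA_1D_KERNEL_LOOP(i, n) { dst[i] = src[i]; }
}
// e.g. copy_kernel<<<GET_BLOCKS(n), THREADS_PER_BLOCK>>>(src, dst, n);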
mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh

@@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n,
   // out: (B, C, npoints, nsample)
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int pt_idx = index / nsample;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
-
-  int sample_idx = index % nsample;
-
-  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
-  int in_idx = bs_idx * c * n + c_idx * n + idx[0];
-  int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
-                pt_idx * nsample + sample_idx;
-
-  out[out_idx] = points[in_idx];
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    int pt_idx = index / nsample;
+    int sample_idx = index % nsample;
+
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+    int out_idx = bs_idx * c * npoints * nsample +
+                  c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+
+    out[out_idx] = points[in_idx];
+  }
 }
 
 template <typename T>
@@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n,
   // grad_points: (B, C, N)
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int pt_idx = index / nsample;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
-
-  int sample_idx = index % nsample;
-  grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
-              pt_idx * nsample + sample_idx;
-  idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
-
-  atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+    int pt_idx = index / nsample;
+    if (bs_idx >= b || c_idx >= c) return;
+
+    int sample_idx = index % nsample;
+    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+                pt_idx * nsample + sample_idx;
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+
+    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+  }
 }
 
 #endif  // GROUP_POINTS_CUDA_KERNEL_CUH
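Note why the backward pass scatters through atomicAdd in both this file and gather_points: several (pt_idx, sample_idx) pairs can index the same source point, so a plain += would race across threads. A distilled toy sketch of that scatter-add idiom (not from the commit):

__global__ void scatter_add_kernel(const float *grad_out, const int *idx,
                                   float *grad_points, int count) {
  CUDA_1D_KERNEL_LOOP(i, count) {
    // idx[i] may repeat, so the accumulation must be atomic
    atomicAdd(grad_points + idx[i], grad_out[i]);
  }
}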
mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh

@@ -50,21 +50,17 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2,
 }
 
 __device__ inline int check_in_box2d(const float *box, const Point &p) {
-  // params: box (5) [x1, y1, x2, y2, angle]
-  const float MARGIN = 1e-5;
-  float center_x = (box[0] + box[2]) / 2;
-  float center_y = (box[1] + box[3]) / 2;
-  // rotate the point in the opposite direction of box
-  float angle_cos = cos(-box[4]), angle_sin = sin(-box[4]);
-  float rot_x =
-      (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x;
-  float rot_y =
-      (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y;
-
-  return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN &&
-          rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN);
+  // params: box (7) [x, y, z, dx, dy, dz, heading]
+  const float MARGIN = 1e-2;
+  float center_x = box[0], center_y = box[1];
+  // rotate the point in the opposite direction of box
+  float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);
+  float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+  float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+  return (fabs(rot_x) < box[3] / 2 + MARGIN &&
+          fabs(rot_y) < box[4] / 2 + MARGIN);
 }
 
 __device__ inline int intersection(const Point &p1, const Point &p0,
@@ -116,16 +112,19 @@ __device__ inline int point_cmp(const Point &a, const Point &b,
 }
 
 __device__ inline float box_overlap(const float *box_a, const float *box_b) {
-  // params: box_a (5) [x1, y1, x2, y2, angle]
-  // params: box_b (5) [x1, y1, x2, y2, angle]
-  float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3],
-        a_angle = box_a[4];
-  float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3],
-        b_angle = box_b[4];
-
-  Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2);
-  Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2);
+  // params box_a: [x, y, z, dx, dy, dz, heading]
+  // params box_b: [x, y, z, dx, dy, dz, heading]
+  float a_angle = box_a[6], b_angle = box_b[6];
+  float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
+        a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+  float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+  Point center_a(box_a[0], box_a[1]);
+  Point center_b(box_b[0], box_b[1]);
 
   Point box_a_corners[5];
   box_a_corners[0].set(a_x1, a_y1);
@@ -209,10 +208,10 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) {
 }
 
 __device__ inline float iou_bev(const float *box_a, const float *box_b) {
-  // params: box_a (5) [x1, y1, x2, y2, angle]
-  // params: box_b (5) [x1, y1, x2, y2, angle]
-  float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
-  float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
+  // params box_a: [x, y, z, dx, dy, dz, heading]
+  // params box_b: [x, y, z, dx, dy, dz, heading]
+  float sa = box_a[3] * box_a[4];
+  float sb = box_b[3] * box_b[4];
   float s_overlap = box_overlap(box_a, box_b);
   return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
 }
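The hunks above switch the BEV box layout from corner form (x1, y1, x2, y2, ry) to center form (x, y, z, dx, dy, dz, heading). A hypothetical host-side conversion between the two layouts, for illustration only (helper name is mine, and the z/dz fillers are assumptions; BEV overlap ignores them):

void corner_to_center_bev(const float in[5], float out[7]) {
  out[0] = (in[0] + in[2]) / 2.f;  // x center
  out[1] = (in[1] + in[3]) / 2.f;  // y center
  out[2] = 0.f;                    // z (assumed; unused in BEV overlap)
  out[3] = in[2] - in[0];          // dx
  out[4] = in[3] - in[1];          // dy
  out[5] = 1.f;                    // dz (assumed; unused in BEV overlap)
  out[6] = in[4];                  // heading
}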
@@ -220,149 +219,148 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) {
__global__
void
iou3d_boxes_overlap_bev_forward_cuda_kernel
(
const
int
num_a
,
const
float
*
boxes_a
,
const
int
num_b
,
const
float
*
boxes_b
,
float
*
ans_overlap
)
{
const
int
a_idx
=
blockIdx
.
y
*
THREADS_PER_BLOCK
+
threadIdx
.
y
;
const
int
b_idx
=
blockIdx
.
x
*
THREADS_PER_BLOCK
+
threadIdx
.
x
;
if
(
a_idx
>=
num_a
||
b_idx
>=
num_b
)
{
return
;
}
const
float
*
cur_box_a
=
boxes_a
+
a_idx
*
5
;
const
float
*
cur_box_b
=
boxes_b
+
b_idx
*
5
;
float
s_overlap
=
box_overlap
(
cur_box_a
,
cur_box_b
);
ans_overlap
[
a_idx
*
num_b
+
b_idx
]
=
s_overlap
;
}
__global__
void
iou3d_boxes_iou_bev_forward_cuda_kernel
(
const
int
num_a
,
const
float
*
boxes_a
,
const
int
num_b
,
const
float
*
boxes_b
,
float
*
ans_iou
)
{
const
int
a_idx
=
blockIdx
.
y
*
THREADS_PER_BLOCK
+
threadIdx
.
y
;
const
int
b_idx
=
blockIdx
.
x
*
THREADS_PER_BLOCK
+
threadIdx
.
x
;
// params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
CUDA_2D_KERNEL_LOOP
(
b_idx
,
num_b
,
a_idx
,
num_a
)
{
if
(
a_idx
>=
num_a
||
b_idx
>=
num_b
)
{
return
;
}
if
(
a_idx
>=
num_a
||
b_idx
>=
num_b
)
{
return
;
const
float
*
cur_box_a
=
boxes_a
+
a_idx
*
7
;
const
float
*
cur_box_b
=
boxes_b
+
b_idx
*
7
;
float
cur_overlap
=
box_overlap
(
cur_box_a
,
cur_box_b
);
ans_overlap
[
a_idx
*
num_b
+
b_idx
]
=
cur_overlap
;
}
const
float
*
cur_box_a
=
boxes_a
+
a_idx
*
5
;
const
float
*
cur_box_b
=
boxes_b
+
b_idx
*
5
;
float
cur_iou_bev
=
iou_bev
(
cur_box_a
,
cur_box_b
);
ans_iou
[
a_idx
*
num_b
+
b_idx
]
=
cur_iou_bev
;
}
__global__
void
nms_forward_cuda_kernel
(
const
int
boxes_num
,
const
float
nms_overlap_thresh
,
const
float
*
boxes
,
unsigned
long
long
*
mask
)
{
// params: boxes (N,
5
) [x
1
, y
1
,
x2, y2, ry
]
__global__
void
iou3d_
nms
3d
_forward_cuda_kernel
(
const
int
boxes_num
,
const
float
nms_overlap_thresh
,
const
float
*
boxes
,
unsigned
long
long
*
mask
)
{
// params: boxes (N,
7
) [x, y,
z, dx, dy, dz, heading
]
// params: mask (N, N/THREADS_PER_BLOCK_NMS)
const
int
blocks
=
(
boxes_num
+
THREADS_PER_BLOCK_NMS
-
1
)
/
THREADS_PER_BLOCK_NMS
;
CUDA_2D_KERNEL_BLOCK_LOOP
(
col_start
,
blocks
,
row_start
,
blocks
)
{
// if (row_start > col_start) return;
const
int
row_size
=
fminf
(
boxes_num
-
row_start
*
THREADS_PER_BLOCK_NMS
,
THREADS_PER_BLOCK_NMS
);
const
int
col_size
=
fminf
(
boxes_num
-
col_start
*
THREADS_PER_BLOCK_NMS
,
THREADS_PER_BLOCK_NMS
);
__shared__
float
block_boxes
[
THREADS_PER_BLOCK_NMS
*
7
];
if
(
threadIdx
.
x
<
col_size
)
{
block_boxes
[
threadIdx
.
x
*
7
+
0
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
7
+
0
];
block_boxes
[
threadIdx
.
x
*
7
+
1
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
7
+
1
];
block_boxes
[
threadIdx
.
x
*
7
+
2
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
7
+
2
];
block_boxes
[
threadIdx
.
x
*
7
+
3
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
7
+
3
];
block_boxes
[
threadIdx
.
x
*
7
+
4
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
7
+
4
];
block_boxes
[
threadIdx
.
x
*
7
+
5
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
7
+
5
];
block_boxes
[
threadIdx
.
x
*
7
+
6
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
7
+
6
];
}
__syncthreads
();
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
// if (row_start > col_start) return;
const
int
row_size
=
fminf
(
boxes_num
-
row_start
*
THREADS_PER_BLOCK_NMS
,
THREADS_PER_BLOCK_NMS
);
const
int
col_size
=
fminf
(
boxes_num
-
col_start
*
THREADS_PER_BLOCK_NMS
,
THREADS_PER_BLOCK_NMS
);
__shared__
float
block_boxes
[
THREADS_PER_BLOCK_NMS
*
5
];
if
(
threadIdx
.
x
<
col_size
)
{
block_boxes
[
threadIdx
.
x
*
5
+
0
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
5
+
0
];
block_boxes
[
threadIdx
.
x
*
5
+
1
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
5
+
1
];
block_boxes
[
threadIdx
.
x
*
5
+
2
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
5
+
2
];
block_boxes
[
threadIdx
.
x
*
5
+
3
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
5
+
3
];
block_boxes
[
threadIdx
.
x
*
5
+
4
]
=
boxes
[(
THREADS_PER_BLOCK_NMS
*
col_start
+
threadIdx
.
x
)
*
5
+
4
];
}
__syncthreads
();
if
(
threadIdx
.
x
<
row_size
)
{
const
int
cur_box_idx
=
THREADS_PER_BLOCK_NMS
*
row_start
+
threadIdx
.
x
;
const
float
*
cur_box
=
boxes
+
cur_box_idx
*
5
;
if
(
threadIdx
.
x
<
row_size
)
{
const
int
cur_box_idx
=
THREADS_PER_BLOCK_NMS
*
row_start
+
threadIdx
.
x
;
const
float
*
cur_box
=
boxes
+
cur_box_idx
*
7
;
int
i
=
0
;
unsigned
long
long
t
=
0
;
int
start
=
0
;
if
(
row_start
==
col_start
)
{
start
=
threadIdx
.
x
+
1
;
}
for
(
i
=
start
;
i
<
col_size
;
i
++
)
{
if
(
iou_bev
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
t
|=
1ULL
<<
i
;
int
i
=
0
;
unsigned
long
long
t
=
0
;
int
start
=
0
;
if
(
row_start
==
col_start
)
{
start
=
threadIdx
.
x
+
1
;
}
for
(
i
=
start
;
i
<
col_size
;
i
++
)
{
if
(
iou_bev
(
cur_box
,
block_boxes
+
i
*
7
)
>
nms_overlap_thresh
)
{
t
|=
1ULL
<<
i
;
}
}
const
int
col_blocks
=
(
boxes_num
+
THREADS_PER_BLOCK_NMS
-
1
)
/
THREADS_PER_BLOCK_NMS
;
mask
[
cur_box_idx
*
col_blocks
+
col_start
]
=
t
;
}
const
int
col_blocks
=
DIVUP
(
boxes_num
,
THREADS_PER_BLOCK_NMS
);
mask
[
cur_box_idx
*
col_blocks
+
col_start
]
=
t
;
}
}
 __device__ inline float iou_normal(float const *const a, float const *const b) {
-  float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
-  float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
+  // params: a: [x, y, z, dx, dy, dz, heading]
+  // params: b: [x, y, z, dx, dy, dz, heading]
+  float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),
+        right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+  float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),
+        bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
   float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
   float interS = width * height;
-  float Sa = (a[2] - a[0]) * (a[3] - a[1]);
-  float Sb = (b[2] - b[0]) * (b[3] - b[1]);
+  float Sa = a[3] * a[4];
+  float Sb = b[3] * b[4];
   return interS / fmaxf(Sa + Sb - interS, EPS);
 }
-__global__ void nms_normal_forward_cuda_kernel(const int boxes_num,
-                                               const float nms_overlap_thresh,
-                                               const float *boxes,
-                                               unsigned long long *mask) {
-  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
+__global__ void iou3d_nms3d_normal_forward_cuda_kernel(
+    const int boxes_num, const float nms_overlap_thresh, const float *boxes,
+    unsigned long long *mask) {
+  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
   // params: mask (N, N/THREADS_PER_BLOCK_NMS)
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-  // if (row_start > col_start) return;
-  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
-                             THREADS_PER_BLOCK_NMS);
-  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
-                             THREADS_PER_BLOCK_NMS);
-  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
-  if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 5 + 0] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
-    block_boxes[threadIdx.x * 5 + 1] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
-    block_boxes[threadIdx.x * 5 + 2] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
-    block_boxes[threadIdx.x * 5 + 3] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
-    block_boxes[threadIdx.x * 5 + 4] =
-        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
-  }
-  __syncthreads();
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
-    const float *cur_box = boxes + cur_box_idx * 5;
-    int i = 0;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
-    mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
+  const int blocks =
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+    // if (row_start > col_start) return;
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+                               THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+                               THREADS_PER_BLOCK_NMS);
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+    if (threadIdx.x < col_size) {
+      block_boxes[threadIdx.x * 7 + 0] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+      block_boxes[threadIdx.x * 7 + 1] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+      block_boxes[threadIdx.x * 7 + 2] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+      block_boxes[threadIdx.x * 7 + 3] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+      block_boxes[threadIdx.x * 7 + 4] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+      block_boxes[threadIdx.x * 7 + 5] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+      block_boxes[threadIdx.x * 7 + 6] =
+          boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+    if (threadIdx.x < row_size) {
+      const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+      const float *cur_box = boxes + cur_box_idx * 7;
+      int i = 0;
+      unsigned long long t = 0;
+      int start = 0;
+      if (row_start == col_start) {
+        start = threadIdx.x + 1;
+      }
+      for (i = start; i < col_size; i++) {
+        if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
+          t |= 1ULL << i;
+        }
+      }
+      const int col_blocks =
+          (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
+      mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+  }
+}
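(Note, not part of the diff: CUDA_2D_KERNEL_BLOCK_LOOP and CUDA_1D_KERNEL_LOOP are mmcv's grid-stride helpers from common_cuda_helper.hpp. They are defined along these lines — reproduced here only for reference; check the helper header in this commit for the exact form:)

// Grid-stride loops: the grid can be capped while the kernel still covers
// any problem size, which is why the bounds guards moved inside the loop.
#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)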
...
...
mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh
View file @ fdeee889
...
...
@@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample,
                                         const T *xyz, const T *new_xyz,
                                         int *__restrict__ idx, T *dist2) {
   int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || pt_idx >= m) return;
-
-  new_xyz += bs_idx * m * 3 + pt_idx * 3;
-  xyz += bs_idx * n * 3;
-  idx += bs_idx * m * nsample + pt_idx * nsample;
-  dist2 += bs_idx * m * nsample + pt_idx * nsample;
-
-  T new_x = new_xyz[0];
-  T new_y = new_xyz[1];
-  T new_z = new_xyz[2];
-
-  float best_dist[100];
-  int best_idx[100];
-  for (int i = 0; i < nsample; i++) {
-    best_dist[i] = 1e10;
-    best_idx[i] = 0;
-  }
-  for (int i = 0; i < n; i++) {
-    T x = xyz[i * 3 + 0];
-    T y = xyz[i * 3 + 1];
-    T z = xyz[i * 3 + 2];
-    T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
-           (new_z - z) * (new_z - z);
-    if (d2 < best_dist[0]) {
-      best_dist[0] = d2;
-      best_idx[0] = i;
-      reheap(best_dist, best_idx, nsample);
-    }
-  }
-  heap_sort(best_dist, best_idx, nsample);
-  for (int i = 0; i < nsample; i++) {
-    idx[i] = best_idx[i];
-    dist2[i] = best_dist[i];
-  }
-}
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+    if (bs_idx >= b) return;
+
+    new_xyz += bs_idx * m * 3 + pt_idx * 3;
+    xyz += bs_idx * n * 3;
+    idx += bs_idx * m * nsample + pt_idx * nsample;
+    dist2 += bs_idx * m * nsample + pt_idx * nsample;
+
+    T new_x = new_xyz[0];
+    T new_y = new_xyz[1];
+    T new_z = new_xyz[2];
+
+    float best_dist[100];
+    int best_idx[100];
+    for (int i = 0; i < nsample; i++) {
+      best_dist[i] = 1e10;
+      best_idx[i] = 0;
+    }
+    for (int i = 0; i < n; i++) {
+      T x = xyz[i * 3 + 0];
+      T y = xyz[i * 3 + 1];
+      T z = xyz[i * 3 + 2];
+      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+             (new_z - z) * (new_z - z);
+      if (d2 < best_dist[0]) {
+        best_dist[0] = d2;
+        best_idx[0] = i;
+        reheap(best_dist, best_idx, nsample);
+      }
+    }
+    heap_sort(best_dist, best_idx, nsample);
+    for (int i = 0; i < nsample; i++) {
+      idx[i] = best_idx[i];
+      dist2[i] = best_dist[i];
+    }
+  }
+}
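(Note, not part of the diff: best_dist/best_idx form a max-heap whose root best_dist[0] is the worst of the k candidates kept so far; reheap restores the heap after the root is overwritten by a closer point, and heap_sort orders the final answer. The sift-down below is only a sketch of what reheap must do — the actual helper is defined earlier in this header and may differ:)

// Minimal sift-down for a max-heap of (distance, index) pairs (sketch).
__device__ void sift_down_sketch(float *dist, int *idx, int k) {
  int root = 0;
  while (true) {
    int child = 2 * root + 1;  // left child
    if (child >= k) break;
    if (child + 1 < k && dist[child + 1] > dist[child]) child++;  // right
    if (dist[root] >= dist[child]) break;  // heap property restored
    float td = dist[root]; dist[root] = dist[child]; dist[child] = td;
    int ti = idx[root]; idx[root] = idx[child]; idx[child] = ti;
    root = child;
  }
}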
...
...
mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh
0 → 100644
View file @ fdeee889
// Copyright (c) OpenMMLab. All rights reserved
#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

#define MAXN 20
__device__ const float PI = 3.1415926;

struct Point {
  float x, y;
  __device__ Point() {}
  __device__ Point(float x, float y) : x(x), y(y) {}
};

__device__ inline void swap1(Point *a, Point *b) {
  Point temp;
  temp.x = a->x;
  temp.y = a->y;
  a->x = b->x;
  a->y = b->y;
  b->x = temp.x;
  b->y = temp.y;
}

__device__ inline float cross(Point o, Point a, Point b) {
  return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
}

__device__ inline float dis(Point a, Point b) {
  return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
}

__device__ inline void minBoundingRect(Point *ps, int n_points,
                                       float *minbox) {
  float convex_points[2][MAXN];
  for (int j = 0; j < n_points; j++) {
    convex_points[0][j] = ps[j].x;
  }
  for (int j = 0; j < n_points; j++) {
    convex_points[1][j] = ps[j].y;
  }

  Point edges[MAXN];
  float edges_angles[MAXN];
  float unique_angles[MAXN];
  int n_edges = n_points - 1;
  int n_unique = 0;
  int unique_flag = 0;

  for (int i = 0; i < n_edges; i++) {
    edges[i].x = ps[i + 1].x - ps[i].x;
    edges[i].y = ps[i + 1].y - ps[i].y;
  }
  for (int i = 0; i < n_edges; i++) {
    edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x);
    if (edges_angles[i] >= 0) {
      edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2);
    } else {
      edges_angles[i] =
          edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2);
    }
  }
  unique_angles[0] = edges_angles[0];
  n_unique += 1;
  for (int i = 1; i < n_edges; i++) {
    for (int j = 0; j < n_unique; j++) {
      if (edges_angles[i] == unique_angles[j]) {
        unique_flag += 1;
      }
    }
    if (unique_flag == 0) {
      unique_angles[n_unique] = edges_angles[i];
      n_unique += 1;
      unique_flag = 0;
    } else {
      unique_flag = 0;
    }
  }

  float minarea = 1e12;
  for (int i = 0; i < n_unique; i++) {
    float R[2][2];
    float rot_points[2][MAXN];
    R[0][0] = cos(unique_angles[i]);
    R[0][1] = sin(unique_angles[i]);
    R[1][0] = -sin(unique_angles[i]);
    R[1][1] = cos(unique_angles[i]);
    // R x Points
    for (int m = 0; m < 2; m++) {
      for (int n = 0; n < n_points; n++) {
        float sum = 0.0;
        for (int k = 0; k < 2; k++) {
          sum = sum + R[m][k] * convex_points[k][n];
        }
        rot_points[m][n] = sum;
      }
    }

    // xmin;
    float xmin, ymin, xmax, ymax;
    xmin = 1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
        continue;
      } else {
        if (rot_points[0][j] < xmin) {
          xmin = rot_points[0][j];
        }
      }
    }
    // ymin
    ymin = 1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
        continue;
      } else {
        if (rot_points[1][j] < ymin) {
          ymin = rot_points[1][j];
        }
      }
    }
    // xmax
    xmax = -1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) {
        continue;
      } else {
        if (rot_points[0][j] > xmax) {
          xmax = rot_points[0][j];
        }
      }
    }
    // ymax
    ymax = -1e12;
    for (int j = 0; j < n_points; j++) {
      if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) {
        continue;
      } else {
        if (rot_points[1][j] > ymax) {
          ymax = rot_points[1][j];
        }
      }
    }
    float area = (xmax - xmin) * (ymax - ymin);
    if (area < minarea) {
      minarea = area;
      minbox[0] = unique_angles[i];
      minbox[1] = xmin;
      minbox[2] = ymin;
      minbox[3] = xmax;
      minbox[4] = ymax;
    }
  }
}

// convex_find
__device__ inline void Jarvis(Point *in_poly, int &n_poly) {
  int n_input = n_poly;
  Point input_poly[20];
  for (int i = 0; i < n_input; i++) {
    input_poly[i].x = in_poly[i].x;
    input_poly[i].y = in_poly[i].y;
  }
  Point p_max, p_k;
  int max_index, k_index;
  int Stack[20], top1, top2;
  // float sign;
  double sign;
  Point right_point[10], left_point[10];
  for (int i = 0; i < n_poly; i++) {
    if (in_poly[i].y < in_poly[0].y ||
        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
      Point *j = &(in_poly[0]);
      Point *k = &(in_poly[i]);
      swap1(j, k);
    }
    if (i == 0) {
      p_max = in_poly[0];
      max_index = 0;
    }
    if (in_poly[i].y > p_max.y ||
        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
      p_max = in_poly[i];
      max_index = i;
    }
  }
  if (max_index == 0) {
    max_index = 1;
    p_max = in_poly[max_index];
  }

  k_index = 0, Stack[0] = 0, top1 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]],
                                             in_poly[i]) >
                                         dis(in_poly[Stack[top1]], p_k)))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top1++;
    Stack[top1] = k_index;
  }
  for (int i = 0; i <= top1; i++) {
    right_point[i] = in_poly[Stack[i]];
  }

  k_index = 0, Stack[0] = 0, top2 = 0;
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]],
                                            in_poly[i]) >
                                        dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }
  for (int i = top2 - 1; i >= 0; i--) {
    left_point[i] = in_poly[Stack[i]];
  }

  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
}

template <typename T>
__device__ inline void Findminbox(T const *const p, T *minpoints) {
  Point ps1[MAXN];
  Point convex[MAXN];
  for (int i = 0; i < 9; i++) {
    convex[i].x = p[i * 2];
    convex[i].y = p[i * 2 + 1];
  }
  int n_convex = 9;
  Jarvis(convex, n_convex);
  int n1 = n_convex;
  for (int i = 0; i < n1; i++) {
    ps1[i].x = convex[i].x;
    ps1[i].y = convex[i].y;
  }
  ps1[n1].x = convex[0].x;
  ps1[n1].y = convex[0].y;

  float minbbox[5] = {0};
  minBoundingRect(ps1, n1 + 1, minbbox);
  float angle = minbbox[0];
  float xmin = minbbox[1];
  float ymin = minbbox[2];
  float xmax = minbbox[3];
  float ymax = minbbox[4];
  float R[2][2];

  R[0][0] = cos(angle);
  R[0][1] = sin(angle);
  R[1][0] = -sin(angle);
  R[1][1] = cos(angle);

  minpoints[0] = xmax * R[0][0] + ymin * R[1][0];
  minpoints[1] = xmax * R[0][1] + ymin * R[1][1];
  minpoints[2] = xmin * R[0][0] + ymin * R[1][0];
  minpoints[3] = xmin * R[0][1] + ymin * R[1][1];
  minpoints[4] = xmin * R[0][0] + ymax * R[1][0];
  minpoints[5] = xmin * R[0][1] + ymax * R[1][1];
  minpoints[6] = xmax * R[0][0] + ymax * R[1][0];
  minpoints[7] = xmax * R[0][1] + ymax * R[1][1];
}

template <typename T>
__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes,
                                              const T *ex_boxes, T *minbox) {
  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T *cur_box = ex_boxes + index * 18;
    T *cur_min_box = minbox + index * 8;
    Findminbox(cur_box, cur_min_box);
  }
}
#endif  // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH
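(Note, not part of the diff: each thread takes one 9-point polygon — 18 floats — builds its convex hull with the Jarvis march, scans the distinct edge angles for the minimum-area enclosing rectangle, and writes the rectangle's four corners rotated back into the input frame. A minimal launch sketch, assuming the usual mmcv helpers GET_BLOCKS/THREADS_PER_BLOCK and illustrative pointer names:)

// ex_boxes_dev: (N, 18) flattened 9-point polygons on the device;
// minbox_dev:  (N, 8) output corner coordinates.
min_area_polygons_cuda_kernel<float>
    <<<GET_BLOCKS(num_polygons), THREADS_PER_BLOCK, 0, stream>>>(
        num_polygons, ex_boxes_dev, minbox_dev);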
mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh
View file @ fdeee889
...
...
@@ -14,11 +14,6 @@
#include "common_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
-const int CUDA_NUM_THREADS = 1024;
-inline int GET_BLOCKS(const int N, const int num_threads) {
-  return (N + num_threads - 1) / num_threads;
-}
-
 template <typename scalar_t>
 __device__ scalar_t ms_deform_attn_im2col_bilinear(
     const scalar_t *&bottom_data, const int &height, const int &width,
...
...
@@ -267,10 +262,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
-  __shared__ scalar_t cache_grad_attn_weight[blockSize];
-  unsigned int tid = threadIdx.x;
+  const int qid_stride = num_heads * channels;
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
     int _temp = index;
     const int c_col = _temp % channels;
     _temp /= channels;
...
...
@@ -285,11 +281,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
     int data_weight_ptr = sampling_index * num_levels * num_point;
     int data_loc_w_ptr = data_weight_ptr << 1;
     const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
+    scalar_t *grad_sampling_loc_out =
+        grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
     const int grad_weight_stride = 1;
     const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
     const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
     for (int l_col = 0; l_col < num_levels; ++l_col) {
...
...
@@ -326,23 +322,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
           _grad_h = cache_grad_sampling_loc[1],
           _grad_a = cache_grad_attn_weight[0];
       int sid = 2;
-      for (unsigned int tid = 1; tid < blockSize; ++tid) {
+      for (unsigned int _tid = 1; _tid < blockSize; ++_tid) {
         _grad_w += cache_grad_sampling_loc[sid];
         _grad_h += cache_grad_sampling_loc[sid + 1];
-        _grad_a += cache_grad_attn_weight[tid];
+        _grad_a += cache_grad_attn_weight[_tid];
         sid += 2;
       }

-      *grad_sampling_loc = _grad_w;
-      *(grad_sampling_loc + 1) = _grad_h;
-      *grad_attn_weight = _grad_a;
+      *grad_sampling_loc_out = _grad_w;
+      *(grad_sampling_loc_out + 1) = _grad_h;
+      *grad_attn_weight_out = _grad_a;
     }
     __syncthreads();

     data_weight_ptr += 1;
     data_loc_w_ptr += 2;
-    grad_attn_weight += grad_weight_stride;
-    grad_sampling_loc += grad_loc_stride;
+    grad_attn_weight_out += grad_weight_stride;
+    grad_sampling_loc_out += grad_loc_stride;
     }
   }
 }
...
...
@@ -357,10 +353,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
-  __shared__ scalar_t cache_grad_attn_weight[blockSize];
-  unsigned int tid = threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    unsigned int tid = threadIdx.x;
     int _temp = index;
     const int c_col = _temp % channels;
     _temp /= channels;
...
...
@@ -375,8 +371,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
     int data_weight_ptr = sampling_index * num_levels * num_point;
     int data_loc_w_ptr = data_weight_ptr << 1;
     const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
+    scalar_t *grad_sampling_loc_out =
+        grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
     const int grad_weight_stride = 1;
     const int grad_loc_stride = 2;
     const int qid_stride = num_heads * channels;
...
...
@@ -425,16 +422,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
     }

     if (tid == 0) {
-      *grad_sampling_loc = cache_grad_sampling_loc[0];
-      *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-      *grad_attn_weight = cache_grad_attn_weight[0];
+      *grad_sampling_loc_out = cache_grad_sampling_loc[0];
+      *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
+      *grad_attn_weight_out = cache_grad_attn_weight[0];
     }
     __syncthreads();

     data_weight_ptr += 1;
     data_loc_w_ptr += 2;
-    grad_attn_weight += grad_weight_stride;
-    grad_sampling_loc += grad_loc_stride;
+    grad_attn_weight_out += grad_weight_stride;
+    grad_sampling_loc_out += grad_loc_stride;
   }
 }
...
...
@@ -449,11 +446,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  extern __shared__ int _s[];
-  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
-  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-  unsigned int tid = threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    extern __shared__ int _s[];
+    scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+    scalar_t *cache_grad_attn_weight =
+        cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
     int _temp = index;
     const int c_col = _temp % channels;
     _temp /= channels;
...
...
@@ -468,8 +465,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
     int data_weight_ptr = sampling_index * num_levels * num_point;
     int data_loc_w_ptr = data_weight_ptr << 1;
     const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
+    scalar_t *grad_sampling_loc_out =
+        grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
     const int grad_weight_stride = 1;
     const int grad_loc_stride = 2;
     const int qid_stride = num_heads * channels;
...
...
@@ -509,23 +507,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(
           _grad_h = cache_grad_sampling_loc[1],
           _grad_a = cache_grad_attn_weight[0];
       int sid = 2;
-      for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
+      for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) {
         _grad_w += cache_grad_sampling_loc[sid];
         _grad_h += cache_grad_sampling_loc[sid + 1];
-        _grad_a += cache_grad_attn_weight[tid];
+        _grad_a += cache_grad_attn_weight[_tid];
         sid += 2;
       }

-      *grad_sampling_loc = _grad_w;
-      *(grad_sampling_loc + 1) = _grad_h;
-      *grad_attn_weight = _grad_a;
+      *grad_sampling_loc_out = _grad_w;
+      *(grad_sampling_loc_out + 1) = _grad_h;
+      *grad_attn_weight_out = _grad_a;
     }
     __syncthreads();

     data_weight_ptr += 1;
     data_loc_w_ptr += 2;
-    grad_attn_weight += grad_weight_stride;
-    grad_sampling_loc += grad_loc_stride;
+    grad_attn_weight_out += grad_weight_stride;
+    grad_sampling_loc_out += grad_loc_stride;
     }
   }
 }
...
...
@@ -540,11 +538,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  extern __shared__ int _s[];
-  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
-  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-  unsigned int tid = threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    extern __shared__ int _s[];
+    scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+    scalar_t *cache_grad_attn_weight =
+        cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
     int _temp = index;
     const int c_col = _temp % channels;
     _temp /= channels;
...
...
@@ -559,8 +557,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
     int data_weight_ptr = sampling_index * num_levels * num_point;
     int data_loc_w_ptr = data_weight_ptr << 1;
     const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
+    scalar_t *grad_sampling_loc_out =
+        grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
     const int grad_weight_stride = 1;
     const int grad_loc_stride = 2;
     const int qid_stride = num_heads * channels;
...
...
@@ -618,16 +617,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(
     }

     if (tid == 0) {
-      *grad_sampling_loc = cache_grad_sampling_loc[0];
-      *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-      *grad_attn_weight = cache_grad_attn_weight[0];
+      *grad_sampling_loc_out = cache_grad_sampling_loc[0];
+      *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1];
+      *grad_attn_weight_out = cache_grad_attn_weight[0];
     }
     __syncthreads();

     data_weight_ptr += 1;
     data_loc_w_ptr += 2;
-    grad_attn_weight += grad_weight_stride;
-    grad_sampling_loc += grad_loc_stride;
+    grad_attn_weight_out += grad_weight_stride;
+    grad_sampling_loc_out += grad_loc_stride;
   }
 }
...
...
@@ -642,11 +641,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
     const int channels, const int num_levels, const int num_query,
     const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
     scalar_t *grad_attn_weight) {
-  extern __shared__ int _s[];
-  scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
-  scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-  unsigned int tid = threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    extern __shared__ int _s[];
+    scalar_t *cache_grad_sampling_loc = reinterpret_cast<scalar_t *>(_s);
+    scalar_t *cache_grad_attn_weight =
+        cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
     int _temp = index;
     const int c_col = _temp % channels;
     _temp /= channels;
...
...
@@ -661,8 +660,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
     int data_weight_ptr = sampling_index * num_levels * num_point;
     int data_loc_w_ptr = data_weight_ptr << 1;
     const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
+    scalar_t *grad_sampling_loc_out =
+        grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
     const int grad_weight_stride = 1;
     const int grad_loc_stride = 2;
     const int qid_stride = num_heads * channels;
...
...
@@ -720,16 +720,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
     }

     if (tid == 0) {
-      atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
-      atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
-      atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+      atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]);
+      atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]);
+      atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]);
     }
     __syncthreads();

     data_weight_ptr += 1;
     data_loc_w_ptr += 2;
-    grad_attn_weight += grad_weight_stride;
-    grad_sampling_loc += grad_loc_stride;
+    grad_attn_weight_out += grad_weight_stride;
+    grad_sampling_loc_out += grad_loc_stride;
   }
 }
...
...
@@ -759,8 +759,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
     int data_weight_ptr = sampling_index * num_levels * num_point;
     int data_loc_w_ptr = data_weight_ptr << 1;
     const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
+    scalar_t *grad_sampling_loc_out =
+        grad_sampling_loc + (grad_sampling_ptr << 1);
+    scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr;
     const int grad_weight_stride = 1;
     const int grad_loc_stride = 2;
     const int qid_stride = num_heads * channels;
...
...
@@ -787,12 +788,12 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(
         ms_deform_attn_col2im_bilinear_gm(data_value_ptr, spatial_h,
                                           spatial_w, num_heads, channels,
                                           h_im, w_im, m_col, c_col, top_grad,
-                                          weight, grad_value_ptr,
-                                          grad_sampling_loc,
-                                          grad_attn_weight);
+                                          weight, grad_value_ptr,
+                                          grad_sampling_loc_out,
+                                          grad_attn_weight_out);
       }
       data_weight_ptr += 1;
       data_loc_w_ptr += 2;
-      grad_attn_weight += grad_weight_stride;
-      grad_sampling_loc += grad_loc_stride;
+      grad_attn_weight_out += grad_weight_stride;
+      grad_sampling_loc_out += grad_loc_stride;
     }
   }
 }
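(Note, not part of the diff: the common thread in the ms_deform_attn hunks above is pointer hygiene under grid-stride looping. Once the kernel body sits inside CUDA_1D_KERNEL_LOOP, one thread may process several `index` values, so advancing the grad_sampling_loc/grad_attn_weight parameters in place corrupts the base address for that thread's next iteration; the renamed _out pointers are recomputed from the untouched parameters at the top of every iteration. Illustrative sketch of the failure mode, not taken from the diff:)

// CUDA_1D_KERNEL_LOOP(index, n) {
//   grad_attn_weight += grad_sampling_ptr;  // old code: parameter mutated,
//   ...                                     // so the next index handled by
// }                                         // this thread starts from a
//                                           // shifted, wrong base address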
...
...
mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh
View file @ fdeee889
...
...
@@ -30,45 +30,88 @@ __device__ inline bool devIoU(float const *const a, float const *const b,
 __global__ void nms_cuda(const int n_boxes, const float iou_threshold,
                          const int offset, const float *dev_boxes,
                          unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-  const int tid = threadIdx.x;
-
-  if (row_start > col_start) return;
-
-  const int row_size =
-      fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
-  const int col_size =
-      fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-
-  __shared__ float block_boxes[threadsPerBlock * 4];
-  if (tid < col_size) {
-    block_boxes[tid * 4 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
-    block_boxes[tid * 4 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
-    block_boxes[tid * 4 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
-    block_boxes[tid * 4 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
-  }
-  __syncthreads();
-
-  if (tid < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + tid;
-    const float *cur_box = dev_boxes + cur_box_idx * 4;
-    int i = 0;
-    unsigned long long int t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = tid + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
-        t |= 1ULL << i;
-      }
-    }
-    dev_mask[cur_box_idx * gridDim.y + col_start] = t;
-  }
-}
+  int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
+  CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) {
+    const int tid = threadIdx.x;
+
+    if (row_start > col_start) return;
+
+    const int row_size =
+        fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+    const int col_size =
+        fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+    __shared__ float block_boxes[threadsPerBlock * 4];
+    if (tid < col_size) {
+      block_boxes[tid * 4 + 0] =
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
+      block_boxes[tid * 4 + 1] =
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
+      block_boxes[tid * 4 + 2] =
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
+      block_boxes[tid * 4 + 3] =
+          dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
+    }
+    __syncthreads();
+
+    if (tid < row_size) {
+      const int cur_box_idx = threadsPerBlock * row_start + tid;
+      const float *cur_box = dev_boxes + cur_box_idx * 4;
+      int i = 0;
+      unsigned long long int t = 0;
+      int start = 0;
+      if (row_start == col_start) {
+        start = tid + 1;
+      }
+      for (i = start; i < col_size; i++) {
+        if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
+          t |= 1ULL << i;
+        }
+      }
+      dev_mask[cur_box_idx * gridDim.y + col_start] = t;
+    }
+  }
+}
+
+__global__ void gather_keep_from_mask(bool *keep,
+                                      const unsigned long long *dev_mask,
+                                      const int n_boxes) {
+  const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock;
+  const int tid = threadIdx.x;
+
+  // mark the bboxes which have been removed.
+  extern __shared__ unsigned long long removed[];
+
+  // initialize removed.
+  for (int i = tid; i < col_blocks; i += blockDim.x) {
+    removed[i] = 0;
+  }
+  __syncthreads();
+
+  for (int nblock = 0; nblock < col_blocks; ++nblock) {
+    auto removed_val = removed[nblock];
+    __syncthreads();
+    const int i_offset = nblock * threadsPerBlock;
+#pragma unroll
+    for (int inblock = 0; inblock < threadsPerBlock; ++inblock) {
+      const int i = i_offset + inblock;
+      if (i >= n_boxes) break;
+      // select a candidate, check if it should kept.
+      if (!(removed_val & (1ULL << inblock))) {
+        if (tid == 0) {
+          // mark the output.
+          keep[i] = true;
+        }
+        auto p = dev_mask + i * col_blocks;
+        // remove all bboxes which overlap the candidate.
+        for (int j = tid; j < col_blocks; j += blockDim.x) {
+          if (j >= nblock) removed[j] |= p[j];
+        }
+        __syncthreads();
+        removed_val = removed[nblock];
+      }
+    }
+  }
+}
#endif  // NMS_CUDA_KERNEL_CUH
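(Note, not part of the diff: gather_keep_from_mask moves what used to be a host-side reduction of the pairwise suppression mask onto the device — a single block walks the column blocks, keeps the first unsuppressed candidate of each bit, and ORs that candidate's mask row into `removed`. For reference, a host-side equivalent of the same reduction — a sketch with illustrative names; threadsPerBlock is the 64-bit width of the mask words; boxes are assumed pre-sorted by score, as both kernels require:)

#include <vector>
// mask_cpu is the (n_boxes x col_blocks) array copied back from dev_mask.
std::vector<bool> gather_keep_cpu(
    const std::vector<unsigned long long> &mask_cpu, int n_boxes) {
  const int col_blocks = (n_boxes + 63) / 64;  // 64 boxes per mask word
  std::vector<unsigned long long> removed(col_blocks, 0);
  std::vector<bool> keep(n_boxes, false);
  for (int i = 0; i < n_boxes; i++) {
    if (!(removed[i / 64] & (1ULL << (i % 64)))) {
      keep[i] = true;  // i survives: suppress everything it overlaps
      const unsigned long long *p = mask_cpu.data() + (size_t)i * col_blocks;
      for (int j = i / 64; j < col_blocks; j++) removed[j] |= p[j];
    }
  }
  return keep;
}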
mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh
View file @ fdeee889
...
...
@@ -43,18 +43,16 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes,
   // (x_center, y_center, width, height, angle_degrees) here.
   __shared__ T block_boxes[threadsPerBlock * 5];
   if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 6 + 0] =
+    block_boxes[threadIdx.x * 5 + 0] =
         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
-    block_boxes[threadIdx.x * 6 + 1] =
+    block_boxes[threadIdx.x * 5 + 1] =
         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
-    block_boxes[threadIdx.x * 6 + 2] =
+    block_boxes[threadIdx.x * 5 + 2] =
         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
-    block_boxes[threadIdx.x * 6 + 3] =
+    block_boxes[threadIdx.x * 5 + 3] =
         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
-    block_boxes[threadIdx.x * 6 + 4] =
+    block_boxes[threadIdx.x * 5 + 4] =
         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
-    block_boxes[threadIdx.x * 6 + 5] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
   }
   __syncthreads();
...
...
@@ -71,7 +69,7 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes,
       // Instead of devIoU used by original horizontal nms, here
       // we use the single_box_iou_rotated function from
       // box_iou_rotated_utils.h
-      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 6, 0) >
+      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5, 0) >
           iou_threshold) {
         t |= 1ULL << i;
       }
...
...
mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh
View file @ fdeee889
...
...
@@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel(
   // (B, npoints), default -1
   int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= batch_size || pt_idx >= pts_num) return;
-
-  boxes += bs_idx * boxes_num * 7;
-  pts += bs_idx * pts_num * 3 + pt_idx * 3;
-  box_idx_of_points += bs_idx * pts_num + pt_idx;
-
-  T local_x = 0, local_y = 0;
-  int cur_in_flag = 0;
-  for (int k = 0; k < boxes_num; k++) {
-    cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
-    if (cur_in_flag) {
-      box_idx_of_points[0] = k;
-      break;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (bs_idx >= batch_size) return;
+
+    boxes += bs_idx * boxes_num * 7;
+    pts += bs_idx * pts_num * 3 + pt_idx * 3;
+    box_idx_of_points += bs_idx * pts_num + pt_idx;
+
+    T local_x = 0, local_y = 0;
+    int cur_in_flag = 0;
+    for (int k = 0; k < boxes_num; k++) {
+      cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+      if (cur_in_flag) {
+        box_idx_of_points[0] = k;
+        break;
+      }
     }
   }
 }
...
...
@@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel(
   // (B, npoints), default -1
   int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= batch_size || pt_idx >= pts_num) return;
-
-  boxes += bs_idx * boxes_num * 7;
-  pts += bs_idx * pts_num * 3 + pt_idx * 3;
-  box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
-
-  T local_x = 0, local_y = 0;
-  for (int k = 0; k < boxes_num; k++) {
-    const int cur_in_flag =
-        check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
-    if (cur_in_flag) {
-      box_idx_of_points[k] = 1;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (bs_idx >= batch_size) return;
+
+    boxes += bs_idx * boxes_num * 7;
+    pts += bs_idx * pts_num * 3 + pt_idx * 3;
+    box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
+
+    T local_x = 0, local_y = 0;
+    for (int k = 0; k < boxes_num; k++) {
+      const int cur_in_flag =
+          check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+      if (cur_in_flag) {
+        box_idx_of_points[k] = 1;
+      }
     }
   }
 }
...
...
mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh
0 → 100644
View file @ fdeee889
// Copyright (c) OpenMMLab. All rights reserved
#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

struct point {
  float x, y;
};

template <typename scalar_t>
__global__ void points_in_polygons_forward_cuda_kernel(
    const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,
    const int rows, const int cols, scalar_t *inside_flag) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int row = index / cols;
    int col = index % cols;

    const scalar_t *offset_vertex1 = vertex1 + row * 2;
    const scalar_t *offset_vertex2 = vertex2 + col * 8;

    point point_[1];
    point polygon[4];

    point_[0].x = offset_vertex1[0];
    point_[0].y = offset_vertex1[1];

    polygon[0].x = offset_vertex2[0];
    polygon[0].y = offset_vertex2[1];
    polygon[1].x = offset_vertex2[2];
    polygon[1].y = offset_vertex2[3];
    polygon[2].x = offset_vertex2[4];
    polygon[2].y = offset_vertex2[5];
    polygon[3].x = offset_vertex2[6];
    polygon[3].y = offset_vertex2[7];

    int nCross = 0;
    int i, j;
    float sx, sy, tx, ty, px, py, x;
    for (i = 0, j = 3; i < 4; j = i, i++) {
      sx = polygon[i].x;
      sy = polygon[i].y;
      tx = polygon[j].x;
      ty = polygon[j].y;
      px = point_[0].x;
      py = point_[0].y;

      if (py < min(sy, ty)) continue;
      if (py > max(sy, ty)) continue;

      if ((sx == px && sy == py) || (tx == px && ty == py)) {
        break;
      } else {
        if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
          x = sx + (py - sy) * (tx - sx) / (ty - sy);
          if (x == px) {
            break;
          }
          if (x > px) {
            nCross++;
          }
        }
      }
    }
    if (nCross % 2 == 1) {
      inside_flag[index] = 1.0;
    } else {
      inside_flag[index] = 0.0;
    }
    return;
  }
}

#endif  // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
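(Note, not part of the diff: the kernel decides point-in-polygon by crossing number — cast a ray from the point toward +x and count edge crossings; odd means inside. A host-side reference of the same test, useful for spot-checking the kernel; a sketch only — it mirrors the kernel's float comparisons but omits the kernel's early break when the point lies exactly on a vertex or on the computed intersection:)

#include <algorithm>
// quad: 8 floats, (x, y) for each of the 4 vertices.
bool point_in_quad(float px, float py, const float *quad) {
  int n_cross = 0;
  for (int i = 0, j = 3; i < 4; j = i, i++) {
    float sx = quad[i * 2], sy = quad[i * 2 + 1];
    float tx = quad[j * 2], ty = quad[j * 2 + 1];
    if (py < std::min(sy, ty) || py > std::max(sy, ty)) continue;
    if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
      // x-coordinate where the edge crosses the horizontal line through py
      float x = sx + (py - sy) * (tx - sx) / (ty - sy);
      if (x > px) n_cross++;
    }
  }
  return n_cross % 2 == 1;
}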
mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh
0 → 100644
View file @ fdeee889
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
// Distributed under terms of the MIT license.
#ifndef PRROI_POOL_CUDA_KERNEL_CUH
#define PRROI_POOL_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
                                                        const int h,
                                                        const int w,
                                                        const int height,
                                                        const int width) {
  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
  T retVal = overflow ? 0.0f : data[h * width + w];
  return retVal;
}

template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
  return (1.0f - abs(dh)) * (1.0f - abs(dw));
}

template <typename T>
__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
                                                                   T c1,
                                                                   T c2) {
  return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;
}

template <typename T>
__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
                                              const T w, const int height,
                                              const int width) {
  T retVal = 0.0f;
  int h1 = floorf(h);
  int w1 = floorf(w);
  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
  h1 = floorf(h) + 1;
  w1 = floorf(w);
  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
  h1 = floorf(h);
  w1 = floorf(w) + 1;
  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
  h1 = floorf(h) + 1;
  w1 = floorf(w) + 1;
  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
  return retVal;
}

template <typename T>
__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
                                               const int s_h, const int s_w,
                                               const int e_h, const int e_w,
                                               const T y0, const T x0,
                                               const T y1, const T x1,
                                               const int h0, const int w0) {
  T alpha, beta, lim_alpha, lim_beta, tmp;
  T sum_out = 0;

  alpha = x0 - T(s_w);
  beta = y0 - T(s_h);
  lim_alpha = x1 - T(s_w);
  lim_beta = y1 - T(s_h);
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;

  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;

  alpha = x0 - T(s_w);
  beta = T(e_h) - y1;
  lim_alpha = x1 - T(s_w);
  lim_beta = T(e_h) - y0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;

  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;

  return sum_out;
}

template <typename T>
__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
                                                  const int h, const int w,
                                                  const int height,
                                                  const int width,
                                                  const T coeff) {
  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
  if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);
}

template <typename T>
__device__ static void PrRoIPoolingMatDistributeDiff(
    T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
    const int e_w, const T y0, const T x0, const T y1, const T x1,
    const int h0, const int w0) {
  T alpha, beta, lim_alpha, lim_beta, tmp;

  alpha = x0 - T(s_w);
  beta = y0 - T(s_h);
  lim_alpha = x1 - T(s_w);
  lim_beta = y1 - T(s_h);
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);

  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);

  alpha = x0 - T(s_w);
  beta = T(e_h) - y1;
  lim_alpha = x1 - T(s_w);
  lim_beta = T(e_h) - y0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);

  alpha = T(e_w) - x1;
  lim_alpha = T(e_w) - x0;
  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
         0.5f * alpha * alpha) *
        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
}

template <typename T>
__global__ void prroi_pool_forward_cuda_kernel(
    const int nthreads, const T *input, const T *rois, T *output,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T *offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];
    T roi_x1 = offset_rois[1] * spatial_scale;
    T roi_y1 = offset_rois[2] * spatial_scale;
    T roi_x2 = offset_rois[3] * spatial_scale;
    T roi_y2 = offset_rois[4] * spatial_scale;

    T roi_width = max(roi_x2 - roi_x1, ((T)0.0));
    T roi_height = max(roi_y2 - roi_y1, ((T)0.0));
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    const T *this_data =
        input + (roi_batch_ind * channels + c) * height * width;
    T *this_out = output + index;

    T bin_x1 = roi_x1 + bin_size_w * pw;
    T bin_y1 = roi_y1 + bin_size_h * ph;
    T bin_x2 = bin_x1 + bin_size_w;
    T bin_y2 = bin_y1 + bin_size_h;

    T bin_size = max(T(0.0), bin_size_w * bin_size_h);
    if (bin_size == 0) {
      *this_out = 0;
      continue;
    }

    T sum_out = 0;
    int start_x, start_y, end_x, end_y;
    start_x = floorf(bin_x1);
    end_x = ceilf(bin_x2);
    start_y = floorf(bin_y1);
    end_y = ceilf(bin_y2);

    for (int bin_x = start_x; bin_x < end_x; ++bin_x)
      for (int bin_y = start_y; bin_y < end_y; ++bin_y)
        sum_out += PrRoIPoolingMatCalculation(
            this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,
            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)),
            height, width);
    *this_out = sum_out / bin_size;
  }
}

template <typename T>
__global__ void prroi_pool_backward_cuda_kernel(
    const int nthreads, const T *grad_output, const T *rois, T *grad_input,
    const int pooled_height, const int pooled_width, const T spatial_scale,
    const int channels, const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    rois += n * 5;
    int roi_batch_ind = rois[0];
    T roi_x1 = rois[1] * spatial_scale;
    T roi_y1 = rois[2] * spatial_scale;
    T roi_x2 = rois[3] * spatial_scale;
    T roi_y2 = rois[4] * spatial_scale;

    T roi_width = max(roi_x2 - roi_x1, (T)0);
    T roi_height = max(roi_y2 - roi_y1, (T)0);
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    const T *this_out_grad = grad_output + index;
    T *this_data_grad =
        grad_input + (roi_batch_ind * channels + c) * height * width;

    T bin_x1 = roi_x1 + bin_size_w * pw;
    T bin_y1 = roi_y1 + bin_size_h * ph;
    T bin_x2 = bin_x1 + bin_size_w;
    T bin_y2 = bin_y1 + bin_size_h;

    T bin_size = max(T(0.0), bin_size_w * bin_size_h);
    T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size;

    int start_x, start_y, end_x, end_y;
    start_x = floorf(bin_x1);
    end_x = ceilf(bin_x2);
    start_y = floorf(bin_y1);
    end_y = ceilf(bin_y2);

    for (int bin_x = start_x; bin_x < end_x; ++bin_x)
      for (int bin_y = start_y; bin_y < end_y; ++bin_y)
        PrRoIPoolingMatDistributeDiff(
            this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,
            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)),
            height, width);
  }
}

template <typename T>
__global__ void prroi_pool_coor_backward_cuda_kernel(
    const int nthreads, const T *output, const T *grad_output, const T *input,
    const T *rois, T *grad_rois, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int channels,
    const int height, const int width) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    rois += n * 5;
    int roi_batch_ind = rois[0];
    T roi_x1 = rois[1] * spatial_scale;
    T roi_y1 = rois[2] * spatial_scale;
    T roi_x2 = rois[3] * spatial_scale;
    T roi_y2 = rois[4] * spatial_scale;

    T roi_width = max(roi_x2 - roi_x1, (T)0);
    T roi_height = max(roi_y2 - roi_y1, (T)0);
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    const T output_grad_val = grad_output[index];
    const T *this_input_data =
        input + (roi_batch_ind * channels + c) * height * width;
    const T output_val = output[index];
    T *this_rois_grad = grad_rois + n * 5;

    T bin_x1 = roi_x1 + bin_size_w * pw;
    T bin_y1 = roi_y1 + bin_size_h * ph;
    T bin_x2 = bin_x1 + bin_size_w;
    T bin_y2 = bin_y1 + bin_size_h;

    T bin_size = max(T(0.0), bin_size_w * bin_size_h);
    T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;

    // WARNING: to be discussed
    if (sum_out == 0) return;

    int start_x, start_y, end_x, end_y;
    start_x = floorf(bin_x1);
    end_x = ceilf(bin_x2);
    start_y = floorf(bin_y1);
    end_y = ceilf(bin_y2);

    T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
    for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
      grad_x1_y += PrRoIPoolingSingleCoorIntegral(
          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,
                                    height, width));
      grad_x2_y += PrRoIPoolingSingleCoorIntegral(
          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,
                                    height, width));
    }
    for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
      grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),
                                    height, width));
      grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),
                                    height, width),
          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),
                                    height, width));
    }

    T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
    T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
    T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
    T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;

    partial_x1 = partial_x1 / bin_size * spatial_scale;
    partial_x2 = partial_x2 / bin_size * spatial_scale;
    partial_y1 = partial_y1 / bin_size * spatial_scale;
    partial_y2 = partial_y2 / bin_size * spatial_scale;

    // (index, x1, y1, x2, y2)
    this_rois_grad[0] = 0;
    atomicAdd(this_rois_grad + 1,
              (partial_x1 * (1.0f - T(pw) / pooled_width) +
               partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
                  output_grad_val);
    atomicAdd(this_rois_grad + 2,
              (partial_y1 * (1.0f - T(ph) / pooled_height) +
               partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
                  output_grad_val);
    atomicAdd(this_rois_grad + 3,
              (partial_x2 * T(pw + 1) / pooled_width +
               partial_x1 * T(pw) / pooled_width) *
                  output_grad_val);
    atomicAdd(this_rois_grad + 4,
              (partial_y2 * T(ph + 1) / pooled_height +
               partial_y1 * T(ph) / pooled_height) *
                  output_grad_val);
  }
}
#endif  // ROI_POOL_CUDA_KERNEL_CUH
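(Note, not part of the diff: a quick sanity check on PrRoIPoolingSingleCoorIntegral(s, t, c1, c2). It is the closed-form integral of the linear interpolation f(u) = c1 + (c2 - c1) * u over [s, t], i.e. 0.5 * (t^2 - s^2) * (c2 - c1) + (t - s) * c1. For s = 0, t = 1, c1 = 2, c2 = 4 it returns 0.5 * 1 * 2 + 1 * 2 = 3, the mean of c1 and c2 over a unit interval — exactly what exact (integral) pooling should produce. PrRoIPoolingMatCalculation applies the same idea in 2D: each of its four tmp factors is that polynomial evaluated per pixel corner, so the bin average needs no sampling-grid hyperparameter.)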
mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh
0 → 100644
View file @ fdeee889
// Modified from
// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu
#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#include <float.h>
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif // MMCV_USE_PARROTS
/*** Forward ***/
template
<
typename
scalar_t
>
__global__
void
riroi_align_rotated_forward_cuda_kernel
(
const
int
nthreads
,
const
scalar_t
*
bottom_data
,
const
scalar_t
*
bottom_rois
,
const
scalar_t
spatial_scale
,
const
int
num_samples
,
const
bool
clockwise
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
num_orientations
,
scalar_t
*
top_data
)
{
CUDA_1D_KERNEL_LOOP
(
index
,
nthreads
)
{
// (n, c, ph, pw) is an element in the pooled output
int
pw
=
index
%
pooled_width
;
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
int
o
=
(
index
/
pooled_width
/
pooled_height
)
%
num_orientations
;
int
c
=
(
index
/
pooled_width
/
pooled_height
/
num_orientations
)
%
channels
;
int
n
=
index
/
pooled_width
/
pooled_height
/
num_orientations
/
channels
;
const
scalar_t
*
offset_bottom_rois
=
bottom_rois
+
n
*
6
;
int
roi_batch_ind
=
offset_bottom_rois
[
0
];
// Do not using rounding; this implementation detail is critical
scalar_t
roi_center_w
=
offset_bottom_rois
[
1
]
*
spatial_scale
;
scalar_t
roi_center_h
=
offset_bottom_rois
[
2
]
*
spatial_scale
;
scalar_t
roi_width
=
offset_bottom_rois
[
3
]
*
spatial_scale
;
scalar_t
roi_height
=
offset_bottom_rois
[
4
]
*
spatial_scale
;
// scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
scalar_t
theta
=
offset_bottom_rois
[
5
];
// Force malformed ROIs to be 1x1
roi_width
=
max
(
roi_width
,
(
scalar_t
)
1.
);
roi_height
=
max
(
roi_height
,
(
scalar_t
)
1.
);
scalar_t
bin_size_h
=
static_cast
<
scalar_t
>
(
roi_height
)
/
static_cast
<
scalar_t
>
(
pooled_height
);
scalar_t
bin_size_w
=
static_cast
<
scalar_t
>
(
roi_width
)
/
static_cast
<
scalar_t
>
(
pooled_width
);
// find aligned index
scalar_t
ind_float
=
theta
*
num_orientations
/
(
2
*
M_PI
);
int
ind
=
floorf
(
ind_float
);
scalar_t
l_var
=
ind_float
-
(
scalar_t
)
ind
;
scalar_t
r_var
=
1.0
-
l_var
;
// correct start channel
ind
=
(
ind
+
num_orientations
)
%
num_orientations
;
// rotated channel
int
ind_rot
=
(
o
-
ind
+
num_orientations
)
%
num_orientations
;
int
ind_rot_plus
=
(
ind_rot
+
1
+
num_orientations
)
%
num_orientations
;
const
scalar_t
*
offset_bottom_data
=
bottom_data
+
(
roi_batch_ind
*
channels
*
num_orientations
+
c
*
num_orientations
+
ind_rot
)
*
height
*
width
;
const
scalar_t
*
offset_bottom_data_plus
=
bottom_data
+
(
roi_batch_ind
*
channels
*
num_orientations
+
c
*
num_orientations
+
ind_rot_plus
)
*
height
*
width
;
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
num_samples
>
0
)
?
num_samples
:
ceilf
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
num_samples
>
0
)
?
num_samples
:
ceilf
(
roi_width
/
pooled_width
);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
if
(
clockwise
)
{
theta
=
-
theta
;
// If clockwise, the angle needs to be reversed.
}
scalar_t
roi_start_h
=
-
roi_height
/
2.0
;
scalar_t
roi_start_w
=
-
roi_width
/
2.0
;
scalar_t
cosscalar_theta
=
cos
(
theta
);
scalar_t
sinscalar_theta
=
sin
(
theta
);
// We do average (integral) pooling inside a bin
const
scalar_t
count
=
max
(
roi_bin_grid_h
*
roi_bin_grid_w
,
1
);
// e.g. = 4
scalar_t
output_val
=
0.
;
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
{
// e.g., iy = 0, 1
const
scalar_t
yy
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
scalar_t
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
scalar_t
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
const
scalar_t
xx
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
scalar_t
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
scalar_t
>
(
roi_bin_grid_w
);
// Rotate by theta (counterclockwise) around the center and translate
scalar_t
y
=
yy
*
cosscalar_theta
-
xx
*
sinscalar_theta
+
roi_center_h
;
scalar_t
x
=
yy
*
sinscalar_theta
+
xx
*
cosscalar_theta
+
roi_center_w
;
scalar_t
val
=
bilinear_interpolate
<
scalar_t
>
(
offset_bottom_data
,
height
,
width
,
y
,
x
,
index
);
scalar_t
val_plus
=
bilinear_interpolate
<
scalar_t
>
(
offset_bottom_data_plus
,
height
,
width
,
y
,
x
,
index
);
output_val
+=
r_var
*
val
+
l_var
*
val_plus
;
}
}
output_val
/=
count
;
top_data
[
index
]
=
output_val
;
}
}
/*** Backward ***/
template <typename scalar_t>
__global__ void riroi_align_rotated_backward_cuda_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int num_samples, const bool clockwise,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width,
    const int num_orientations, scalar_t *bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int o = (index / pooled_width / pooled_height) % num_orientations;
    int c =
        (index / pooled_width / pooled_height / num_orientations) % channels;
    int n = index / pooled_width / pooled_height / num_orientations / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not round
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];

    // Force malformed ROIs to be 1x1
    roi_width = max(roi_width, (scalar_t)1.);
    roi_height = max(roi_height, (scalar_t)1.);
    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
                          static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w =
        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    // find aligned index
    scalar_t ind_float = theta * num_orientations / (2 * M_PI);
    int ind = floorf(ind_float);
    scalar_t l_var = ind_float - (scalar_t)ind;
    scalar_t r_var = 1.0 - l_var;
    // correct start channel
    ind = (ind + num_orientations) % num_orientations;
    // rotated channel
    int ind_rot = (o - ind + num_orientations) % num_orientations;
    int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations;
    scalar_t *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot) *
                          height * width;
    scalar_t *offset_bottom_diff_plus =
        bottom_diff + (roi_batch_ind * channels * num_orientations +
                       c * num_orientations + ind_rot_plus) *
                          height * width;
    int top_offset =
        (n * channels * num_orientations + c * num_orientations + o) *
        pooled_height * pooled_width;
    const scalar_t *offset_top_diff = top_diff + top_offset;
    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h =
        (num_samples > 0) ? num_samples
                          : ceilf(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosTheta = cos(theta);
    scalar_t sinTheta = sin(theta);

    // We do average (integral) pooling inside a bin
    const scalar_t count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      const scalar_t yy =
          roi_start_h + ph * bin_size_h +
          static_cast<scalar_t>(iy + .5f) * bin_size_h /
              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx =
            roi_start_w + pw * bin_size_w +
            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;
        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;

        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2,
                                                w3, w4, x_low, x_high, y_low,
                                                y_high, index);
        scalar_t g1 = top_diff_this_bin * w1 / count;
        scalar_t g2 = top_diff_this_bin * w2 / count;
        scalar_t g3 = top_diff_this_bin * w3 / count;
        scalar_t g4 = top_diff_this_bin * w4 / count;
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var);
          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var);
          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var);
          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var);
          atomicAdd(offset_bottom_diff_plus + y_low * width + x_low,
                    g1 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_low * width + x_high,
                    g2 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_high * width + x_low,
                    g3 * l_var);
          atomicAdd(offset_bottom_diff_plus + y_high * width + x_high,
                    g4 * l_var);
        }  // if
      }    // ix
    }      // iy
  }        // CUDA_1D_KERNEL_LOOP
}  // RiRoIAlignBackward

#endif  // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
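Note: the host-side launcher is not part of this header. A minimal sketch of how the backward kernel would be dispatched, assuming the `GET_BLOCKS` / `THREADS_PER_BLOCK` helpers from common_cuda_helper.hpp and a float instantiation (`riroi_align_rotated_backward_launch` is a hypothetical name):

// Hypothetical launcher sketch; helper macros are assumed, not defined here.
void riroi_align_rotated_backward_launch(
    const float *top_diff, const float *rois, float *bottom_diff, int num_rois,
    int channels, int num_orientations, int height, int width,
    int pooled_height, int pooled_width, float spatial_scale, int num_samples,
    bool clockwise, cudaStream_t stream) {
  // One thread per element of the pooled gradient tensor.
  const int output_size =
      num_rois * channels * num_orientations * pooled_height * pooled_width;
  riroi_align_rotated_backward_cuda_kernel<float>
      <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
          output_size, top_diff, rois, spatial_scale, num_samples, clockwise,
          channels, height, width, pooled_height, pooled_width,
          num_orientations, bottom_diff);
}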
mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh
View file @
fdeee889
...
...
@@ -20,7 +20,7 @@ template <typename scalar_t>
 __global__ void roi_align_rotated_forward_cuda_kernel(
     const int nthreads, const scalar_t *bottom_data,
     const scalar_t *bottom_rois, const scalar_t spatial_scale,
-    const int sample_num, const bool aligned, const bool clockwise,
+    const int sampling_ratio, const bool aligned, const bool clockwise,
     const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, scalar_t *top_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
...
...
@@ -58,11 +58,11 @@ __global__ void roi_align_rotated_forward_cuda_kernel(
         bottom_data + (roi_batch_ind * channels + c) * height * width;
 
     // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h = (sample_num > 0)
-                             ? sample_num
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
                              : ceilf(roi_height / pooled_height);  // e.g., = 2
     int roi_bin_grid_w =
-        (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width);
+        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);
 
     // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
     // Appropriate translation needs to be applied after.
...
...
@@ -104,7 +104,7 @@ __global__ void roi_align_rotated_forward_cuda_kernel(
 template <typename scalar_t>
 __global__ void roi_align_rotated_backward_cuda_kernel(
     const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
-    const scalar_t spatial_scale, const int sample_num, const bool aligned,
+    const scalar_t spatial_scale, const int sampling_ratio, const bool aligned,
     const bool clockwise, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     scalar_t *bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
...
...
@@ -146,11 +146,11 @@ __global__ void roi_align_rotated_backward_cuda_kernel(
     const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
 
     // We use roi_bin_grid to sample the grid and mimic integral
-    int roi_bin_grid_h = (sample_num > 0)
-                             ? sample_num
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
                              : ceilf(roi_height / pooled_height);  // e.g., = 2
     int roi_bin_grid_w =
-        (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width);
+        (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width);
 
     // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
     // Appropriate translation needs to be applied after.
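Note: every hunk in this file is the same rename, `sample_num` to `sampling_ratio`, with unchanged semantics: a positive value fixes the number of samples per bin in each direction, and a non-positive value falls back to an adaptive grid derived from the bin size. A small illustrative helper (not part of the file) capturing the rule:

#include <cmath>

// Sampling-grid rule controlled by the renamed parameter.
inline int bin_grid(int sampling_ratio, float roi_extent, int pooled_extent) {
  return (sampling_ratio > 0)
             ? sampling_ratio
             : static_cast<int>(ceilf(roi_extent / pooled_extent));
}
// e.g. bin_grid(2, 14.f, 7) == 2; bin_grid(0, 14.f, 7) == 2;
//      bin_grid(0, 15.f, 7) == 3 (adaptive fallback rounds up).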
...
...
mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh
View file @
fdeee889
...
...
@@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num,
   // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N,
   // npoints): -1 means point does not in this box, otherwise: encode (x_idxs,
   // y_idxs, z_idxs) by binary bit
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
-  if (pt_idx >= pts_num || box_idx >= boxes_num) return;
-
-  pts += pt_idx * 3;
-  rois += box_idx * 7;
-  pts_mask += box_idx * pts_num + pt_idx;
-
-  T local_x = 0, local_y = 0;
-  int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
-
-  pts_mask[0] = -1;
-  if (cur_in_flag > 0) {
-    T local_z = pts[2] - rois[2];
-    T x_size = rois[3], y_size = rois[4], z_size = rois[5];
-
-    T x_res = x_size / out_x;
-    T y_res = y_size / out_y;
-    T z_res = z_size / out_z;
-
-    unsigned int x_idx = int((local_x + x_size / 2) / x_res);
-    unsigned int y_idx = int((local_y + y_size / 2) / y_res);
-    unsigned int z_idx = int(local_z / z_res);
-
-    x_idx = min(max(x_idx, 0), out_x - 1);
-    y_idx = min(max(y_idx, 0), out_y - 1);
-    z_idx = min(max(z_idx, 0), out_z - 1);
-
-    unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
-    pts_mask[0] = idx_encoding;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (box_idx >= boxes_num) return;
+
+    pts += pt_idx * 3;
+    rois += box_idx * 7;
+    pts_mask += box_idx * pts_num + pt_idx;
+
+    T local_x = 0, local_y = 0;
+    int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y);
+
+    pts_mask[0] = -1;
+    if (cur_in_flag > 0) {
+      T local_z = pts[2] - rois[2];
+      T x_size = rois[3], y_size = rois[4], z_size = rois[5];
+
+      T x_res = x_size / out_x;
+      T y_res = y_size / out_y;
+      T z_res = z_size / out_z;
+
+      unsigned int x_idx = int((local_x + x_size / 2) / x_res);
+      unsigned int y_idx = int((local_y + y_size / 2) / y_res);
+      unsigned int z_idx = int(local_z / z_res);
+
+      x_idx = min(max(x_idx, 0), out_x - 1);
+      y_idx = min(max(y_idx, 0), out_y - 1);
+      z_idx = min(max(z_idx, 0), out_z - 1);
+
+      unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx;
+      pts_mask[0] = idx_encoding;
+    }
   }
 }
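Note: this refactor replaces one-thread-per-point indexing with CUDA_1D_KERNEL_LOOP, a grid-stride loop, so the kernel stays correct even when the launch grid is smaller than pts_num. The macro is provided by common_cuda_helper.hpp in this commit; its usual grid-stride shape is:

// Standard grid-stride loop shape (illustrative; see common_cuda_helper.hpp
// for the definition actually used by this commit).
#define CUDA_1D_KERNEL_LOOP(i, n)                                \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

The early `return` on `box_idx >= boxes_num` inside the loop is safe here because the predicate is loop-invariant: if it holds for one iteration, it holds for all of them.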
...
...
@@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num,
                                              T *pts_idx_of_voxels) {
   // params pts_mask: (N, npoints) 0 or 1
   // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
-  int box_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (box_idx >= boxes_num) return;
-
-  int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
-
-  for (int k = 0; k < pts_num; k++) {
-    if (pts_mask[box_idx * pts_num + k] != -1) {
-      unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
-      unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
-      unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
-      unsigned int z_idx = idx_encoding & 0xFF;
-      unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
-                                 y_idx * out_z * max_pts_each_voxel +
-                                 z_idx * max_pts_each_voxel;
-      unsigned int cnt = pts_idx_of_voxels[base_offset];
-      if (cnt < max_num_pts) {
-        pts_idx_of_voxels[base_offset + cnt + 1] = k;
-        pts_idx_of_voxels[base_offset]++;
+  CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) {
+    int max_num_pts = max_pts_each_voxel - 1;  // index 0 is the counter
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel;
+
+    for (int k = 0; k < pts_num; k++) {
+      if (pts_mask[box_idx * pts_num + k] != -1) {
+        unsigned int idx_encoding = pts_mask[box_idx * pts_num + k];
+        unsigned int x_idx = (idx_encoding >> 16) & 0xFF;
+        unsigned int y_idx = (idx_encoding >> 8) & 0xFF;
+        unsigned int z_idx = idx_encoding & 0xFF;
+        unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel +
+                                   y_idx * out_z * max_pts_each_voxel +
+                                   z_idx * max_pts_each_voxel;
+        unsigned int cnt = pts_idx_of_voxels[base_offset];
+        if (cnt < max_num_pts) {
+          pts_idx_of_voxels[base_offset + cnt + 1] = k;
+          pts_idx_of_voxels[base_offset]++;
+        }
       }
     }
   }
 }
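Note: collect_inside_pts_for_box3d consumes the bit-packed voxel coordinates that generate_pts_mask_for_box3d produced, 8 bits per axis with x in the high bits. A self-contained round trip of that packing (illustrative, not part of the file):

#include <cassert>

// Round trip of the 8-bit-per-axis packing shared by the two kernels above.
unsigned int encode_voxel(unsigned int x, unsigned int y, unsigned int z) {
  return (x << 16) + (y << 8) + z;  // each index must fit in 8 bits (< 256)
}

int main() {
  unsigned int e = encode_voxel(3, 5, 7);
  assert(((e >> 16) & 0xFF) == 3);  // x_idx, as decoded in the kernel
  assert(((e >> 8) & 0xFF) == 5);   // y_idx
  assert((e & 0xFF) == 7);          // z_idx
  return 0;
}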
...
...
@@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
-
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
-                       offset_base * max_pts_each_voxel;
-  pooled_features += box_idx * out_x * out_y * out_z * channels +
-                     offset_base * channels + channel_idx;
-  argmax += box_idx * out_x * out_y * out_z * channels +
-            offset_base * channels + channel_idx;
-
-  int argmax_idx = -1;
-  float max_val = -1e50;
-
-  int total_pts = pts_idx_of_voxels[0];
-
-  for (int k = 1; k <= total_pts; k++) {
-    if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) {
-      max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
-      argmax_idx = pts_idx_of_voxels[k];
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
+    pooled_features += box_idx * out_x * out_y * out_z * channels +
+                       offset_base * channels + channel_idx;
+    argmax += box_idx * out_x * out_y * out_z * channels +
+              offset_base * channels + channel_idx;
+
+    int argmax_idx = -1;
+    float max_val = -1e50;
+
+    int total_pts = pts_idx_of_voxels[0];
+
+    for (int k = 1; k <= total_pts; k++) {
+      if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] >
+          max_val) {
+        max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+        argmax_idx = pts_idx_of_voxels[k];
+      }
     }
-  }
 
-  if (argmax_idx != -1) {
-    pooled_features[0] = max_val;
+    if (argmax_idx != -1) {
+      pooled_features[0] = max_val;
+    }
+    argmax[0] = argmax_idx;
   }
-  argmax[0] = argmax_idx;
 }
 
 template <typename T>
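Note: all four pooling kernels in this file compute their pointers with the same flattening of the (N, out_x, out_y, out_z, *) layout. Pulled out as a standalone helper for clarity (`voxel_offset` is hypothetical, not in the file):

// Flat offset into a (N, out_x, out_y, out_z, stride) tensor. The stride is
// max_pts_each_voxel for pts_idx_of_voxels and channels for the feature maps.
inline int voxel_offset(int box_idx, int x_idx, int y_idx, int z_idx,
                        int out_x, int out_y, int out_z, int stride) {
  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
  return box_idx * out_x * out_y * out_z * stride + offset_base * stride;
}
// e.g. pooled_features += voxel_offset(...) + channel_idx; mirrors the
// pointer arithmetic in the kernels above.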
...
...
@@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
-
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
-                       offset_base * max_pts_each_voxel;
-  pooled_features += box_idx * out_x * out_y * out_z * channels +
-                     offset_base * channels + channel_idx;
-
-  float sum_val = 0;
-  int total_pts = pts_idx_of_voxels[0];
-
-  for (int k = 1; k <= total_pts; k++) {
-    sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
-  }
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
+    pooled_features += box_idx * out_x * out_y * out_z * channels +
+                       offset_base * channels + channel_idx;
+
+    float sum_val = 0;
+    int total_pts = pts_idx_of_voxels[0];
+
+    for (int k = 1; k <= total_pts; k++) {
+      sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+    }
 
-  if (total_pts > 0) {
-    pooled_features[0] = sum_val / total_pts;
+    if (total_pts > 0) {
+      pooled_features[0] = sum_val / total_pts;
+    }
   }
 }
...
...
@@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
-
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-  argmax += box_idx * out_x * out_y * out_z * channels +
-            offset_base * channels + channel_idx;
-  grad_out += box_idx * out_x * out_y * out_z * channels +
-              offset_base * channels + channel_idx;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    argmax += box_idx * out_x * out_y * out_z * channels +
+              offset_base * channels + channel_idx;
+    grad_out += box_idx * out_x * out_y * out_z * channels +
+                offset_base * channels + channel_idx;
 
-  if (argmax[0] == -1) return;
+    if (argmax[0] == -1) return;
 
-  atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+  }
 }
 
 template <typename T>
...
...
@@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
   int box_idx = blockIdx.z;
   int channel_idx = blockIdx.y;
-  int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x;
-
-  int x_idx = voxel_idx_flat / (out_y * out_z);
-  int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
-  int z_idx = voxel_idx_flat % out_z;
-  if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x ||
-      y_idx >= out_y || z_idx >= out_z)
-    return;
-
-  int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
-  pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
-                       offset_base * max_pts_each_voxel;
-  grad_out += box_idx * out_x * out_y * out_z * channels +
-              offset_base * channels + channel_idx;
-
-  int total_pts = pts_idx_of_voxels[0];
-  float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
-  for (int k = 1; k <= total_pts; k++) {
-    atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
-              grad_out[0] * cur_grad);
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
+    grad_out += box_idx * out_x * out_y * out_z * channels +
+                offset_base * channels + channel_idx;
+
+    int total_pts = pts_idx_of_voxels[0];
+    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
+    for (int k = 1; k <= total_pts; k++) {
+      atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
+                grad_out[0] * cur_grad);
+    }
   }
 }
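Note: where the max-pool backward routes the whole gradient to the single argmax point, the average-pool backward splits the incoming gradient equally among the total_pts contributing points, with a guard against empty voxels. A scalar sketch of the rule (illustrative, not part of the file):

#include <cmath>

// Each contributing point receives grad_out / max(total_pts, 1);
// e.g. total_pts = 4 and grad_out = 1.0f gives each point 0.25f.
float avg_pool_grad_share(float grad_out, int total_pts) {
  return grad_out * (1.0f / fmaxf((float)total_pts, 1.0f));
}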
...
...