Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
detectron2
Commits
c732df65
Commit
c732df65
authored
Jan 18, 2024
by
limm
Browse files
push v0.1.3 version commit bd2ea47
parent
5b3792fc
Pipeline
#706
failed with stages
in 0 seconds
Changes
424
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
6617 additions
and
0 deletions
+6617
-0
detectron2/layers/csrc/ROIAlign/ROIAlign.h
detectron2/layers/csrc/ROIAlign/ROIAlign.h
+130
-0
detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
+508
-0
detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
+430
-0
detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
+115
-0
detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
...tron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
+522
-0
detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
...tron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
+443
-0
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
+35
-0
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
...tron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
+39
-0
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
...tron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
+130
-0
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
...tron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
+363
-0
detectron2/layers/csrc/cuda_version.cu
detectron2/layers/csrc/cuda_version.cu
+9
-0
detectron2/layers/csrc/deformable/deform_conv.h
detectron2/layers/csrc/deformable/deform_conv.h
+377
-0
detectron2/layers/csrc/deformable/deform_conv_cuda.cu
detectron2/layers/csrc/deformable/deform_conv_cuda.cu
+1131
-0
detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
+1288
-0
detectron2/layers/csrc/nms_rotated/nms_rotated.h
detectron2/layers/csrc/nms_rotated/nms_rotated.h
+39
-0
detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
+75
-0
detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
+139
-0
detectron2/layers/csrc/vision.cpp
detectron2/layers/csrc/vision.cpp
+102
-0
detectron2/layers/deform_conv.py
detectron2/layers/deform_conv.py
+494
-0
detectron2/layers/mask_ops.py
detectron2/layers/mask_ops.py
+248
-0
No files found.
Too many changes to show.
To preserve performance only
424 of 424+
files are displayed.
Plain diff
Email patch
detectron2/layers/csrc/ROIAlign/ROIAlign.h
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>
namespace
detectron2
{
at
::
Tensor
ROIAlign_forward_cpu
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
bool
aligned
);
at
::
Tensor
ROIAlign_backward_cpu
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
,
bool
aligned
);
#ifdef WITH_CUDA
at
::
Tensor
ROIAlign_forward_cuda
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
bool
aligned
);
at
::
Tensor
ROIAlign_backward_cuda
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
,
bool
aligned
);
#endif
// Interface for Python
inline
at
::
Tensor
ROIAlign_forward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
bool
aligned
)
{
if
(
input
.
is_cuda
())
{
#ifdef WITH_CUDA
return
ROIAlign_forward_cuda
(
input
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
sampling_ratio
,
aligned
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
return
ROIAlign_forward_cpu
(
input
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
sampling_ratio
,
aligned
);
}
inline
at
::
Tensor
ROIAlign_backward
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
,
bool
aligned
)
{
if
(
grad
.
is_cuda
())
{
#ifdef WITH_CUDA
return
ROIAlign_backward_cuda
(
grad
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
batch_size
,
channels
,
height
,
width
,
sampling_ratio
,
aligned
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
return
ROIAlign_backward_cpu
(
grad
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
batch_size
,
channels
,
height
,
width
,
sampling_ratio
,
aligned
);
}
}
// namespace detectron2
detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/TensorUtils.h>
#include "ROIAlign.h"
namespace
{
// implementation taken from Caffe2
template
<
typename
T
>
struct
PreCalc
{
int
pos1
;
int
pos2
;
int
pos3
;
int
pos4
;
T
w1
;
T
w2
;
T
w3
;
T
w4
;
};
template
<
typename
T
>
void
pre_calc_for_bilinear_interpolate
(
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
iy_upper
,
const
int
ix_upper
,
T
roi_start_h
,
T
roi_start_w
,
T
bin_size_h
,
T
bin_size_w
,
int
roi_bin_grid_h
,
int
roi_bin_grid_w
,
std
::
vector
<
PreCalc
<
T
>>&
pre_calc
)
{
int
pre_calc_index
=
0
;
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
for
(
int
iy
=
0
;
iy
<
iy_upper
;
iy
++
)
{
const
T
yy
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
T
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
T
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
ix_upper
;
ix
++
)
{
const
T
xx
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
T
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
T
>
(
roi_bin_grid_w
);
T
x
=
xx
;
T
y
=
yy
;
// deal with: inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
PreCalc
<
T
>
pc
;
pc
.
pos1
=
0
;
pc
.
pos2
=
0
;
pc
.
pos3
=
0
;
pc
.
pos4
=
0
;
pc
.
w1
=
0
;
pc
.
w2
=
0
;
pc
.
w3
=
0
;
pc
.
w4
=
0
;
pre_calc
[
pre_calc_index
]
=
pc
;
pre_calc_index
+=
1
;
continue
;
}
if
(
y
<=
0
)
{
y
=
0
;
}
if
(
x
<=
0
)
{
x
=
0
;
}
int
y_low
=
(
int
)
y
;
int
x_low
=
(
int
)
x
;
int
y_high
;
int
x_high
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
T
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
// save weights and indices
PreCalc
<
T
>
pc
;
pc
.
pos1
=
y_low
*
width
+
x_low
;
pc
.
pos2
=
y_low
*
width
+
x_high
;
pc
.
pos3
=
y_high
*
width
+
x_low
;
pc
.
pos4
=
y_high
*
width
+
x_high
;
pc
.
w1
=
w1
;
pc
.
w2
=
w2
;
pc
.
w3
=
w3
;
pc
.
w4
=
w4
;
pre_calc
[
pre_calc_index
]
=
pc
;
pre_calc_index
+=
1
;
}
}
}
}
}
template
<
typename
T
>
void
ROIAlignForward
(
const
int
nthreads
,
const
T
*
input
,
const
T
&
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
const
T
*
rois
,
T
*
output
,
bool
aligned
)
{
int
n_rois
=
nthreads
/
channels
/
pooled_width
/
pooled_height
;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for
(
int
n
=
0
;
n
<
n_rois
;
n
++
)
{
int
index_n
=
n
*
channels
*
pooled_width
*
pooled_height
;
const
T
*
offset_rois
=
rois
+
n
*
5
;
int
roi_batch_ind
=
offset_rois
[
0
];
// Do not use rounding; this implementation detail is critical
T
offset
=
aligned
?
(
T
)
0.5
:
(
T
)
0.0
;
T
roi_start_w
=
offset_rois
[
1
]
*
spatial_scale
-
offset
;
T
roi_start_h
=
offset_rois
[
2
]
*
spatial_scale
-
offset
;
T
roi_end_w
=
offset_rois
[
3
]
*
spatial_scale
-
offset
;
T
roi_end_h
=
offset_rois
[
4
]
*
spatial_scale
-
offset
;
T
roi_width
=
roi_end_w
-
roi_start_w
;
T
roi_height
=
roi_end_h
-
roi_start_h
;
if
(
aligned
)
{
AT_ASSERTM
(
roi_width
>=
0
&&
roi_height
>=
0
,
"ROIs in ROIAlign cannot have non-negative size!"
);
}
else
{
// for backward-compatibility only
roi_width
=
std
::
max
(
roi_width
,
(
T
)
1.
);
roi_height
=
std
::
max
(
roi_height
,
(
T
)
1.
);
}
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// We do average (integral) pooling inside a bin
// When the grid is empty, output zeros == 0/1, instead of NaN.
const
T
count
=
std
::
max
(
roi_bin_grid_h
*
roi_bin_grid_w
,
1
);
// e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std
::
vector
<
PreCalc
<
T
>>
pre_calc
(
roi_bin_grid_h
*
roi_bin_grid_w
*
pooled_width
*
pooled_height
);
pre_calc_for_bilinear_interpolate
(
height
,
width
,
pooled_height
,
pooled_width
,
roi_bin_grid_h
,
roi_bin_grid_w
,
roi_start_h
,
roi_start_w
,
bin_size_h
,
bin_size_w
,
roi_bin_grid_h
,
roi_bin_grid_w
,
pre_calc
);
for
(
int
c
=
0
;
c
<
channels
;
c
++
)
{
int
index_n_c
=
index_n
+
c
*
pooled_width
*
pooled_height
;
const
T
*
offset_input
=
input
+
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
int
pre_calc_index
=
0
;
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
int
index
=
index_n_c
+
ph
*
pooled_width
+
pw
;
T
output_val
=
0.
;
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
{
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
PreCalc
<
T
>
pc
=
pre_calc
[
pre_calc_index
];
output_val
+=
pc
.
w1
*
offset_input
[
pc
.
pos1
]
+
pc
.
w2
*
offset_input
[
pc
.
pos2
]
+
pc
.
w3
*
offset_input
[
pc
.
pos3
]
+
pc
.
w4
*
offset_input
[
pc
.
pos4
];
pre_calc_index
+=
1
;
}
}
output_val
/=
count
;
output
[
index
]
=
output_val
;
}
// for pw
}
// for ph
}
// for c
}
// for n
}
template
<
typename
T
>
void
bilinear_interpolate_gradient
(
const
int
height
,
const
int
width
,
T
y
,
T
x
,
T
&
w1
,
T
&
w2
,
T
&
w3
,
T
&
w4
,
int
&
x_low
,
int
&
x_high
,
int
&
y_low
,
int
&
y_high
,
const
int
index
/* index for debug only*/
)
{
// deal with cases that inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
w1
=
w2
=
w3
=
w4
=
0.
;
x_low
=
x_high
=
y_low
=
y_high
=
-
1
;
return
;
}
if
(
y
<=
0
)
y
=
0
;
if
(
x
<=
0
)
x
=
0
;
y_low
=
(
int
)
y
;
x_low
=
(
int
)
x
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
return
;
}
template
<
class
T
>
inline
void
add
(
T
*
address
,
const
T
&
val
)
{
*
address
+=
val
;
}
template
<
typename
T
>
void
ROIAlignBackward
(
const
int
nthreads
,
// may not be contiguous, and should be indexed using n_stride, etc
const
T
*
grad_output
,
const
T
&
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
T
*
grad_input
,
const
T
*
rois
,
const
int
n_stride
,
const
int
c_stride
,
const
int
h_stride
,
const
int
w_stride
,
bool
aligned
)
{
for
(
int
index
=
0
;
index
<
nthreads
;
index
++
)
{
// (n, c, ph, pw) is an element in the pooled output
int
pw
=
index
%
pooled_width
;
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
int
c
=
(
index
/
pooled_width
/
pooled_height
)
%
channels
;
int
n
=
index
/
pooled_width
/
pooled_height
/
channels
;
const
T
*
offset_rois
=
rois
+
n
*
5
;
int
roi_batch_ind
=
offset_rois
[
0
];
// Do not use rounding; this implementation detail is critical
T
offset
=
aligned
?
(
T
)
0.5
:
(
T
)
0.0
;
T
roi_start_w
=
offset_rois
[
1
]
*
spatial_scale
-
offset
;
T
roi_start_h
=
offset_rois
[
2
]
*
spatial_scale
-
offset
;
T
roi_end_w
=
offset_rois
[
3
]
*
spatial_scale
-
offset
;
T
roi_end_h
=
offset_rois
[
4
]
*
spatial_scale
-
offset
;
T
roi_width
=
roi_end_w
-
roi_start_w
;
T
roi_height
=
roi_end_h
-
roi_start_h
;
if
(
aligned
)
{
AT_ASSERTM
(
roi_width
>=
0
&&
roi_height
>=
0
,
"ROIs in ROIAlign do not have non-negative size!"
);
}
else
{
// for backward-compatibility only
roi_width
=
std
::
max
(
roi_width
,
(
T
)
1.
);
roi_height
=
std
::
max
(
roi_height
,
(
T
)
1.
);
}
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
T
*
offset_grad_input
=
grad_input
+
((
roi_batch_ind
*
channels
+
c
)
*
height
*
width
);
int
output_offset
=
n
*
n_stride
+
c
*
c_stride
;
const
T
*
offset_grad_output
=
grad_output
+
output_offset
;
const
T
grad_output_this_bin
=
offset_grad_output
[
ph
*
h_stride
+
pw
*
w_stride
];
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// We do average (integral) pooling inside a bin
const
T
count
=
roi_bin_grid_h
*
roi_bin_grid_w
;
// e.g. = 4
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
{
const
T
y
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
T
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
T
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
const
T
x
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
T
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
T
>
(
roi_bin_grid_w
);
T
w1
,
w2
,
w3
,
w4
;
int
x_low
,
x_high
,
y_low
,
y_high
;
bilinear_interpolate_gradient
(
height
,
width
,
y
,
x
,
w1
,
w2
,
w3
,
w4
,
x_low
,
x_high
,
y_low
,
y_high
,
index
);
T
g1
=
grad_output_this_bin
*
w1
/
count
;
T
g2
=
grad_output_this_bin
*
w2
/
count
;
T
g3
=
grad_output_this_bin
*
w3
/
count
;
T
g4
=
grad_output_this_bin
*
w4
/
count
;
if
(
x_low
>=
0
&&
x_high
>=
0
&&
y_low
>=
0
&&
y_high
>=
0
)
{
// atomic add is not needed for now since it is single threaded
add
(
offset_grad_input
+
y_low
*
width
+
x_low
,
static_cast
<
T
>
(
g1
));
add
(
offset_grad_input
+
y_low
*
width
+
x_high
,
static_cast
<
T
>
(
g2
));
add
(
offset_grad_input
+
y_high
*
width
+
x_low
,
static_cast
<
T
>
(
g3
));
add
(
offset_grad_input
+
y_high
*
width
+
x_high
,
static_cast
<
T
>
(
g4
));
}
// if
}
// ix
}
// iy
}
// for
}
// ROIAlignBackward
}
// namespace
namespace
detectron2
{
at
::
Tensor
ROIAlign_forward_cpu
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
bool
aligned
)
{
AT_ASSERTM
(
input
.
device
().
is_cpu
(),
"input must be a CPU tensor"
);
AT_ASSERTM
(
rois
.
device
().
is_cpu
(),
"rois must be a CPU tensor"
);
at
::
TensorArg
input_t
{
input
,
"input"
,
1
},
rois_t
{
rois
,
"rois"
,
2
};
at
::
CheckedFrom
c
=
"ROIAlign_forward_cpu"
;
at
::
checkAllSameType
(
c
,
{
input_t
,
rois_t
});
auto
num_rois
=
rois
.
size
(
0
);
auto
channels
=
input
.
size
(
1
);
auto
height
=
input
.
size
(
2
);
auto
width
=
input
.
size
(
3
);
at
::
Tensor
output
=
at
::
zeros
(
{
num_rois
,
channels
,
pooled_height
,
pooled_width
},
input
.
options
());
auto
output_size
=
num_rois
*
pooled_height
*
pooled_width
*
channels
;
if
(
output
.
numel
()
==
0
)
return
output
;
auto
input_
=
input
.
contiguous
(),
rois_
=
rois
.
contiguous
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
input
.
scalar_type
(),
"ROIAlign_forward"
,
[
&
]
{
ROIAlignForward
<
scalar_t
>
(
output_size
,
input_
.
data_ptr
<
scalar_t
>
(),
spatial_scale
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
sampling_ratio
,
rois_
.
data_ptr
<
scalar_t
>
(),
output
.
data_ptr
<
scalar_t
>
(),
aligned
);
});
return
output
;
}
at
::
Tensor
ROIAlign_backward_cpu
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
,
bool
aligned
)
{
AT_ASSERTM
(
grad
.
device
().
is_cpu
(),
"grad must be a CPU tensor"
);
AT_ASSERTM
(
rois
.
device
().
is_cpu
(),
"rois must be a CPU tensor"
);
at
::
TensorArg
grad_t
{
grad
,
"grad"
,
1
},
rois_t
{
rois
,
"rois"
,
2
};
at
::
CheckedFrom
c
=
"ROIAlign_backward_cpu"
;
at
::
checkAllSameType
(
c
,
{
grad_t
,
rois_t
});
at
::
Tensor
grad_input
=
at
::
zeros
({
batch_size
,
channels
,
height
,
width
},
grad
.
options
());
// handle possibly empty gradients
if
(
grad
.
numel
()
==
0
)
{
return
grad_input
;
}
// get stride values to ensure indexing into gradients is correct.
int
n_stride
=
grad
.
stride
(
0
);
int
c_stride
=
grad
.
stride
(
1
);
int
h_stride
=
grad
.
stride
(
2
);
int
w_stride
=
grad
.
stride
(
3
);
auto
rois_
=
rois
.
contiguous
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
grad
.
scalar_type
(),
"ROIAlign_forward"
,
[
&
]
{
ROIAlignBackward
<
scalar_t
>
(
grad
.
numel
(),
grad
.
data_ptr
<
scalar_t
>
(),
spatial_scale
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
sampling_ratio
,
grad_input
.
data_ptr
<
scalar_t
>
(),
rois_
.
data_ptr
<
scalar_t
>
(),
n_stride
,
c_stride
,
h_stride
,
w_stride
,
aligned
);
});
return
grad_input
;
}
}
// namespace detectron2
detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template
<
typename
T
>
__device__
T
bilinear_interpolate
(
const
T
*
bottom_data
,
const
int
height
,
const
int
width
,
T
y
,
T
x
,
const
int
index
/* index for debug only*/
)
{
// deal with cases that inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
return
0
;
}
if
(
y
<=
0
)
y
=
0
;
if
(
x
<=
0
)
x
=
0
;
int
y_low
=
(
int
)
y
;
int
x_low
=
(
int
)
x
;
int
y_high
;
int
x_high
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
// do bilinear interpolation
T
v1
=
bottom_data
[
y_low
*
width
+
x_low
];
T
v2
=
bottom_data
[
y_low
*
width
+
x_high
];
T
v3
=
bottom_data
[
y_high
*
width
+
x_low
];
T
v4
=
bottom_data
[
y_high
*
width
+
x_high
];
T
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
T
val
=
(
w1
*
v1
+
w2
*
v2
+
w3
*
v3
+
w4
*
v4
);
return
val
;
}
template
<
typename
T
>
__global__
void
RoIAlignForward
(
const
int
nthreads
,
const
T
*
bottom_data
,
const
T
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
const
T
*
bottom_rois
,
T
*
top_data
,
bool
aligned
)
{
CUDA_1D_KERNEL_LOOP
(
index
,
nthreads
)
{
// (n, c, ph, pw) is an element in the pooled output
int
pw
=
index
%
pooled_width
;
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
int
c
=
(
index
/
pooled_width
/
pooled_height
)
%
channels
;
int
n
=
index
/
pooled_width
/
pooled_height
/
channels
;
const
T
*
offset_bottom_rois
=
bottom_rois
+
n
*
5
;
int
roi_batch_ind
=
offset_bottom_rois
[
0
];
// Do not use rounding; this implementation detail is critical
T
offset
=
aligned
?
(
T
)
0.5
:
(
T
)
0.0
;
T
roi_start_w
=
offset_bottom_rois
[
1
]
*
spatial_scale
-
offset
;
T
roi_start_h
=
offset_bottom_rois
[
2
]
*
spatial_scale
-
offset
;
T
roi_end_w
=
offset_bottom_rois
[
3
]
*
spatial_scale
-
offset
;
T
roi_end_h
=
offset_bottom_rois
[
4
]
*
spatial_scale
-
offset
;
T
roi_width
=
roi_end_w
-
roi_start_w
;
T
roi_height
=
roi_end_h
-
roi_start_h
;
if
(
!
aligned
)
{
// for backward-compatibility only
roi_width
=
max
(
roi_width
,
(
T
)
1.
);
roi_height
=
max
(
roi_height
,
(
T
)
1.
);
}
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
const
T
*
offset_bottom_data
=
bottom_data
+
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// We do average (integral) pooling inside a bin
// When the grid is empty, output zeros == 0/1, instead of NaN.
const
T
count
=
max
(
roi_bin_grid_h
*
roi_bin_grid_w
,
1
);
// e.g. = 4
T
output_val
=
0.
;
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
// e.g., iy = 0, 1
{
const
T
y
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
T
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
T
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
const
T
x
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
T
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
T
>
(
roi_bin_grid_w
);
T
val
=
bilinear_interpolate
(
offset_bottom_data
,
height
,
width
,
y
,
x
,
index
);
output_val
+=
val
;
}
}
output_val
/=
count
;
top_data
[
index
]
=
output_val
;
}
}
template
<
typename
T
>
__device__
void
bilinear_interpolate_gradient
(
const
int
height
,
const
int
width
,
T
y
,
T
x
,
T
&
w1
,
T
&
w2
,
T
&
w3
,
T
&
w4
,
int
&
x_low
,
int
&
x_high
,
int
&
y_low
,
int
&
y_high
,
const
int
index
/* index for debug only*/
)
{
// deal with cases that inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
w1
=
w2
=
w3
=
w4
=
0.
;
x_low
=
x_high
=
y_low
=
y_high
=
-
1
;
return
;
}
if
(
y
<=
0
)
y
=
0
;
if
(
x
<=
0
)
x
=
0
;
y_low
=
(
int
)
y
;
x_low
=
(
int
)
x
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
// reference in forward
// T v1 = bottom_data[y_low * width + x_low];
// T v2 = bottom_data[y_low * width + x_high];
// T v3 = bottom_data[y_high * width + x_low];
// T v4 = bottom_data[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
return
;
}
template
<
typename
T
>
__global__
void
RoIAlignBackwardFeature
(
const
int
nthreads
,
const
T
*
top_diff
,
const
int
num_rois
,
const
T
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
T
*
bottom_diff
,
const
T
*
bottom_rois
,
bool
aligned
)
{
CUDA_1D_KERNEL_LOOP
(
index
,
nthreads
)
{
// (n, c, ph, pw) is an element in the pooled output
int
pw
=
index
%
pooled_width
;
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
int
c
=
(
index
/
pooled_width
/
pooled_height
)
%
channels
;
int
n
=
index
/
pooled_width
/
pooled_height
/
channels
;
const
T
*
offset_bottom_rois
=
bottom_rois
+
n
*
5
;
int
roi_batch_ind
=
offset_bottom_rois
[
0
];
// Do not use rounding; this implementation detail is critical
T
offset
=
aligned
?
(
T
)
0.5
:
(
T
)
0.0
;
T
roi_start_w
=
offset_bottom_rois
[
1
]
*
spatial_scale
-
offset
;
T
roi_start_h
=
offset_bottom_rois
[
2
]
*
spatial_scale
-
offset
;
T
roi_end_w
=
offset_bottom_rois
[
3
]
*
spatial_scale
-
offset
;
T
roi_end_h
=
offset_bottom_rois
[
4
]
*
spatial_scale
-
offset
;
T
roi_width
=
roi_end_w
-
roi_start_w
;
T
roi_height
=
roi_end_h
-
roi_start_h
;
if
(
!
aligned
)
{
// for backward-compatibility only
roi_width
=
max
(
roi_width
,
(
T
)
1.
);
roi_height
=
max
(
roi_height
,
(
T
)
1.
);
}
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
T
*
offset_bottom_diff
=
bottom_diff
+
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
int
top_offset
=
(
n
*
channels
+
c
)
*
pooled_height
*
pooled_width
;
const
T
*
offset_top_diff
=
top_diff
+
top_offset
;
const
T
top_diff_this_bin
=
offset_top_diff
[
ph
*
pooled_width
+
pw
];
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// We do average (integral) pooling inside a bin
const
T
count
=
roi_bin_grid_h
*
roi_bin_grid_w
;
// e.g. = 4
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
// e.g., iy = 0, 1
{
const
T
y
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
T
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
T
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
const
T
x
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
T
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
T
>
(
roi_bin_grid_w
);
T
w1
,
w2
,
w3
,
w4
;
int
x_low
,
x_high
,
y_low
,
y_high
;
bilinear_interpolate_gradient
(
height
,
width
,
y
,
x
,
w1
,
w2
,
w3
,
w4
,
x_low
,
x_high
,
y_low
,
y_high
,
index
);
T
g1
=
top_diff_this_bin
*
w1
/
count
;
T
g2
=
top_diff_this_bin
*
w2
/
count
;
T
g3
=
top_diff_this_bin
*
w3
/
count
;
T
g4
=
top_diff_this_bin
*
w4
/
count
;
if
(
x_low
>=
0
&&
x_high
>=
0
&&
y_low
>=
0
&&
y_high
>=
0
)
{
atomicAdd
(
offset_bottom_diff
+
y_low
*
width
+
x_low
,
static_cast
<
T
>
(
g1
));
atomicAdd
(
offset_bottom_diff
+
y_low
*
width
+
x_high
,
static_cast
<
T
>
(
g2
));
atomicAdd
(
offset_bottom_diff
+
y_high
*
width
+
x_low
,
static_cast
<
T
>
(
g3
));
atomicAdd
(
offset_bottom_diff
+
y_high
*
width
+
x_high
,
static_cast
<
T
>
(
g4
));
}
// if
}
// ix
}
// iy
}
// CUDA_1D_KERNEL_LOOP
}
// RoIAlignBackward
namespace
detectron2
{
at
::
Tensor
ROIAlign_forward_cuda
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
bool
aligned
)
{
AT_ASSERTM
(
input
.
device
().
is_cuda
(),
"input must be a CUDA tensor"
);
AT_ASSERTM
(
rois
.
device
().
is_cuda
(),
"rois must be a CUDA tensor"
);
at
::
TensorArg
input_t
{
input
,
"input"
,
1
},
rois_t
{
rois
,
"rois"
,
2
};
at
::
CheckedFrom
c
=
"ROIAlign_forward_cuda"
;
at
::
checkAllSameGPU
(
c
,
{
input_t
,
rois_t
});
at
::
checkAllSameType
(
c
,
{
input_t
,
rois_t
});
at
::
cuda
::
CUDAGuard
device_guard
(
input
.
device
());
auto
num_rois
=
rois
.
size
(
0
);
auto
channels
=
input
.
size
(
1
);
auto
height
=
input
.
size
(
2
);
auto
width
=
input
.
size
(
3
);
auto
output
=
at
::
empty
(
{
num_rois
,
channels
,
pooled_height
,
pooled_width
},
input
.
options
());
auto
output_size
=
num_rois
*
pooled_height
*
pooled_width
*
channels
;
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
dim3
grid
(
std
::
min
(
at
::
cuda
::
ATenCeilDiv
(
static_cast
<
int64_t
>
(
output_size
),
static_cast
<
int64_t
>
(
512
)),
static_cast
<
int64_t
>
(
4096
)));
dim3
block
(
512
);
if
(
output
.
numel
()
==
0
)
{
AT_CUDA_CHECK
(
cudaGetLastError
());
return
output
;
}
auto
input_
=
input
.
contiguous
(),
rois_
=
rois
.
contiguous
();
AT_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"ROIAlign_forward"
,
[
&
]
{
RoIAlignForward
<
scalar_t
><<<
grid
,
block
,
0
,
stream
>>>
(
output_size
,
input_
.
data_ptr
<
scalar_t
>
(),
spatial_scale
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
sampling_ratio
,
rois_
.
data_ptr
<
scalar_t
>
(),
output
.
data_ptr
<
scalar_t
>
(),
aligned
);
});
cudaDeviceSynchronize
();
AT_CUDA_CHECK
(
cudaGetLastError
());
return
output
;
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at
::
Tensor
ROIAlign_backward_cuda
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
,
bool
aligned
)
{
AT_ASSERTM
(
grad
.
device
().
is_cuda
(),
"grad must be a CUDA tensor"
);
AT_ASSERTM
(
rois
.
device
().
is_cuda
(),
"rois must be a CUDA tensor"
);
at
::
TensorArg
grad_t
{
grad
,
"grad"
,
1
},
rois_t
{
rois
,
"rois"
,
2
};
at
::
CheckedFrom
c
=
"ROIAlign_backward_cuda"
;
at
::
checkAllSameGPU
(
c
,
{
grad_t
,
rois_t
});
at
::
checkAllSameType
(
c
,
{
grad_t
,
rois_t
});
at
::
cuda
::
CUDAGuard
device_guard
(
grad
.
device
());
auto
num_rois
=
rois
.
size
(
0
);
auto
grad_input
=
at
::
zeros
({
batch_size
,
channels
,
height
,
width
},
grad
.
options
());
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
dim3
grid
(
std
::
min
(
at
::
cuda
::
ATenCeilDiv
(
static_cast
<
int64_t
>
(
grad
.
numel
()),
static_cast
<
int64_t
>
(
512
)),
static_cast
<
int64_t
>
(
4096
)));
dim3
block
(
512
);
// handle possibly empty gradients
if
(
grad
.
numel
()
==
0
)
{
AT_CUDA_CHECK
(
cudaGetLastError
());
return
grad_input
;
}
auto
grad_
=
grad
.
contiguous
(),
rois_
=
rois
.
contiguous
();
AT_DISPATCH_FLOATING_TYPES
(
grad
.
scalar_type
(),
"ROIAlign_backward"
,
[
&
]
{
RoIAlignBackwardFeature
<
scalar_t
><<<
grid
,
block
,
0
,
stream
>>>
(
grad
.
numel
(),
grad_
.
data_ptr
<
scalar_t
>
(),
num_rois
,
spatial_scale
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
sampling_ratio
,
grad_input
.
data_ptr
<
scalar_t
>
(),
rois_
.
data_ptr
<
scalar_t
>
(),
aligned
);
});
AT_CUDA_CHECK
(
cudaGetLastError
());
return
grad_input
;
}
}
// namespace detectron2
detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>
namespace
detectron2
{
at
::
Tensor
ROIAlignRotated_forward_cpu
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
);
at
::
Tensor
ROIAlignRotated_backward_cpu
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
);
#ifdef WITH_CUDA
at
::
Tensor
ROIAlignRotated_forward_cuda
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
);
at
::
Tensor
ROIAlignRotated_backward_cuda
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
);
#endif
// Interface for Python
inline
at
::
Tensor
ROIAlignRotated_forward
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
)
{
if
(
input
.
is_cuda
())
{
#ifdef WITH_CUDA
return
ROIAlignRotated_forward_cuda
(
input
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
sampling_ratio
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
return
ROIAlignRotated_forward_cpu
(
input
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
sampling_ratio
);
}
inline
at
::
Tensor
ROIAlignRotated_backward
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
)
{
if
(
grad
.
is_cuda
())
{
#ifdef WITH_CUDA
return
ROIAlignRotated_backward_cuda
(
grad
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
batch_size
,
channels
,
height
,
width
,
sampling_ratio
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
return
ROIAlignRotated_backward_cpu
(
grad
,
rois
,
spatial_scale
,
pooled_height
,
pooled_width
,
batch_size
,
channels
,
height
,
width
,
sampling_ratio
);
}
}
// namespace detectron2
detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/TensorUtils.h>
#include "ROIAlignRotated.h"
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace
detectron2
{
namespace
{
template
<
typename
T
>
struct
PreCalc
{
int
pos1
;
int
pos2
;
int
pos3
;
int
pos4
;
T
w1
;
T
w2
;
T
w3
;
T
w4
;
};
template
<
typename
T
>
void
pre_calc_for_bilinear_interpolate
(
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
iy_upper
,
const
int
ix_upper
,
T
roi_start_h
,
T
roi_start_w
,
T
bin_size_h
,
T
bin_size_w
,
int
roi_bin_grid_h
,
int
roi_bin_grid_w
,
T
roi_center_h
,
T
roi_center_w
,
T
cos_theta
,
T
sin_theta
,
std
::
vector
<
PreCalc
<
T
>>&
pre_calc
)
{
int
pre_calc_index
=
0
;
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
for
(
int
iy
=
0
;
iy
<
iy_upper
;
iy
++
)
{
const
T
yy
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
T
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
T
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
ix_upper
;
ix
++
)
{
const
T
xx
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
T
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
T
>
(
roi_bin_grid_w
);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
T
y
=
yy
*
cos_theta
-
xx
*
sin_theta
+
roi_center_h
;
T
x
=
yy
*
sin_theta
+
xx
*
cos_theta
+
roi_center_w
;
// deal with: inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
PreCalc
<
T
>
pc
;
pc
.
pos1
=
0
;
pc
.
pos2
=
0
;
pc
.
pos3
=
0
;
pc
.
pos4
=
0
;
pc
.
w1
=
0
;
pc
.
w2
=
0
;
pc
.
w3
=
0
;
pc
.
w4
=
0
;
pre_calc
[
pre_calc_index
]
=
pc
;
pre_calc_index
+=
1
;
continue
;
}
if
(
y
<
0
)
{
y
=
0
;
}
if
(
x
<
0
)
{
x
=
0
;
}
int
y_low
=
(
int
)
y
;
int
x_low
=
(
int
)
x
;
int
y_high
;
int
x_high
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
T
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
// save weights and indices
PreCalc
<
T
>
pc
;
pc
.
pos1
=
y_low
*
width
+
x_low
;
pc
.
pos2
=
y_low
*
width
+
x_high
;
pc
.
pos3
=
y_high
*
width
+
x_low
;
pc
.
pos4
=
y_high
*
width
+
x_high
;
pc
.
w1
=
w1
;
pc
.
w2
=
w2
;
pc
.
w3
=
w3
;
pc
.
w4
=
w4
;
pre_calc
[
pre_calc_index
]
=
pc
;
pre_calc_index
+=
1
;
}
}
}
}
}
template
<
typename
T
>
void
bilinear_interpolate_gradient
(
const
int
height
,
const
int
width
,
T
y
,
T
x
,
T
&
w1
,
T
&
w2
,
T
&
w3
,
T
&
w4
,
int
&
x_low
,
int
&
x_high
,
int
&
y_low
,
int
&
y_high
)
{
// deal with cases that inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
w1
=
w2
=
w3
=
w4
=
0.
;
x_low
=
x_high
=
y_low
=
y_high
=
-
1
;
return
;
}
if
(
y
<
0
)
{
y
=
0
;
}
if
(
x
<
0
)
{
x
=
0
;
}
y_low
=
(
int
)
y
;
x_low
=
(
int
)
x
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
return
;
}
template
<
class
T
>
inline
void
add
(
T
*
address
,
const
T
&
val
)
{
*
address
+=
val
;
}
}
// namespace
template
<
typename
T
>
void
ROIAlignRotatedForward
(
const
int
nthreads
,
const
T
*
input
,
const
T
&
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
const
T
*
rois
,
T
*
output
)
{
int
n_rois
=
nthreads
/
channels
/
pooled_width
/
pooled_height
;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for
(
int
n
=
0
;
n
<
n_rois
;
n
++
)
{
int
index_n
=
n
*
channels
*
pooled_width
*
pooled_height
;
const
T
*
current_roi
=
rois
+
n
*
6
;
int
roi_batch_ind
=
current_roi
[
0
];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T
offset
=
(
T
)
0.5
;
T
roi_center_w
=
current_roi
[
1
]
*
spatial_scale
-
offset
;
T
roi_center_h
=
current_roi
[
2
]
*
spatial_scale
-
offset
;
T
roi_width
=
current_roi
[
3
]
*
spatial_scale
;
T
roi_height
=
current_roi
[
4
]
*
spatial_scale
;
T
theta
=
current_roi
[
5
]
*
M_PI
/
180.0
;
T
cos_theta
=
cos
(
theta
);
T
sin_theta
=
sin
(
theta
);
AT_ASSERTM
(
roi_width
>=
0
&&
roi_height
>=
0
,
"ROIs in ROIAlignRotated do not have non-negative size!"
);
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// We do average (integral) pooling inside a bin
const
T
count
=
std
::
max
(
roi_bin_grid_h
*
roi_bin_grid_w
,
1
);
// e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std
::
vector
<
PreCalc
<
T
>>
pre_calc
(
roi_bin_grid_h
*
roi_bin_grid_w
*
pooled_width
*
pooled_height
);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T
roi_start_h
=
-
roi_height
/
2.0
;
T
roi_start_w
=
-
roi_width
/
2.0
;
pre_calc_for_bilinear_interpolate
(
height
,
width
,
pooled_height
,
pooled_width
,
roi_bin_grid_h
,
roi_bin_grid_w
,
roi_start_h
,
roi_start_w
,
bin_size_h
,
bin_size_w
,
roi_bin_grid_h
,
roi_bin_grid_w
,
roi_center_h
,
roi_center_w
,
cos_theta
,
sin_theta
,
pre_calc
);
for
(
int
c
=
0
;
c
<
channels
;
c
++
)
{
int
index_n_c
=
index_n
+
c
*
pooled_width
*
pooled_height
;
const
T
*
offset_input
=
input
+
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
int
pre_calc_index
=
0
;
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
int
index
=
index_n_c
+
ph
*
pooled_width
+
pw
;
T
output_val
=
0.
;
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
{
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
PreCalc
<
T
>
pc
=
pre_calc
[
pre_calc_index
];
output_val
+=
pc
.
w1
*
offset_input
[
pc
.
pos1
]
+
pc
.
w2
*
offset_input
[
pc
.
pos2
]
+
pc
.
w3
*
offset_input
[
pc
.
pos3
]
+
pc
.
w4
*
offset_input
[
pc
.
pos4
];
pre_calc_index
+=
1
;
}
}
output_val
/=
count
;
output
[
index
]
=
output_val
;
}
// for pw
}
// for ph
}
// for c
}
// for n
}
template
<
typename
T
>
void
ROIAlignRotatedBackward
(
const
int
nthreads
,
// may not be contiguous. should index using n_stride, etc
const
T
*
grad_output
,
const
T
&
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
T
*
grad_input
,
const
T
*
rois
,
const
int
n_stride
,
const
int
c_stride
,
const
int
h_stride
,
const
int
w_stride
)
{
for
(
int
index
=
0
;
index
<
nthreads
;
index
++
)
{
// (n, c, ph, pw) is an element in the pooled output
int
pw
=
index
%
pooled_width
;
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
int
c
=
(
index
/
pooled_width
/
pooled_height
)
%
channels
;
int
n
=
index
/
pooled_width
/
pooled_height
/
channels
;
const
T
*
current_roi
=
rois
+
n
*
6
;
int
roi_batch_ind
=
current_roi
[
0
];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T
offset
=
(
T
)
0.5
;
T
roi_center_w
=
current_roi
[
1
]
*
spatial_scale
-
offset
;
T
roi_center_h
=
current_roi
[
2
]
*
spatial_scale
-
offset
;
T
roi_width
=
current_roi
[
3
]
*
spatial_scale
;
T
roi_height
=
current_roi
[
4
]
*
spatial_scale
;
T
theta
=
current_roi
[
5
]
*
M_PI
/
180.0
;
T
cos_theta
=
cos
(
theta
);
T
sin_theta
=
sin
(
theta
);
AT_ASSERTM
(
roi_width
>=
0
&&
roi_height
>=
0
,
"ROIs in ROIAlignRotated do not have non-negative size!"
);
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
T
*
offset_grad_input
=
grad_input
+
((
roi_batch_ind
*
channels
+
c
)
*
height
*
width
);
int
output_offset
=
n
*
n_stride
+
c
*
c_stride
;
const
T
*
offset_grad_output
=
grad_output
+
output_offset
;
const
T
grad_output_this_bin
=
offset_grad_output
[
ph
*
h_stride
+
pw
*
w_stride
];
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T
roi_start_h
=
-
roi_height
/
2.0
;
T
roi_start_w
=
-
roi_width
/
2.0
;
// We do average (integral) pooling inside a bin
const
T
count
=
roi_bin_grid_h
*
roi_bin_grid_w
;
// e.g. = 4
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
{
const
T
yy
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
T
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
T
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
const
T
xx
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
T
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
T
>
(
roi_bin_grid_w
);
// Rotate by theta around the center and translate
T
y
=
yy
*
cos_theta
-
xx
*
sin_theta
+
roi_center_h
;
T
x
=
yy
*
sin_theta
+
xx
*
cos_theta
+
roi_center_w
;
T
w1
,
w2
,
w3
,
w4
;
int
x_low
,
x_high
,
y_low
,
y_high
;
bilinear_interpolate_gradient
(
height
,
width
,
y
,
x
,
w1
,
w2
,
w3
,
w4
,
x_low
,
x_high
,
y_low
,
y_high
);
T
g1
=
grad_output_this_bin
*
w1
/
count
;
T
g2
=
grad_output_this_bin
*
w2
/
count
;
T
g3
=
grad_output_this_bin
*
w3
/
count
;
T
g4
=
grad_output_this_bin
*
w4
/
count
;
if
(
x_low
>=
0
&&
x_high
>=
0
&&
y_low
>=
0
&&
y_high
>=
0
)
{
// atomic add is not needed for now since it is single threaded
add
(
offset_grad_input
+
y_low
*
width
+
x_low
,
static_cast
<
T
>
(
g1
));
add
(
offset_grad_input
+
y_low
*
width
+
x_high
,
static_cast
<
T
>
(
g2
));
add
(
offset_grad_input
+
y_high
*
width
+
x_low
,
static_cast
<
T
>
(
g3
));
add
(
offset_grad_input
+
y_high
*
width
+
x_high
,
static_cast
<
T
>
(
g4
));
}
// if
}
// ix
}
// iy
}
// for
}
// ROIAlignRotatedBackward
at
::
Tensor
ROIAlignRotated_forward_cpu
(
const
at
::
Tensor
&
input
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
)
{
AT_ASSERTM
(
input
.
device
().
is_cpu
(),
"input must be a CPU tensor"
);
AT_ASSERTM
(
rois
.
device
().
is_cpu
(),
"rois must be a CPU tensor"
);
at
::
TensorArg
input_t
{
input
,
"input"
,
1
},
rois_t
{
rois
,
"rois"
,
2
};
at
::
CheckedFrom
c
=
"ROIAlign_forward_cpu"
;
at
::
checkAllSameType
(
c
,
{
input_t
,
rois_t
});
auto
num_rois
=
rois
.
size
(
0
);
auto
channels
=
input
.
size
(
1
);
auto
height
=
input
.
size
(
2
);
auto
width
=
input
.
size
(
3
);
at
::
Tensor
output
=
at
::
zeros
(
{
num_rois
,
channels
,
pooled_height
,
pooled_width
},
input
.
options
());
auto
output_size
=
num_rois
*
pooled_height
*
pooled_width
*
channels
;
if
(
output
.
numel
()
==
0
)
{
return
output
;
}
auto
input_
=
input
.
contiguous
(),
rois_
=
rois
.
contiguous
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
input
.
scalar_type
(),
"ROIAlignRotated_forward"
,
[
&
]
{
ROIAlignRotatedForward
<
scalar_t
>
(
output_size
,
input_
.
data_ptr
<
scalar_t
>
(),
spatial_scale
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
sampling_ratio
,
rois_
.
data_ptr
<
scalar_t
>
(),
output
.
data_ptr
<
scalar_t
>
());
});
return
output
;
}
at
::
Tensor
ROIAlignRotated_backward_cpu
(
const
at
::
Tensor
&
grad
,
const
at
::
Tensor
&
rois
,
const
float
spatial_scale
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
batch_size
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
sampling_ratio
)
{
AT_ASSERTM
(
grad
.
device
().
is_cpu
(),
"grad must be a CPU tensor"
);
AT_ASSERTM
(
rois
.
device
().
is_cpu
(),
"rois must be a CPU tensor"
);
at
::
TensorArg
grad_t
{
grad
,
"grad"
,
1
},
rois_t
{
rois
,
"rois"
,
2
};
at
::
CheckedFrom
c
=
"ROIAlignRotated_backward_cpu"
;
at
::
checkAllSameType
(
c
,
{
grad_t
,
rois_t
});
at
::
Tensor
grad_input
=
at
::
zeros
({
batch_size
,
channels
,
height
,
width
},
grad
.
options
());
// handle possibly empty gradients
if
(
grad
.
numel
()
==
0
)
{
return
grad_input
;
}
// get stride values to ensure indexing into gradients is correct.
int
n_stride
=
grad
.
stride
(
0
);
int
c_stride
=
grad
.
stride
(
1
);
int
h_stride
=
grad
.
stride
(
2
);
int
w_stride
=
grad
.
stride
(
3
);
auto
rois_
=
rois
.
contiguous
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
grad
.
scalar_type
(),
"ROIAlignRotated_forward"
,
[
&
]
{
ROIAlignRotatedBackward
<
scalar_t
>
(
grad
.
numel
(),
grad
.
data_ptr
<
scalar_t
>
(),
spatial_scale
,
channels
,
height
,
width
,
pooled_height
,
pooled_width
,
sampling_ratio
,
grad_input
.
data_ptr
<
scalar_t
>
(),
rois_
.
data_ptr
<
scalar_t
>
(),
n_stride
,
c_stride
,
h_stride
,
w_stride
);
});
return
grad_input
;
}
}
// namespace detectron2
detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
// and PyTorch ROIAlign (non-rotated) Op implementations.
// The key difference between this implementation and those ones is
// we don't do "legacy offset" in this version, as there aren't many previous
// works, if any, using the "legacy" ROIAlignRotated Op.
// This would make the interface a bit cleaner.
namespace
detectron2
{
namespace
{
template
<
typename
T
>
__device__
T
bilinear_interpolate
(
const
T
*
input
,
const
int
height
,
const
int
width
,
T
y
,
T
x
)
{
// deal with cases that inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
return
0
;
}
if
(
y
<
0
)
{
y
=
0
;
}
if
(
x
<
0
)
{
x
=
0
;
}
int
y_low
=
(
int
)
y
;
int
x_low
=
(
int
)
x
;
int
y_high
;
int
x_high
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
// do bilinear interpolation
T
v1
=
input
[
y_low
*
width
+
x_low
];
T
v2
=
input
[
y_low
*
width
+
x_high
];
T
v3
=
input
[
y_high
*
width
+
x_low
];
T
v4
=
input
[
y_high
*
width
+
x_high
];
T
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
T
val
=
(
w1
*
v1
+
w2
*
v2
+
w3
*
v3
+
w4
*
v4
);
return
val
;
}
template
<
typename
T
>
__device__
void
bilinear_interpolate_gradient
(
const
int
height
,
const
int
width
,
T
y
,
T
x
,
T
&
w1
,
T
&
w2
,
T
&
w3
,
T
&
w4
,
int
&
x_low
,
int
&
x_high
,
int
&
y_low
,
int
&
y_high
)
{
// deal with cases that inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
w1
=
w2
=
w3
=
w4
=
0.
;
x_low
=
x_high
=
y_low
=
y_high
=
-
1
;
return
;
}
if
(
y
<
0
)
{
y
=
0
;
}
if
(
x
<
0
)
{
x
=
0
;
}
y_low
=
(
int
)
y
;
x_low
=
(
int
)
x
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
return
;
}
}
// namespace
template
<
typename
T
>
__global__
void
RoIAlignRotatedForward
(
const
int
nthreads
,
const
T
*
input
,
const
T
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
const
T
*
rois
,
T
*
top_data
)
{
CUDA_1D_KERNEL_LOOP
(
index
,
nthreads
)
{
// (n, c, ph, pw) is an element in the pooled output
int
pw
=
index
%
pooled_width
;
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
int
c
=
(
index
/
pooled_width
/
pooled_height
)
%
channels
;
int
n
=
index
/
pooled_width
/
pooled_height
/
channels
;
const
T
*
current_roi
=
rois
+
n
*
6
;
int
roi_batch_ind
=
current_roi
[
0
];
// Do not use rounding; this implementation detail is critical
// ROIAlignRotated supports align == true, i.e., continuous coordinate
// by default, thus the 0.5 offset
T
offset
=
(
T
)
0.5
;
T
roi_center_w
=
current_roi
[
1
]
*
spatial_scale
-
offset
;
T
roi_center_h
=
current_roi
[
2
]
*
spatial_scale
-
offset
;
T
roi_width
=
current_roi
[
3
]
*
spatial_scale
;
T
roi_height
=
current_roi
[
4
]
*
spatial_scale
;
T
theta
=
current_roi
[
5
]
*
M_PI
/
180.0
;
T
cos_theta
=
cos
(
theta
);
T
sin_theta
=
sin
(
theta
);
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
const
T
*
offset_input
=
input
+
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
T
roi_start_h
=
-
roi_height
/
2.0
;
T
roi_start_w
=
-
roi_width
/
2.0
;
// We do average (inte gral) pooling inside a bin
const
T
count
=
max
(
        roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4

    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
    {
      const T yy = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T xx = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
        T x = yy * sin_theta + xx * cos_theta + roi_center_w;

        T val = bilinear_interpolate(offset_input, height, width, y, x);
        output_val += val;
      }
    }
    output_val /= count;

    top_data[index] = output_val;
  }
}

template <typename T>
__global__ void RoIAlignRotatedBackwardFeature(
    const int nthreads,
    const T* top_diff,
    const int num_rois,
    const T spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    T* bottom_diff,
    const T* rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    // ROIAlignRotated supports align == true, i.e., continuous coordinate
    // by default, thus the 0.5 offset
    T offset = (T)0.5;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5] * M_PI / 180.0;
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    T* offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;

    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const T* offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
    {
      const T yy = roi_start_h + ph * bin_size_h +
          static_cast<T>(iy + .5f) * bin_size_h /
              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T xx = roi_start_w + pw * bin_size_w +
            static_cast<T>(ix + .5f) * bin_size_w /
                static_cast<T>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
        T x = yy * sin_theta + xx * cos_theta + roi_center_w;

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;

        bilinear_interpolate_gradient(
            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);

        T g1 = top_diff_this_bin * w1 / count;
        T g2 = top_diff_this_bin * w2 / count;
        T g3 = top_diff_this_bin * w3 / count;
        T g4 = top_diff_this_bin * w4 / count;

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(
              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
          atomicAdd(
              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
          atomicAdd(
              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
          atomicAdd(
              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
        } // if
      } // ix
    } // iy
  } // CUDA_1D_KERNEL_LOOP
} // RoIAlignRotatedBackward

at::Tensor ROIAlignRotated_forward_cuda(
    const at::Tensor& input,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio) {
  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};

  at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
  at::checkAllSameGPU(c, {input_t, rois_t});
  at::checkAllSameType(c, {input_t, rois_t});
  at::cuda::CUDAGuard device_guard(input.device());

  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);
  auto width = input.size(3);

  auto output = at::empty(
      {num_rois, channels, pooled_height, pooled_width}, input.options());
  auto output_size = num_rois * pooled_height * pooled_width * channels;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(
      at::cuda::ATenCeilDiv(
          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
      static_cast<int64_t>(4096)));
  dim3 block(512);

  if (output.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return output;
  }

  auto input_ = input.contiguous(), rois_ = rois.contiguous();
  AT_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "ROIAlignRotated_forward", [&] {
        RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
            output_size,
            input_.data_ptr<scalar_t>(),
            spatial_scale,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            sampling_ratio,
            rois_.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>());
      });
  cudaDeviceSynchronize();
  AT_CUDA_CHECK(cudaGetLastError());
  return output;
}

// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlignRotated_backward_cuda(
    const at::Tensor& grad,
    const at::Tensor& rois,
    const float spatial_scale,
    const int pooled_height,
    const int pooled_width,
    const int batch_size,
    const int channels,
    const int height,
    const int width,
    const int sampling_ratio) {
  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");

  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
  at::CheckedFrom c = "ROIAlign_backward_cuda";
  at::checkAllSameGPU(c, {grad_t, rois_t});
  at::checkAllSameType(c, {grad_t, rois_t});
  at::cuda::CUDAGuard device_guard(grad.device());

  auto num_rois = rois.size(0);
  auto grad_input =
      at::zeros({batch_size, channels, height, width}, grad.options());

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(
      at::cuda::ATenCeilDiv(
          static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
      static_cast<int64_t>(4096)));
  dim3 block(512);

  // handle possibly empty gradients
  if (grad.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return grad_input;
  }

  auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
  AT_DISPATCH_FLOATING_TYPES(
      grad.scalar_type(), "ROIAlignRotated_backward", [&] {
        RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
            grad.numel(),
            grad_.data_ptr<scalar_t>(),
            num_rois,
            spatial_scale,
            channels,
            height,
            width,
            pooled_height,
            pooled_width,
            sampling_ratio,
            grad_input.data_ptr<scalar_t>(),
            rois_.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
  return grad_input;
}

} // namespace detectron2
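The rotated-RoI kernels above read each RoI as six values via rois + n * 6: (batch_index, center_x, center_y, width, height, angle_in_degrees), and the forward launcher returns a (num_rois, channels, pooled_height, pooled_width) tensor. The snippet below is a minimal host-side sketch of calling the forward launcher; it is not part of the commit, the shapes and values are illustrative, and the commented include stands in for whatever header declares the launcher in your build.

// Sketch only: invoking detectron2::ROIAlignRotated_forward_cuda directly.
#include <torch/torch.h>
// #include "<header declaring ROIAlignRotated_forward_cuda>"  // placeholder

void roi_align_rotated_demo() {
  auto input = torch::rand({2, 256, 64, 64}, torch::kCUDA); // NCHW feature map
  // One RoI in image 0: centered at (32, 32), a 16x24 box rotated by 30 degrees.
  auto rois = torch::tensor({0.f, 32.f, 32.f, 16.f, 24.f, 30.f})
                  .view({1, 6})
                  .to(torch::kCUDA);
  auto out = detectron2::ROIAlignRotated_forward_cuda(
      input, rois, /*spatial_scale=*/1.0f,
      /*pooled_height=*/7, /*pooled_width=*/7, /*sampling_ratio=*/2);
  // out: (1, 256, 7, 7)
}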
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>

namespace detectron2 {

at::Tensor box_iou_rotated_cpu(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2);

#ifdef WITH_CUDA
at::Tensor box_iou_rotated_cuda(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2);
#endif

// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
inline at::Tensor box_iou_rotated(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2) {
  assert(boxes1.device().is_cuda() == boxes2.device().is_cuda());
  if (boxes1.device().is_cuda()) {
#ifdef WITH_CUDA
    return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
}

} // namespace detectron2
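A quick usage sketch for the dispatcher declared above, not part of the commit: each box row is (center_x, center_y, width, height, angle_in_degrees), the inline wrapper routes to the CPU or CUDA implementation based on the device of the inputs, and the result is the pairwise IoU matrix. Values are illustrative.

// Sketch only: pairwise rotated IoU through detectron2::box_iou_rotated.
#include <torch/torch.h>
// #include "box_iou_rotated.h"

void box_iou_rotated_demo() {
  // Two boxes vs. three boxes; both CPU float tensors here.
  auto boxes1 = torch::tensor({10.f, 10.f, 8.f, 4.f, 0.f,
                               20.f, 20.f, 6.f, 6.f, 45.f})
                    .view({2, 5});
  auto boxes2 = torch::rand({3, 5}) * 30.f;
  auto ious = detectron2::box_iou_rotated(boxes1, boxes2); // shape (2, 3)
}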
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "box_iou_rotated.h"
#include "box_iou_rotated_utils.h"

namespace detectron2 {

template <typename T>
void box_iou_rotated_cpu_kernel(
    const at::Tensor& boxes1,
    const at::Tensor& boxes2,
    at::Tensor& ious) {
  auto num_boxes1 = boxes1.size(0);
  auto num_boxes2 = boxes2.size(0);

  for (int i = 0; i < num_boxes1; i++) {
    for (int j = 0; j < num_boxes2; j++) {
      ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
          boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
    }
  }
}

at::Tensor box_iou_rotated_cpu(
    // input must be contiguous:
    const at::Tensor& boxes1,
    const at::Tensor& boxes2) {
  auto num_boxes1 = boxes1.size(0);
  auto num_boxes2 = boxes2.size(0);
  at::Tensor ious =
      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));

  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious);

  // reshape from 1d array to 2d array
  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
  return ious.reshape(shape);
}

} // namespace detectron2
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include "box_iou_rotated_utils.h"

namespace detectron2 {

// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;

template <typename T>
__global__ void box_iou_rotated_cuda_kernel(
    const int n_boxes1,
    const int n_boxes2,
    const T* dev_boxes1,
    const T* dev_boxes2,
    T* dev_ious) {
  const int row_start = blockIdx.x * blockDim.x;
  const int col_start = blockIdx.y * blockDim.y;

  const int row_size = min(n_boxes1 - row_start, blockDim.x);
  const int col_size = min(n_boxes2 - col_start, blockDim.y);

  __shared__ float block_boxes1[BLOCK_DIM_X * 5];
  __shared__ float block_boxes2[BLOCK_DIM_Y * 5];

  // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
  if (threadIdx.x < row_size && threadIdx.y == 0) {
    block_boxes1[threadIdx.x * 5 + 0] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
    block_boxes1[threadIdx.x * 5 + 1] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
    block_boxes1[threadIdx.x * 5 + 2] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
    block_boxes1[threadIdx.x * 5 + 3] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
    block_boxes1[threadIdx.x * 5 + 4] =
        dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
  }

  if (threadIdx.x < col_size && threadIdx.y == 0) {
    block_boxes2[threadIdx.x * 5 + 0] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
    block_boxes2[threadIdx.x * 5 + 1] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
    block_boxes2[threadIdx.x * 5 + 2] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
    block_boxes2[threadIdx.x * 5 + 3] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
    block_boxes2[threadIdx.x * 5 + 4] =
        dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size && threadIdx.y < col_size) {
    int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
    dev_ious[offset] = single_box_iou_rotated<T>(
        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
  }
}

at::Tensor box_iou_rotated_cuda(
    // input must be contiguous
    const at::Tensor& boxes1,
    const at::Tensor& boxes2) {
  using scalar_t = float;
  AT_ASSERTM(
      boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor");
  AT_ASSERTM(
      boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor");
  AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor");
  AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(boxes1.device());

  auto num_boxes1 = boxes1.size(0);
  auto num_boxes2 = boxes2.size(0);

  at::Tensor ious =
      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));

  bool transpose = false;
  if (num_boxes1 > 0 && num_boxes2 > 0) {
    scalar_t *data1 = boxes1.data_ptr<scalar_t>(),
             *data2 = boxes2.data_ptr<scalar_t>();

    if (num_boxes2 > 65535 * BLOCK_DIM_Y) {
      AT_ASSERTM(
          num_boxes1 <= 65535 * BLOCK_DIM_Y,
          "Too many boxes for box_iou_rotated_cuda!");
      // x dim is allowed to be large, but y dim cannot,
      // so we transpose the two to avoid "invalid configuration argument"
      // error. We assume one of them is small. Otherwise the result is hard to
      // fit in memory anyway.
      std::swap(num_boxes1, num_boxes2);
      std::swap(data1, data2);
      transpose = true;
    }

    const int blocks_x =
        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes1), BLOCK_DIM_X);
    const int blocks_y =
        at::cuda::ATenCeilDiv(static_cast<int>(num_boxes2), BLOCK_DIM_Y);

    dim3 blocks(blocks_x, blocks_y);
    dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        num_boxes1,
        num_boxes2,
        data1,
        data2,
        (scalar_t*)ious.data_ptr<scalar_t>());

    AT_CUDA_CHECK(cudaGetLastError());
  }

  // reshape from 1d array to 2d array
  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
  if (transpose) {
    return ious.view(shape).t();
  } else {
    return ious.view(shape);
  }
}

} // namespace detectron2
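The CUDA path above computes the full N1 x N2 IoU matrix with 32 x 16 thread blocks (BLOCK_DIM_X x BLOCK_DIM_Y), staging each tile's boxes in shared memory before the per-pair IoU call. The small sketch below only restates the resulting launch geometry to make the indexing explicit; it mirrors the blocks_x / blocks_y computation above rather than adding anything new.

// Sketch: grid shape for an n_boxes1 x n_boxes2 IoU matrix with 32x16 tiles.
#include <cuda_runtime.h>

inline dim3 iou_grid(int n_boxes1, int n_boxes2) {
  const int block_x = 32, block_y = 16; // BLOCK_DIM_X, BLOCK_DIM_Y
  return dim3((n_boxes1 + block_x - 1) / block_x,
              (n_boxes2 + block_y - 1) / block_y);
}
// Thread (tx, ty) of block (bx, by) handles the box pair
// (bx * 32 + tx, by * 16 + ty) and writes dev_ious[row * n_boxes2 + col].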
detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <cassert>
#include <cmath>
#ifdef __CUDACC__
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif
namespace
detectron2
{
namespace
{
template
<
typename
T
>
struct
RotatedBox
{
T
x_ctr
,
y_ctr
,
w
,
h
,
a
;
};
template
<
typename
T
>
struct
Point
{
T
x
,
y
;
HOST_DEVICE_INLINE
Point
(
const
T
&
px
=
0
,
const
T
&
py
=
0
)
:
x
(
px
),
y
(
py
)
{}
HOST_DEVICE_INLINE
Point
operator
+
(
const
Point
&
p
)
const
{
return
Point
(
x
+
p
.
x
,
y
+
p
.
y
);
}
HOST_DEVICE_INLINE
Point
&
operator
+=
(
const
Point
&
p
)
{
x
+=
p
.
x
;
y
+=
p
.
y
;
return
*
this
;
}
HOST_DEVICE_INLINE
Point
operator
-
(
const
Point
&
p
)
const
{
return
Point
(
x
-
p
.
x
,
y
-
p
.
y
);
}
HOST_DEVICE_INLINE
Point
operator
*
(
const
T
coeff
)
const
{
return
Point
(
x
*
coeff
,
y
*
coeff
);
}
};
template
<
typename
T
>
HOST_DEVICE_INLINE
T
dot_2d
(
const
Point
<
T
>&
A
,
const
Point
<
T
>&
B
)
{
return
A
.
x
*
B
.
x
+
A
.
y
*
B
.
y
;
}
// R: result type. can be different from input type
template
<
typename
T
,
typename
R
=
T
>
HOST_DEVICE_INLINE
R
cross_2d
(
const
Point
<
T
>&
A
,
const
Point
<
T
>&
B
)
{
return
static_cast
<
R
>
(
A
.
x
)
*
static_cast
<
R
>
(
B
.
y
)
-
static_cast
<
R
>
(
B
.
x
)
*
static_cast
<
R
>
(
A
.
y
);
}
template
<
typename
T
>
HOST_DEVICE_INLINE
void
get_rotated_vertices
(
const
RotatedBox
<
T
>&
box
,
Point
<
T
>
(
&
pts
)[
4
])
{
// M_PI / 180. == 0.01745329251
double
theta
=
box
.
a
*
0.01745329251
;
T
cosTheta2
=
(
T
)
cos
(
theta
)
*
0.5
f
;
T
sinTheta2
=
(
T
)
sin
(
theta
)
*
0.5
f
;
// y: top --> down; x: left --> right
pts
[
0
].
x
=
box
.
x_ctr
+
sinTheta2
*
box
.
h
+
cosTheta2
*
box
.
w
;
pts
[
0
].
y
=
box
.
y_ctr
+
cosTheta2
*
box
.
h
-
sinTheta2
*
box
.
w
;
pts
[
1
].
x
=
box
.
x_ctr
-
sinTheta2
*
box
.
h
+
cosTheta2
*
box
.
w
;
pts
[
1
].
y
=
box
.
y_ctr
-
cosTheta2
*
box
.
h
-
sinTheta2
*
box
.
w
;
pts
[
2
].
x
=
2
*
box
.
x_ctr
-
pts
[
0
].
x
;
pts
[
2
].
y
=
2
*
box
.
y_ctr
-
pts
[
0
].
y
;
pts
[
3
].
x
=
2
*
box
.
x_ctr
-
pts
[
1
].
x
;
pts
[
3
].
y
=
2
*
box
.
y_ctr
-
pts
[
1
].
y
;
}
template
<
typename
T
>
HOST_DEVICE_INLINE
int
get_intersection_points
(
const
Point
<
T
>
(
&
pts1
)[
4
],
const
Point
<
T
>
(
&
pts2
)[
4
],
Point
<
T
>
(
&
intersections
)[
24
])
{
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point
<
T
>
vec1
[
4
],
vec2
[
4
];
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
vec1
[
i
]
=
pts1
[(
i
+
1
)
%
4
]
-
pts1
[
i
];
vec2
[
i
]
=
pts2
[(
i
+
1
)
%
4
]
-
pts2
[
i
];
}
// Line test - test all line combos for intersection
int
num
=
0
;
// number of intersections
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
for
(
int
j
=
0
;
j
<
4
;
j
++
)
{
// Solve for 2x2 Ax=b
T
det
=
cross_2d
<
T
>
(
vec2
[
j
],
vec1
[
i
]);
// This takes care of parallel lines
if
(
fabs
(
det
)
<=
1e-14
)
{
continue
;
}
auto
vec12
=
pts2
[
j
]
-
pts1
[
i
];
T
t1
=
cross_2d
<
T
>
(
vec2
[
j
],
vec12
)
/
det
;
T
t2
=
cross_2d
<
T
>
(
vec1
[
i
],
vec12
)
/
det
;
if
(
t1
>=
0.0
f
&&
t1
<=
1.0
f
&&
t2
>=
0.0
f
&&
t2
<=
1.0
f
)
{
intersections
[
num
++
]
=
pts1
[
i
]
+
vec1
[
i
]
*
t1
;
}
}
}
// Check for vertices of rect1 inside rect2
{
const
auto
&
AB
=
vec2
[
0
];
const
auto
&
DA
=
vec2
[
3
];
auto
ABdotAB
=
dot_2d
<
T
>
(
AB
,
AB
);
auto
ADdotAD
=
dot_2d
<
T
>
(
DA
,
DA
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto
AP
=
pts1
[
i
]
-
pts2
[
0
];
auto
APdotAB
=
dot_2d
<
T
>
(
AP
,
AB
);
auto
APdotAD
=
-
dot_2d
<
T
>
(
AP
,
DA
);
if
((
APdotAB
>=
0
)
&&
(
APdotAD
>=
0
)
&&
(
APdotAB
<=
ABdotAB
)
&&
(
APdotAD
<=
ADdotAD
))
{
intersections
[
num
++
]
=
pts1
[
i
];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const
auto
&
AB
=
vec1
[
0
];
const
auto
&
DA
=
vec1
[
3
];
auto
ABdotAB
=
dot_2d
<
T
>
(
AB
,
AB
);
auto
ADdotAD
=
dot_2d
<
T
>
(
DA
,
DA
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
auto
AP
=
pts2
[
i
]
-
pts1
[
0
];
auto
APdotAB
=
dot_2d
<
T
>
(
AP
,
AB
);
auto
APdotAD
=
-
dot_2d
<
T
>
(
AP
,
DA
);
if
((
APdotAB
>=
0
)
&&
(
APdotAD
>=
0
)
&&
(
APdotAB
<=
ABdotAB
)
&&
(
APdotAD
<=
ADdotAD
))
{
intersections
[
num
++
]
=
pts2
[
i
];
}
}
}
return
num
;
}
template
<
typename
T
>
HOST_DEVICE_INLINE
int
convex_hull_graham
(
const
Point
<
T
>
(
&
p
)[
24
],
const
int
&
num_in
,
Point
<
T
>
(
&
q
)[
24
],
bool
shift_to_zero
=
false
)
{
assert
(
num_in
>=
2
);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int
t
=
0
;
for
(
int
i
=
1
;
i
<
num_in
;
i
++
)
{
if
(
p
[
i
].
y
<
p
[
t
].
y
||
(
p
[
i
].
y
==
p
[
t
].
y
&&
p
[
i
].
x
<
p
[
t
].
x
))
{
t
=
i
;
}
}
auto
&
start
=
p
[
t
];
// starting point
// Step 2:
// Subtract starting point from every points (for sorting in the next step)
for
(
int
i
=
0
;
i
<
num_in
;
i
++
)
{
q
[
i
]
=
p
[
i
]
-
start
;
}
// Swap the starting point to position 0
auto
tmp
=
q
[
0
];
q
[
0
]
=
q
[
t
];
q
[
t
]
=
tmp
;
// Step 3:
// Sort point 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T
dist
[
24
];
#ifdef __CUDACC__
// compute distance to origin before sort, and sort them together with the
// points
for
(
int
i
=
0
;
i
<
num_in
;
i
++
)
{
dist
[
i
]
=
dot_2d
<
T
>
(
q
[
i
],
q
[
i
]);
}
// CUDA version
// In the future, we can potentially use thrust
// for sorting here to improve speed (though not guaranteed)
for
(
int
i
=
1
;
i
<
num_in
-
1
;
i
++
)
{
for
(
int
j
=
i
+
1
;
j
<
num_in
;
j
++
)
{
T
crossProduct
=
cross_2d
<
T
>
(
q
[
i
],
q
[
j
]);
if
((
crossProduct
<
-
1e-6
)
||
(
fabs
(
crossProduct
)
<
1e-6
&&
dist
[
i
]
>
dist
[
j
]))
{
auto
q_tmp
=
q
[
i
];
q
[
i
]
=
q
[
j
];
q
[
j
]
=
q_tmp
;
auto
dist_tmp
=
dist
[
i
];
dist
[
i
]
=
dist
[
j
];
dist
[
j
]
=
dist_tmp
;
}
}
}
#else
// CPU version
std
::
sort
(
q
+
1
,
q
+
num_in
,
[](
const
Point
<
T
>&
A
,
const
Point
<
T
>&
B
)
->
bool
{
T
temp
=
cross_2d
<
T
>
(
A
,
B
);
if
(
fabs
(
temp
)
<
1e-6
)
{
return
dot_2d
<
T
>
(
A
,
A
)
<
dot_2d
<
T
>
(
B
,
B
);
}
else
{
return
temp
>
0
;
}
});
// compute distance to origin after sort, since the points are now different.
for
(
int
i
=
0
;
i
<
num_in
;
i
++
)
{
dist
[
i
]
=
dot_2d
<
T
>
(
q
[
i
],
q
[
i
]);
}
#endif
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int
k
;
// index of the non-overlapped second point
for
(
k
=
1
;
k
<
num_in
;
k
++
)
{
if
(
dist
[
k
]
>
1e-8
)
{
break
;
}
}
if
(
k
==
num_in
)
{
// We reach the end, which means the convex hull is just one point
q
[
0
]
=
p
[
t
];
return
1
;
}
q
[
1
]
=
q
[
k
];
int
m
=
2
;
// 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for
(
int
i
=
k
+
1
;
i
<
num_in
;
i
++
)
{
while
(
m
>
1
)
{
auto
q1
=
q
[
i
]
-
q
[
m
-
2
],
q2
=
q
[
m
-
1
]
-
q
[
m
-
2
];
// cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) -
// q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we
// compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means
// round to nearest floating point).
if
(
q1
.
x
*
q2
.
y
>=
q2
.
x
*
q1
.
y
)
m
--
;
else
break
;
}
// Using double also helps, but float can solve the issue for now.
// while (m > 1 && cross_2d<T, double>(q[i] - q[m - 2], q[m - 1] - q[m - 2])
// >= 0) {
// m--;
// }
q
[
m
++
]
=
q
[
i
];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if
(
!
shift_to_zero
)
{
for
(
int
i
=
0
;
i
<
m
;
i
++
)
{
q
[
i
]
+=
start
;
}
}
return
m
;
}
template
<
typename
T
>
HOST_DEVICE_INLINE
T
polygon_area
(
const
Point
<
T
>
(
&
q
)[
24
],
const
int
&
m
)
{
if
(
m
<=
2
)
{
return
0
;
}
T
area
=
0
;
for
(
int
i
=
1
;
i
<
m
-
1
;
i
++
)
{
area
+=
fabs
(
cross_2d
<
T
>
(
q
[
i
]
-
q
[
0
],
q
[
i
+
1
]
-
q
[
0
]));
}
return
area
/
2.0
;
}
template
<
typename
T
>
HOST_DEVICE_INLINE
T
rotated_boxes_intersection
(
const
RotatedBox
<
T
>&
box1
,
const
RotatedBox
<
T
>&
box2
)
{
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point
<
T
>
intersectPts
[
24
],
orderedPts
[
24
];
Point
<
T
>
pts1
[
4
];
Point
<
T
>
pts2
[
4
];
get_rotated_vertices
<
T
>
(
box1
,
pts1
);
get_rotated_vertices
<
T
>
(
box2
,
pts2
);
int
num
=
get_intersection_points
<
T
>
(
pts1
,
pts2
,
intersectPts
);
if
(
num
<=
2
)
{
return
0.0
;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int
num_convex
=
convex_hull_graham
<
T
>
(
intersectPts
,
num
,
orderedPts
,
true
);
return
polygon_area
<
T
>
(
orderedPts
,
num_convex
);
}
}
// namespace
template
<
typename
T
>
HOST_DEVICE_INLINE
T
single_box_iou_rotated
(
T
const
*
const
box1_raw
,
T
const
*
const
box2_raw
)
{
// shift center to the middle point to achieve higher precision in result
RotatedBox
<
T
>
box1
,
box2
;
auto
center_shift_x
=
(
box1_raw
[
0
]
+
box2_raw
[
0
])
/
2.0
;
auto
center_shift_y
=
(
box1_raw
[
1
]
+
box2_raw
[
1
])
/
2.0
;
box1
.
x_ctr
=
box1_raw
[
0
]
-
center_shift_x
;
box1
.
y_ctr
=
box1_raw
[
1
]
-
center_shift_y
;
box1
.
w
=
box1_raw
[
2
];
box1
.
h
=
box1_raw
[
3
];
box1
.
a
=
box1_raw
[
4
];
box2
.
x_ctr
=
box2_raw
[
0
]
-
center_shift_x
;
box2
.
y_ctr
=
box2_raw
[
1
]
-
center_shift_y
;
box2
.
w
=
box2_raw
[
2
];
box2
.
h
=
box2_raw
[
3
];
box2
.
a
=
box2_raw
[
4
];
T
area1
=
box1
.
w
*
box1
.
h
;
T
area2
=
box2
.
w
*
box2
.
h
;
if
(
area1
<
1e-14
||
area2
<
1e-14
)
{
return
0.
f
;
}
T
intersection
=
rotated_boxes_intersection
<
T
>
(
box1
,
box2
);
T
iou
=
intersection
/
(
area1
+
area2
-
intersection
);
return
iou
;
}
}
// namespace detectron2
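single_box_iou_rotated operates directly on raw five-float box descriptors, which is how both the CPU and CUDA kernels call it; angles are interpreted in degrees (see get_rotated_vertices). Below is a tiny host-side sketch with hand-checkable numbers, not part of the commit.

// Sketch only: IoU of two unrotated boxes in (cx, cy, w, h, angle) form.
// #include "box_iou_rotated_utils.h"

float rotated_iou_demo() {
  const float a[5] = {0.f, 0.f, 4.f, 2.f, 0.f};
  const float b[5] = {1.f, 0.f, 4.f, 2.f, 0.f};
  // Overlap 3 * 2 = 6, union 8 + 8 - 6 = 10, so the result should be ~0.6.
  return detectron2::single_box_iou_rotated<float>(a, b);
}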
detectron2/layers/csrc/cuda_version.cu
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <cuda_runtime_api.h>

namespace detectron2 {
int get_cudart_version() {
  return CUDART_VERSION;
}
} // namespace detectron2
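get_cudart_version() simply exposes the CUDART_VERSION the extension was built against. The following is a hedged sketch of how that value could be compared with the CUDA runtime visible when the library is loaded; the check itself is illustrative and not something this commit performs.

// Sketch only: compare build-time vs. run-time CUDA runtime versions.
#include <cuda_runtime_api.h>
#include <cstdio>

void check_cuda_version_demo() {
  int runtime_version = 0;
  cudaRuntimeGetVersion(&runtime_version);                  // e.g. 10020 for CUDA 10.2
  int compiled_version = detectron2::get_cudart_version();  // CUDART_VERSION at build time
  if (runtime_version / 1000 != compiled_version / 1000) {
    std::printf("warning: CUDA runtime %d differs from build-time %d\n",
                runtime_version, compiled_version);
  }
}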
detectron2/layers/csrc/deformable/deform_conv.h
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>
namespace
detectron2
{
#ifdef WITH_CUDA
int
deform_conv_forward_cuda
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
offset
,
at
::
Tensor
output
,
at
::
Tensor
columns
,
at
::
Tensor
ones
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
int
im2col_step
);
int
deform_conv_backward_input_cuda
(
at
::
Tensor
input
,
at
::
Tensor
offset
,
at
::
Tensor
gradOutput
,
at
::
Tensor
gradInput
,
at
::
Tensor
gradOffset
,
at
::
Tensor
weight
,
at
::
Tensor
columns
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
int
im2col_step
);
int
deform_conv_backward_parameters_cuda
(
at
::
Tensor
input
,
at
::
Tensor
offset
,
at
::
Tensor
gradOutput
,
at
::
Tensor
gradWeight
,
// at::Tensor gradBias,
at
::
Tensor
columns
,
at
::
Tensor
ones
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
float
scale
,
int
im2col_step
);
void
modulated_deform_conv_cuda_forward
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
bias
,
at
::
Tensor
ones
,
at
::
Tensor
offset
,
at
::
Tensor
mask
,
at
::
Tensor
output
,
at
::
Tensor
columns
,
int
kernel_h
,
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
deformable_group
,
const
bool
with_bias
);
void
modulated_deform_conv_cuda_backward
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
bias
,
at
::
Tensor
ones
,
at
::
Tensor
offset
,
at
::
Tensor
mask
,
at
::
Tensor
columns
,
at
::
Tensor
grad_input
,
at
::
Tensor
grad_weight
,
at
::
Tensor
grad_bias
,
at
::
Tensor
grad_offset
,
at
::
Tensor
grad_mask
,
at
::
Tensor
grad_output
,
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
int
pad_h
,
int
pad_w
,
int
dilation_h
,
int
dilation_w
,
int
group
,
int
deformable_group
,
const
bool
with_bias
);
#endif
inline
int
deform_conv_forward
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
offset
,
at
::
Tensor
output
,
at
::
Tensor
columns
,
at
::
Tensor
ones
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
int
im2col_step
)
{
if
(
input
.
is_cuda
())
{
#ifdef WITH_CUDA
TORCH_CHECK
(
weight
.
is_cuda
(),
"weight tensor is not on GPU!"
);
TORCH_CHECK
(
offset
.
is_cuda
(),
"offset tensor is not on GPU!"
);
return
deform_conv_forward_cuda
(
input
,
weight
,
offset
,
output
,
columns
,
ones
,
kW
,
kH
,
dW
,
dH
,
padW
,
padH
,
dilationW
,
dilationH
,
group
,
deformable_group
,
im2col_step
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
AT_ERROR
(
"Not implemented on the CPU"
);
}
inline
int
deform_conv_backward_input
(
at
::
Tensor
input
,
at
::
Tensor
offset
,
at
::
Tensor
gradOutput
,
at
::
Tensor
gradInput
,
at
::
Tensor
gradOffset
,
at
::
Tensor
weight
,
at
::
Tensor
columns
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
int
im2col_step
)
{
if
(
gradOutput
.
is_cuda
())
{
#ifdef WITH_CUDA
TORCH_CHECK
(
input
.
is_cuda
(),
"input tensor is not on GPU!"
);
TORCH_CHECK
(
weight
.
is_cuda
(),
"weight tensor is not on GPU!"
);
TORCH_CHECK
(
offset
.
is_cuda
(),
"offset tensor is not on GPU!"
);
return
deform_conv_backward_input_cuda
(
input
,
offset
,
gradOutput
,
gradInput
,
gradOffset
,
weight
,
columns
,
kW
,
kH
,
dW
,
dH
,
padW
,
padH
,
dilationW
,
dilationH
,
group
,
deformable_group
,
im2col_step
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
AT_ERROR
(
"Not implemented on the CPU"
);
}
inline
int
deform_conv_backward_filter
(
at
::
Tensor
input
,
at
::
Tensor
offset
,
at
::
Tensor
gradOutput
,
at
::
Tensor
gradWeight
,
// at::Tensor gradBias,
at
::
Tensor
columns
,
at
::
Tensor
ones
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
float
scale
,
int
im2col_step
)
{
if
(
gradOutput
.
is_cuda
())
{
#ifdef WITH_CUDA
TORCH_CHECK
(
input
.
is_cuda
(),
"input tensor is not on GPU!"
);
TORCH_CHECK
(
offset
.
is_cuda
(),
"offset tensor is not on GPU!"
);
return
deform_conv_backward_parameters_cuda
(
input
,
offset
,
gradOutput
,
gradWeight
,
columns
,
ones
,
kW
,
kH
,
dW
,
dH
,
padW
,
padH
,
dilationW
,
dilationH
,
group
,
deformable_group
,
scale
,
im2col_step
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
AT_ERROR
(
"Not implemented on the CPU"
);
}
inline
void
modulated_deform_conv_forward
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
bias
,
at
::
Tensor
ones
,
at
::
Tensor
offset
,
at
::
Tensor
mask
,
at
::
Tensor
output
,
at
::
Tensor
columns
,
int
kernel_h
,
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
deformable_group
,
const
bool
with_bias
)
{
if
(
input
.
is_cuda
())
{
#ifdef WITH_CUDA
TORCH_CHECK
(
weight
.
is_cuda
(),
"weight tensor is not on GPU!"
);
TORCH_CHECK
(
bias
.
is_cuda
(),
"bias tensor is not on GPU!"
);
TORCH_CHECK
(
offset
.
is_cuda
(),
"offset tensor is not on GPU!"
);
return
modulated_deform_conv_cuda_forward
(
input
,
weight
,
bias
,
ones
,
offset
,
mask
,
output
,
columns
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
deformable_group
,
with_bias
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
AT_ERROR
(
"Not implemented on the CPU"
);
}
inline
void
modulated_deform_conv_backward
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
bias
,
at
::
Tensor
ones
,
at
::
Tensor
offset
,
at
::
Tensor
mask
,
at
::
Tensor
columns
,
at
::
Tensor
grad_input
,
at
::
Tensor
grad_weight
,
at
::
Tensor
grad_bias
,
at
::
Tensor
grad_offset
,
at
::
Tensor
grad_mask
,
at
::
Tensor
grad_output
,
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
int
pad_h
,
int
pad_w
,
int
dilation_h
,
int
dilation_w
,
int
group
,
int
deformable_group
,
const
bool
with_bias
)
{
if
(
grad_output
.
is_cuda
())
{
#ifdef WITH_CUDA
TORCH_CHECK
(
input
.
is_cuda
(),
"input tensor is not on GPU!"
);
TORCH_CHECK
(
weight
.
is_cuda
(),
"weight tensor is not on GPU!"
);
TORCH_CHECK
(
bias
.
is_cuda
(),
"bias tensor is not on GPU!"
);
TORCH_CHECK
(
offset
.
is_cuda
(),
"offset tensor is not on GPU!"
);
return
modulated_deform_conv_cuda_backward
(
input
,
weight
,
bias
,
ones
,
offset
,
mask
,
columns
,
grad_input
,
grad_weight
,
grad_bias
,
grad_offset
,
grad_mask
,
grad_output
,
kernel_h
,
kernel_w
,
stride_h
,
stride_w
,
pad_h
,
pad_w
,
dilation_h
,
dilation_w
,
group
,
deformable_group
,
with_bias
);
#else
AT_ERROR
(
"Not compiled with GPU support"
);
#endif
}
AT_ERROR
(
"Not implemented on the CPU"
);
}
}
// namespace detectron2
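Every inline wrapper in this header follows the same dispatch pattern: if the driving tensor is on the GPU and the extension was compiled WITH_CUDA, forward to the corresponding *_cuda implementation, otherwise raise AT_ERROR, since no CPU implementation exists. Below is a stripped-down sketch of that pattern with a hypothetical op, shown only to make the structure explicit.

// Sketch of the dispatch pattern used above; my_op / my_op_cuda are hypothetical.
#include <torch/types.h>

at::Tensor my_op_cuda(at::Tensor input); // hypothetical CUDA implementation

inline at::Tensor my_op(at::Tensor input) {
  if (input.is_cuda()) {
#ifdef WITH_CUDA
    return my_op_cuda(input);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}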
detectron2/layers/csrc/deformable/deform_conv_cuda.cu
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
// Original license: Apache 2.0
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
// Original license: Apache 2.0
#include <torch/types.h>
#include "deform_conv.h"
#include <cmath>
#include <vector>
namespace
detectron2
{
void
deformable_im2col
(
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
ksize_h
,
const
int
ksize_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
parallel_imgs
,
const
int
deformable_group
,
at
::
Tensor
data_col
);
void
deformable_col2im
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
ksize_h
,
const
int
ksize_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
parallel_imgs
,
const
int
deformable_group
,
at
::
Tensor
grad_im
);
void
deformable_col2im_coord
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
ksize_h
,
const
int
ksize_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
parallel_imgs
,
const
int
deformable_group
,
at
::
Tensor
grad_offset
);
void
modulated_deformable_im2col_cuda
(
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
at
::
Tensor
data_mask
,
const
int
batch_size
,
const
int
channels
,
const
int
height_im
,
const
int
width_im
,
const
int
height_col
,
const
int
width_col
,
const
int
kernel_h
,
const
int
kenerl_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
deformable_group
,
at
::
Tensor
data_col
);
void
modulated_deformable_col2im_cuda
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_offset
,
const
at
::
Tensor
data_mask
,
const
int
batch_size
,
const
int
channels
,
const
int
height_im
,
const
int
width_im
,
const
int
height_col
,
const
int
width_col
,
const
int
kernel_h
,
const
int
kenerl_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
deformable_group
,
at
::
Tensor
grad_im
);
void
modulated_deformable_col2im_coord_cuda
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
at
::
Tensor
data_mask
,
const
int
batch_size
,
const
int
channels
,
const
int
height_im
,
const
int
width_im
,
const
int
height_col
,
const
int
width_col
,
const
int
kernel_h
,
const
int
kenerl_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
deformable_group
,
at
::
Tensor
grad_offset
,
at
::
Tensor
grad_mask
);
void
shape_check
(
at
::
Tensor
input
,
at
::
Tensor
offset
,
at
::
Tensor
*
gradOutput
,
at
::
Tensor
weight
,
int
kH
,
int
kW
,
int
dH
,
int
dW
,
int
padH
,
int
padW
,
int
dilationH
,
int
dilationW
,
int
group
,
int
deformable_group
)
{
TORCH_CHECK
(
weight
.
ndimension
()
==
4
,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
"but got: %s"
,
weight
.
ndimension
());
TORCH_CHECK
(
weight
.
is_contiguous
(),
"weight tensor has to be contiguous"
);
TORCH_CHECK
(
kW
>
0
&&
kH
>
0
,
"kernel size should be greater than zero, but got kH: %d kW: %d"
,
kH
,
kW
);
TORCH_CHECK
(
(
weight
.
size
(
2
)
==
kH
&&
weight
.
size
(
3
)
==
kW
),
"kernel size should be consistent with weight, "
,
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d"
,
kH
,
kW
,
weight
.
size
(
2
),
weight
.
size
(
3
));
TORCH_CHECK
(
dW
>
0
&&
dH
>
0
,
"stride should be greater than zero, but got dH: %d dW: %d"
,
dH
,
dW
);
TORCH_CHECK
(
dilationW
>
0
&&
dilationH
>
0
,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d"
,
dilationH
,
dilationW
);
int
ndim
=
input
.
ndimension
();
int
dimf
=
0
;
int
dimh
=
1
;
int
dimw
=
2
;
if
(
ndim
==
4
)
{
dimf
++
;
dimh
++
;
dimw
++
;
}
TORCH_CHECK
(
ndim
==
3
||
ndim
==
4
,
"3D or 4D input tensor expected but got: %s"
,
ndim
);
long
nInputPlane
=
weight
.
size
(
1
)
*
group
;
long
inputHeight
=
input
.
size
(
dimh
);
long
inputWidth
=
input
.
size
(
dimw
);
long
nOutputPlane
=
weight
.
size
(
0
);
long
outputHeight
=
(
inputHeight
+
2
*
padH
-
(
dilationH
*
(
kH
-
1
)
+
1
))
/
dH
+
1
;
long
outputWidth
=
(
inputWidth
+
2
*
padW
-
(
dilationW
*
(
kW
-
1
)
+
1
))
/
dW
+
1
;
TORCH_CHECK
(
nInputPlane
%
deformable_group
==
0
,
"input channels must divide deformable group size"
);
if
(
outputWidth
<
1
||
outputHeight
<
1
)
AT_ERROR
(
"Given input size: (%ld x %ld x %ld). "
"Calculated output size: (%ld x %ld x %ld). Output size is too small"
,
nInputPlane
,
inputHeight
,
inputWidth
,
nOutputPlane
,
outputHeight
,
outputWidth
);
TORCH_CHECK
(
input
.
size
(
1
)
==
nInputPlane
,
"invalid number of input planes, expected: %d, but got: %d"
,
nInputPlane
,
input
.
size
(
1
));
TORCH_CHECK
(
(
inputHeight
>=
kH
&&
inputWidth
>=
kW
),
"input image is smaller than kernel"
);
TORCH_CHECK
(
(
offset
.
size
(
2
)
==
outputHeight
&&
offset
.
size
(
3
)
==
outputWidth
),
"invalid spatial size of offset, expected height: %d width: %d, but "
"got height: %d width: %d"
,
outputHeight
,
outputWidth
,
offset
.
size
(
2
),
offset
.
size
(
3
));
TORCH_CHECK
(
(
offset
.
size
(
1
)
==
deformable_group
*
2
*
kH
*
kW
),
"invalid number of channels of offset"
);
if
(
gradOutput
!=
NULL
)
{
TORCH_CHECK
(
gradOutput
->
size
(
dimf
)
==
nOutputPlane
,
"invalid number of gradOutput planes, expected: %d, but got: %d"
,
nOutputPlane
,
gradOutput
->
size
(
dimf
));
TORCH_CHECK
(
(
gradOutput
->
size
(
dimh
)
==
outputHeight
&&
gradOutput
->
size
(
dimw
)
==
outputWidth
),
"invalid size of gradOutput, expected height: %d width: %d , but "
"got height: %d width: %d"
,
outputHeight
,
outputWidth
,
gradOutput
->
size
(
dimh
),
gradOutput
->
size
(
dimw
));
}
}
int
deform_conv_forward_cuda
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
offset
,
at
::
Tensor
output
,
at
::
Tensor
columns
,
at
::
Tensor
ones
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
int
im2col_step
)
{
// todo: resize columns to include im2col: done
// todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly
// transpose output) todo: possibly change data indexing because of
// parallel_imgs
shape_check
(
input
,
offset
,
NULL
,
weight
,
kH
,
kW
,
dH
,
dW
,
padH
,
padW
,
dilationH
,
dilationW
,
group
,
deformable_group
);
input
=
input
.
contiguous
();
offset
=
offset
.
contiguous
();
weight
=
weight
.
contiguous
();
int
batch
=
1
;
if
(
input
.
ndimension
()
==
3
)
{
// Force batch
batch
=
0
;
input
.
unsqueeze_
(
0
);
offset
.
unsqueeze_
(
0
);
}
// todo: assert batchsize dividable by im2col_step
long
batchSize
=
input
.
size
(
0
);
long
nInputPlane
=
input
.
size
(
1
);
long
inputHeight
=
input
.
size
(
2
);
long
inputWidth
=
input
.
size
(
3
);
long
nOutputPlane
=
weight
.
size
(
0
);
long
outputWidth
=
(
inputWidth
+
2
*
padW
-
(
dilationW
*
(
kW
-
1
)
+
1
))
/
dW
+
1
;
long
outputHeight
=
(
inputHeight
+
2
*
padH
-
(
dilationH
*
(
kH
-
1
)
+
1
))
/
dH
+
1
;
TORCH_CHECK
((
offset
.
size
(
0
)
==
batchSize
),
"invalid batch size of offset"
);
output
=
output
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
nOutputPlane
,
outputHeight
,
outputWidth
});
columns
=
at
::
zeros
(
{
nInputPlane
*
kW
*
kH
,
im2col_step
*
outputHeight
*
outputWidth
},
input
.
options
());
if
(
ones
.
ndimension
()
!=
2
||
ones
.
size
(
0
)
*
ones
.
size
(
1
)
<
outputHeight
*
outputWidth
)
{
ones
=
at
::
ones
({
outputHeight
,
outputWidth
},
input
.
options
());
}
input
=
input
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
nInputPlane
,
inputHeight
,
inputWidth
});
offset
=
offset
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
at
::
Tensor
output_buffer
=
at
::
zeros
(
{
batchSize
/
im2col_step
,
nOutputPlane
,
im2col_step
*
outputHeight
,
outputWidth
},
output
.
options
());
output_buffer
=
output_buffer
.
view
({
output_buffer
.
size
(
0
),
group
,
output_buffer
.
size
(
1
)
/
group
,
output_buffer
.
size
(
2
),
output_buffer
.
size
(
3
)});
for
(
int
elt
=
0
;
elt
<
batchSize
/
im2col_step
;
elt
++
)
{
deformable_im2col
(
input
[
elt
],
offset
[
elt
],
nInputPlane
,
inputHeight
,
inputWidth
,
kH
,
kW
,
padH
,
padW
,
dH
,
dW
,
dilationH
,
dilationW
,
im2col_step
,
deformable_group
,
columns
);
columns
=
columns
.
view
({
group
,
columns
.
size
(
0
)
/
group
,
columns
.
size
(
1
)});
weight
=
weight
.
view
({
group
,
weight
.
size
(
0
)
/
group
,
weight
.
size
(
1
),
weight
.
size
(
2
),
weight
.
size
(
3
)});
for
(
int
g
=
0
;
g
<
group
;
g
++
)
{
output_buffer
[
elt
][
g
]
=
output_buffer
[
elt
][
g
]
.
flatten
(
1
)
.
addmm_
(
weight
[
g
].
flatten
(
1
),
columns
[
g
])
.
view_as
(
output_buffer
[
elt
][
g
]);
}
}
output_buffer
=
output_buffer
.
view
({
output_buffer
.
size
(
0
),
output_buffer
.
size
(
1
)
*
output_buffer
.
size
(
2
),
output_buffer
.
size
(
3
),
output_buffer
.
size
(
4
)});
output_buffer
=
output_buffer
.
view
({
batchSize
/
im2col_step
,
nOutputPlane
,
im2col_step
,
outputHeight
,
outputWidth
});
output_buffer
.
transpose_
(
1
,
2
);
output
.
copy_
(
output_buffer
);
output
=
output
.
view
({
batchSize
,
nOutputPlane
,
outputHeight
,
outputWidth
});
input
=
input
.
view
({
batchSize
,
nInputPlane
,
inputHeight
,
inputWidth
});
offset
=
offset
.
view
(
{
batchSize
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
if
(
batch
==
0
)
{
output
=
output
.
view
({
nOutputPlane
,
outputHeight
,
outputWidth
});
input
=
input
.
view
({
nInputPlane
,
inputHeight
,
inputWidth
});
offset
=
offset
.
view
({
offset
.
size
(
1
),
offset
.
size
(
2
),
offset
.
size
(
3
)});
}
return
1
;
}
int
deform_conv_backward_input_cuda
(
at
::
Tensor
input
,
at
::
Tensor
offset
,
at
::
Tensor
gradOutput
,
at
::
Tensor
gradInput
,
at
::
Tensor
gradOffset
,
at
::
Tensor
weight
,
at
::
Tensor
columns
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
int
im2col_step
)
{
shape_check
(
input
,
offset
,
&
gradOutput
,
weight
,
kH
,
kW
,
dH
,
dW
,
padH
,
padW
,
dilationH
,
dilationW
,
group
,
deformable_group
);
input
=
input
.
contiguous
();
offset
=
offset
.
contiguous
();
gradOutput
=
gradOutput
.
contiguous
();
weight
=
weight
.
contiguous
();
int
batch
=
1
;
if
(
input
.
ndimension
()
==
3
)
{
// Force batch
batch
=
0
;
input
=
input
.
view
({
1
,
input
.
size
(
0
),
input
.
size
(
1
),
input
.
size
(
2
)});
offset
=
offset
.
view
({
1
,
offset
.
size
(
0
),
offset
.
size
(
1
),
offset
.
size
(
2
)});
gradOutput
=
gradOutput
.
view
(
{
1
,
gradOutput
.
size
(
0
),
gradOutput
.
size
(
1
),
gradOutput
.
size
(
2
)});
}
long
batchSize
=
input
.
size
(
0
);
long
nInputPlane
=
input
.
size
(
1
);
long
inputHeight
=
input
.
size
(
2
);
long
inputWidth
=
input
.
size
(
3
);
long
nOutputPlane
=
weight
.
size
(
0
);
long
outputWidth
=
(
inputWidth
+
2
*
padW
-
(
dilationW
*
(
kW
-
1
)
+
1
))
/
dW
+
1
;
long
outputHeight
=
(
inputHeight
+
2
*
padH
-
(
dilationH
*
(
kH
-
1
)
+
1
))
/
dH
+
1
;
TORCH_CHECK
((
offset
.
size
(
0
)
==
batchSize
),
3
,
"invalid batch size of offset"
);
gradInput
=
gradInput
.
view
({
batchSize
,
nInputPlane
,
inputHeight
,
inputWidth
});
columns
=
at
::
zeros
(
{
nInputPlane
*
kW
*
kH
,
im2col_step
*
outputHeight
*
outputWidth
},
input
.
options
());
// change order of grad output
gradOutput
=
gradOutput
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
nOutputPlane
,
outputHeight
,
outputWidth
});
gradOutput
.
transpose_
(
1
,
2
);
gradInput
=
gradInput
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
nInputPlane
,
inputHeight
,
inputWidth
});
input
=
input
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
nInputPlane
,
inputHeight
,
inputWidth
});
gradOffset
=
gradOffset
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
offset
=
offset
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
for
(
int
elt
=
0
;
elt
<
batchSize
/
im2col_step
;
elt
++
)
{
// divide into groups
columns
=
columns
.
view
({
group
,
columns
.
size
(
0
)
/
group
,
columns
.
size
(
1
)});
weight
=
weight
.
view
({
group
,
weight
.
size
(
0
)
/
group
,
weight
.
size
(
1
),
weight
.
size
(
2
),
weight
.
size
(
3
)});
gradOutput
=
gradOutput
.
view
({
gradOutput
.
size
(
0
),
group
,
gradOutput
.
size
(
1
)
/
group
,
gradOutput
.
size
(
2
),
gradOutput
.
size
(
3
),
gradOutput
.
size
(
4
)});
for
(
int
g
=
0
;
g
<
group
;
g
++
)
{
columns
[
g
]
=
columns
[
g
].
addmm_
(
weight
[
g
].
flatten
(
1
).
transpose
(
0
,
1
),
gradOutput
[
elt
][
g
].
flatten
(
1
),
0.0
f
,
1.0
f
);
}
columns
=
columns
.
view
({
columns
.
size
(
0
)
*
columns
.
size
(
1
),
columns
.
size
(
2
)});
gradOutput
=
gradOutput
.
view
({
gradOutput
.
size
(
0
),
gradOutput
.
size
(
1
)
*
gradOutput
.
size
(
2
),
gradOutput
.
size
(
3
),
gradOutput
.
size
(
4
),
gradOutput
.
size
(
5
)});
deformable_col2im_coord
(
columns
,
input
[
elt
],
offset
[
elt
],
nInputPlane
,
inputHeight
,
inputWidth
,
kH
,
kW
,
padH
,
padW
,
dH
,
dW
,
dilationH
,
dilationW
,
im2col_step
,
deformable_group
,
gradOffset
[
elt
]);
deformable_col2im
(
columns
,
offset
[
elt
],
nInputPlane
,
inputHeight
,
inputWidth
,
kH
,
kW
,
padH
,
padW
,
dH
,
dW
,
dilationH
,
dilationW
,
im2col_step
,
deformable_group
,
gradInput
[
elt
]);
}
gradOutput
.
transpose_
(
1
,
2
);
gradOutput
=
gradOutput
.
view
({
batchSize
,
nOutputPlane
,
outputHeight
,
outputWidth
});
gradInput
=
gradInput
.
view
({
batchSize
,
nInputPlane
,
inputHeight
,
inputWidth
});
input
=
input
.
view
({
batchSize
,
nInputPlane
,
inputHeight
,
inputWidth
});
gradOffset
=
gradOffset
.
view
(
{
batchSize
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
offset
=
offset
.
view
(
{
batchSize
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
if
(
batch
==
0
)
{
gradOutput
=
gradOutput
.
view
({
nOutputPlane
,
outputHeight
,
outputWidth
});
input
=
input
.
view
({
nInputPlane
,
inputHeight
,
inputWidth
});
gradInput
=
gradInput
.
view
({
nInputPlane
,
inputHeight
,
inputWidth
});
offset
=
offset
.
view
({
offset
.
size
(
1
),
offset
.
size
(
2
),
offset
.
size
(
3
)});
gradOffset
=
gradOffset
.
view
({
offset
.
size
(
1
),
offset
.
size
(
2
),
offset
.
size
(
3
)});
}
return
1
;
}
int
deform_conv_backward_parameters_cuda
(
at
::
Tensor
input
,
at
::
Tensor
offset
,
at
::
Tensor
gradOutput
,
at
::
Tensor
gradWeight
,
// at::Tensor gradBias,
at
::
Tensor
columns
,
at
::
Tensor
ones
,
int
kW
,
int
kH
,
int
dW
,
int
dH
,
int
padW
,
int
padH
,
int
dilationW
,
int
dilationH
,
int
group
,
int
deformable_group
,
float
scale
,
int
im2col_step
)
{
// todo: transpose and reshape outGrad
// todo: reshape columns
// todo: add im2col_step as input
shape_check
(
input
,
offset
,
&
gradOutput
,
gradWeight
,
kH
,
kW
,
dH
,
dW
,
padH
,
padW
,
dilationH
,
dilationW
,
group
,
deformable_group
);
input
=
input
.
contiguous
();
offset
=
offset
.
contiguous
();
gradOutput
=
gradOutput
.
contiguous
();
int
batch
=
1
;
if
(
input
.
ndimension
()
==
3
)
{
// Force batch
batch
=
0
;
input
=
input
.
view
(
at
::
IntList
({
1
,
input
.
size
(
0
),
input
.
size
(
1
),
input
.
size
(
2
)}));
gradOutput
=
gradOutput
.
view
(
{
1
,
gradOutput
.
size
(
0
),
gradOutput
.
size
(
1
),
gradOutput
.
size
(
2
)});
}
long
batchSize
=
input
.
size
(
0
);
long
nInputPlane
=
input
.
size
(
1
);
long
inputHeight
=
input
.
size
(
2
);
long
inputWidth
=
input
.
size
(
3
);
long
nOutputPlane
=
gradWeight
.
size
(
0
);
long
outputWidth
=
(
inputWidth
+
2
*
padW
-
(
dilationW
*
(
kW
-
1
)
+
1
))
/
dW
+
1
;
long
outputHeight
=
(
inputHeight
+
2
*
padH
-
(
dilationH
*
(
kH
-
1
)
+
1
))
/
dH
+
1
;
TORCH_CHECK
((
offset
.
size
(
0
)
==
batchSize
),
"invalid batch size of offset"
);
columns
=
at
::
zeros
(
{
nInputPlane
*
kW
*
kH
,
im2col_step
*
outputHeight
*
outputWidth
},
input
.
options
());
gradOutput
=
gradOutput
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
nOutputPlane
,
outputHeight
,
outputWidth
});
gradOutput
.
transpose_
(
1
,
2
);
at
::
Tensor
gradOutputBuffer
=
at
::
zeros_like
(
gradOutput
);
gradOutputBuffer
=
gradOutputBuffer
.
view
({
batchSize
/
im2col_step
,
nOutputPlane
,
im2col_step
,
outputHeight
,
outputWidth
});
gradOutputBuffer
.
copy_
(
gradOutput
);
// gradOutput is not contiguous, so we do reshape (instead of view) next
gradOutputBuffer
=
gradOutputBuffer
.
reshape
({
batchSize
/
im2col_step
,
nOutputPlane
,
im2col_step
*
outputHeight
,
outputWidth
});
gradOutput
.
transpose_
(
1
,
2
);
gradOutput
=
gradOutput
.
view
({
batchSize
,
nOutputPlane
,
outputHeight
,
outputWidth
});
input
=
input
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
nInputPlane
,
inputHeight
,
inputWidth
});
offset
=
offset
.
view
({
batchSize
/
im2col_step
,
im2col_step
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
for
(
int
elt
=
0
;
elt
<
batchSize
/
im2col_step
;
elt
++
)
{
deformable_im2col
(
input
[
elt
],
offset
[
elt
],
nInputPlane
,
inputHeight
,
inputWidth
,
kH
,
kW
,
padH
,
padW
,
dH
,
dW
,
dilationH
,
dilationW
,
im2col_step
,
deformable_group
,
columns
);
// divide into group
gradOutputBuffer
=
gradOutputBuffer
.
view
({
gradOutputBuffer
.
size
(
0
),
group
,
gradOutputBuffer
.
size
(
1
)
/
group
,
gradOutputBuffer
.
size
(
2
),
gradOutputBuffer
.
size
(
3
)});
columns
=
columns
.
view
({
group
,
columns
.
size
(
0
)
/
group
,
columns
.
size
(
1
)});
gradWeight
=
gradWeight
.
view
({
group
,
gradWeight
.
size
(
0
)
/
group
,
gradWeight
.
size
(
1
),
gradWeight
.
size
(
2
),
gradWeight
.
size
(
3
)});
for
(
int
g
=
0
;
g
<
group
;
g
++
)
{
gradWeight
[
g
]
=
gradWeight
[
g
]
.
flatten
(
1
)
.
addmm_
(
gradOutputBuffer
[
elt
][
g
].
flatten
(
1
),
columns
[
g
].
transpose
(
1
,
0
),
1.0
,
scale
)
.
view_as
(
gradWeight
[
g
]);
}
gradOutputBuffer
=
gradOutputBuffer
.
view
(
{
gradOutputBuffer
.
size
(
0
),
gradOutputBuffer
.
size
(
1
)
*
gradOutputBuffer
.
size
(
2
),
gradOutputBuffer
.
size
(
3
),
gradOutputBuffer
.
size
(
4
)});
columns
=
columns
.
view
({
columns
.
size
(
0
)
*
columns
.
size
(
1
),
columns
.
size
(
2
)});
gradWeight
=
gradWeight
.
view
({
gradWeight
.
size
(
0
)
*
gradWeight
.
size
(
1
),
gradWeight
.
size
(
2
),
gradWeight
.
size
(
3
),
gradWeight
.
size
(
4
)});
}
input
=
input
.
view
({
batchSize
,
nInputPlane
,
inputHeight
,
inputWidth
});
offset
=
offset
.
view
(
{
batchSize
,
deformable_group
*
2
*
kH
*
kW
,
outputHeight
,
outputWidth
});
if
(
batch
==
0
)
{
gradOutput
=
gradOutput
.
view
({
nOutputPlane
,
outputHeight
,
outputWidth
});
input
=
input
.
view
({
nInputPlane
,
inputHeight
,
inputWidth
});
}
return
1
;
}
void
modulated_deform_conv_cuda_forward
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
bias
,
at
::
Tensor
ones
,
at
::
Tensor
offset
,
at
::
Tensor
mask
,
at
::
Tensor
output
,
at
::
Tensor
columns
,
int
kernel_h
,
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
group
,
const
int
deformable_group
,
const
bool
with_bias
)
{
TORCH_CHECK
(
input
.
is_contiguous
(),
"input tensor has to be contiguous"
);
TORCH_CHECK
(
weight
.
is_contiguous
(),
"weight tensor has to be contiguous"
);
const
int
batch
=
input
.
size
(
0
);
const
int
channels
=
input
.
size
(
1
);
const
int
height
=
input
.
size
(
2
);
const
int
width
=
input
.
size
(
3
);
const
int
channels_out
=
weight
.
size
(
0
);
const
int
channels_kernel
=
weight
.
size
(
1
);
const
int
kernel_h_
=
weight
.
size
(
2
);
const
int
kernel_w_
=
weight
.
size
(
3
);
if
(
kernel_h_
!=
kernel_h
||
kernel_w_
!=
kernel_w
)
AT_ERROR
(
"Input shape and kernel shape wont match: (%d x %d vs %d x %d)."
,
kernel_h_
,
kernel_w
,
kernel_h_
,
kernel_w_
);
if
(
channels
!=
channels_kernel
*
group
)
AT_ERROR
(
"Input shape and kernel channels wont match: (%d vs %d)."
,
channels
,
channels_kernel
*
group
);
const
int
height_out
=
(
height
+
2
*
pad_h
-
(
dilation_h
*
(
kernel_h
-
1
)
+
1
))
/
stride_h
+
1
;
const
int
width_out
=
(
width
+
2
*
pad_w
-
(
dilation_w
*
(
kernel_w
-
1
)
+
1
))
/
stride_w
+
1
;
if
(
ones
.
ndimension
()
!=
2
||
ones
.
size
(
0
)
*
ones
.
size
(
1
)
<
height_out
*
width_out
)
{
// Resize plane and fill with ones...
ones
=
at
::
ones
({
height_out
,
width_out
},
input
.
options
());
}
// resize output
output
=
output
.
view
({
batch
,
channels_out
,
height_out
,
width_out
}).
zero_
();
// resize temporary columns
columns
=
at
::
zeros
(
{
channels
*
kernel_h
*
kernel_w
,
1
*
height_out
*
width_out
},
input
.
options
());
output
=
output
.
view
({
output
.
size
(
0
),
group
,
output
.
size
(
1
)
/
group
,
output
.
size
(
2
),
output
.
size
(
3
)});
for
(
int
b
=
0
;
b
<
batch
;
b
++
)
{
modulated_deformable_im2col_cuda
(
input
[
b
],
offset
[
b
],
mask
[
b
],
1
,
channels
,
height
,
width
,
height_out
,
width_out
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
deformable_group
,
columns
);
// divide into group
weight
=
weight
.
view
({
group
,
weight
.
size
(
0
)
/
group
,
weight
.
size
(
1
),
weight
.
size
(
2
),
weight
.
size
(
3
)});
columns
=
columns
.
view
({
group
,
columns
.
size
(
0
)
/
group
,
columns
.
size
(
1
)});
for
(
int
g
=
0
;
g
<
group
;
g
++
)
{
output
[
b
][
g
]
=
output
[
b
][
g
]
.
flatten
(
1
)
.
addmm_
(
weight
[
g
].
flatten
(
1
),
columns
[
g
])
.
view_as
(
output
[
b
][
g
]);
}
weight
=
weight
.
view
({
weight
.
size
(
0
)
*
weight
.
size
(
1
),
weight
.
size
(
2
),
weight
.
size
(
3
),
weight
.
size
(
4
)});
columns
=
columns
.
view
({
columns
.
size
(
0
)
*
columns
.
size
(
1
),
columns
.
size
(
2
)});
}
output
=
output
.
view
({
output
.
size
(
0
),
output
.
size
(
1
)
*
output
.
size
(
2
),
output
.
size
(
3
),
output
.
size
(
4
)});
if
(
with_bias
)
{
output
+=
bias
.
view
({
1
,
bias
.
size
(
0
),
1
,
1
});
}
}
void
modulated_deform_conv_cuda_backward
(
at
::
Tensor
input
,
at
::
Tensor
weight
,
at
::
Tensor
bias
,
at
::
Tensor
ones
,
at
::
Tensor
offset
,
at
::
Tensor
mask
,
at
::
Tensor
columns
,
at
::
Tensor
grad_input
,
at
::
Tensor
grad_weight
,
at
::
Tensor
grad_bias
,
at
::
Tensor
grad_offset
,
at
::
Tensor
grad_mask
,
at
::
Tensor
grad_output
,
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
int
pad_h
,
int
pad_w
,
int
dilation_h
,
int
dilation_w
,
int
group
,
int
deformable_group
,
const
bool
with_bias
)
{
TORCH_CHECK
(
input
.
is_contiguous
(),
"input tensor has to be contiguous"
);
TORCH_CHECK
(
weight
.
is_contiguous
(),
"weight tensor has to be contiguous"
);
const
int
batch
=
input
.
size
(
0
);
const
int
channels
=
input
.
size
(
1
);
const
int
height
=
input
.
size
(
2
);
const
int
width
=
input
.
size
(
3
);
const
int
channels_kernel
=
weight
.
size
(
1
);
const
int
kernel_h_
=
weight
.
size
(
2
);
const
int
kernel_w_
=
weight
.
size
(
3
);
if
(
kernel_h_
!=
kernel_h
||
kernel_w_
!=
kernel_w
)
AT_ERROR
(
"Input shape and kernel shape wont match: (%d x %d vs %d x %d)."
,
kernel_h_
,
kernel_w
,
kernel_h_
,
kernel_w_
);
if
(
channels
!=
channels_kernel
*
group
)
AT_ERROR
(
"Input shape and kernel channels wont match: (%d vs %d)."
,
channels
,
channels_kernel
*
group
);
const
int
height_out
=
(
height
+
2
*
pad_h
-
(
dilation_h
*
(
kernel_h
-
1
)
+
1
))
/
stride_h
+
1
;
const
int
width_out
=
(
width
+
2
*
pad_w
-
(
dilation_w
*
(
kernel_w
-
1
)
+
1
))
/
stride_w
+
1
;
if
(
ones
.
ndimension
()
!=
2
||
ones
.
size
(
0
)
*
ones
.
size
(
1
)
<
height_out
*
width_out
)
{
// Resize plane and fill with ones...
ones
=
at
::
ones
({
height_out
,
width_out
},
input
.
options
());
}
grad_input
=
grad_input
.
view
({
batch
,
channels
,
height
,
width
});
columns
=
at
::
zeros
(
{
channels
*
kernel_h
*
kernel_w
,
height_out
*
width_out
},
input
.
options
());
grad_output
=
grad_output
.
view
({
grad_output
.
size
(
0
),
group
,
grad_output
.
size
(
1
)
/
group
,
grad_output
.
size
(
2
),
grad_output
.
size
(
3
)});
for
(
int
b
=
0
;
b
<
batch
;
b
++
)
{
// divide int group
columns
=
columns
.
view
({
group
,
columns
.
size
(
0
)
/
group
,
columns
.
size
(
1
)});
weight
=
weight
.
view
({
group
,
weight
.
size
(
0
)
/
group
,
weight
.
size
(
1
),
weight
.
size
(
2
),
weight
.
size
(
3
)});
for
(
int
g
=
0
;
g
<
group
;
g
++
)
{
columns
[
g
].
addmm_
(
weight
[
g
].
flatten
(
1
).
transpose
(
0
,
1
),
grad_output
[
b
][
g
].
flatten
(
1
),
0.0
f
,
1.0
f
);
}
columns
=
columns
.
view
({
columns
.
size
(
0
)
*
columns
.
size
(
1
),
columns
.
size
(
2
)});
weight
=
weight
.
view
({
weight
.
size
(
0
)
*
weight
.
size
(
1
),
weight
.
size
(
2
),
weight
.
size
(
3
),
weight
.
size
(
4
)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda
(
columns
,
input
[
b
],
offset
[
b
],
mask
[
b
],
1
,
channels
,
height
,
width
,
height_out
,
width_out
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
deformable_group
,
grad_offset
[
b
],
grad_mask
[
b
]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda
(
columns
,
offset
[
b
],
mask
[
b
],
1
,
channels
,
height
,
width
,
height_out
,
width_out
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
deformable_group
,
grad_input
[
b
]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda
(
input
[
b
],
offset
[
b
],
mask
[
b
],
1
,
channels
,
height
,
width
,
height_out
,
width_out
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
deformable_group
,
columns
);
columns
=
columns
.
view
({
group
,
columns
.
size
(
0
)
/
group
,
columns
.
size
(
1
)});
grad_weight
=
grad_weight
.
view
({
group
,
grad_weight
.
size
(
0
)
/
group
,
grad_weight
.
size
(
1
),
grad_weight
.
size
(
2
),
grad_weight
.
size
(
3
)});
if
(
with_bias
)
grad_bias
=
grad_bias
.
view
({
group
,
grad_bias
.
size
(
0
)
/
group
});
for
(
int
g
=
0
;
g
<
group
;
g
++
)
{
grad_weight
[
g
]
=
grad_weight
[
g
]
.
flatten
(
1
)
.
addmm_
(
grad_output
[
b
][
g
].
flatten
(
1
),
columns
[
g
].
transpose
(
0
,
1
))
.
view_as
(
grad_weight
[
g
]);
if
(
with_bias
)
{
grad_bias
[
g
]
=
grad_bias
[
g
]
.
view
({
-
1
,
1
})
.
addmm_
(
grad_output
[
b
][
g
].
flatten
(
1
),
ones
.
view
({
-
1
,
1
}))
.
view
(
-
1
);
}
}
columns
=
columns
.
view
({
columns
.
size
(
0
)
*
columns
.
size
(
1
),
columns
.
size
(
2
)});
grad_weight
=
grad_weight
.
view
({
grad_weight
.
size
(
0
)
*
grad_weight
.
size
(
1
),
grad_weight
.
size
(
2
),
grad_weight
.
size
(
3
),
grad_weight
.
size
(
4
)});
if
(
with_bias
)
grad_bias
=
grad_bias
.
view
({
grad_bias
.
size
(
0
)
*
grad_bias
.
size
(
1
)});
}
grad_output
=
grad_output
.
view
({
grad_output
.
size
(
0
)
*
grad_output
.
size
(
1
),
grad_output
.
size
(
2
),
grad_output
.
size
(
3
),
grad_output
.
size
(
4
)});
}
}
// namespace detectron2
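The forward and backward passes above use the im2col-plus-GEMM formulation: deformable_im2col and modulated_deformable_im2col_cuda gather bilinearly sampled, offset-shifted input values into a columns buffer of shape (channels * kH * kW, output positions), and the convolution itself reduces to the per-group matrix product applied with addmm_. The sketch below shows only that final GEMM step in libtorch, detached from the deformable sampling; shapes are illustrative.

// Sketch: the GEMM half of im2col-based convolution, as used with addmm_ above.
#include <torch/torch.h>

torch::Tensor im2col_gemm_demo() {
  int C_out = 8, C_in = 4, kH = 3, kW = 3, H_out = 10, W_out = 10;
  auto weight = torch::rand({C_out, C_in, kH, kW});
  auto columns = torch::rand({C_in * kH * kW, H_out * W_out}); // produced by im2col
  // (C_out, C_in*kH*kW) x (C_in*kH*kW, H_out*W_out) -> (C_out, H_out*W_out)
  auto out = weight.flatten(1).mm(columns);
  return out.view({C_out, H_out, W_out});
}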
detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
// Original license: Apache 2.0
// clang-format off
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer *****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer *********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
#include <ATen/ATen.h>
#include <c10/cuda/CUDAGuard.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <THC/THCAtomics.cuh>
using namespace at;

#define CUDA_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

namespace {

const int CUDA_NUM_THREADS = 1024;
const int kMaxGridNum = 65535;

inline int GET_BLOCKS(const int N) {
  return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}

} // namespace
template <typename scalar_t>
__device__ scalar_t deformable_im2col_bilinear(
    const scalar_t* bottom_data,
    const int data_width,
    const int height,
    const int width,
    scalar_t h,
    scalar_t w) {
  int h_low = floor(h);
  int w_low = floor(w);
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  scalar_t lh = h - h_low;
  scalar_t lw = w - w_low;
  scalar_t hh = 1 - lh, hw = 1 - lw;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
    v1 = bottom_data[h_low * data_width + w_low];
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
    v2 = bottom_data[h_low * data_width + w_high];
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
    v3 = bottom_data[h_high * data_width + w_low];
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
    v4 = bottom_data[h_high * data_width + w_high];

  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}
template
<
typename
scalar_t
>
__device__
scalar_t
get_gradient_weight
(
scalar_t
argmax_h
,
scalar_t
argmax_w
,
const
int
h
,
const
int
w
,
const
int
height
,
const
int
width
)
{
if
(
argmax_h
<=
-
1
||
argmax_h
>=
height
||
argmax_w
<=
-
1
||
argmax_w
>=
width
)
{
// empty
return
0
;
}
int
argmax_h_low
=
floor
(
argmax_h
);
int
argmax_w_low
=
floor
(
argmax_w
);
int
argmax_h_high
=
argmax_h_low
+
1
;
int
argmax_w_high
=
argmax_w_low
+
1
;
scalar_t
weight
=
0
;
if
(
h
==
argmax_h_low
&&
w
==
argmax_w_low
)
weight
=
(
h
+
1
-
argmax_h
)
*
(
w
+
1
-
argmax_w
);
if
(
h
==
argmax_h_low
&&
w
==
argmax_w_high
)
weight
=
(
h
+
1
-
argmax_h
)
*
(
argmax_w
+
1
-
w
);
if
(
h
==
argmax_h_high
&&
w
==
argmax_w_low
)
weight
=
(
argmax_h
+
1
-
h
)
*
(
w
+
1
-
argmax_w
);
if
(
h
==
argmax_h_high
&&
w
==
argmax_w_high
)
weight
=
(
argmax_h
+
1
-
h
)
*
(
argmax_w
+
1
-
w
);
return
weight
;
}
template
<
typename
scalar_t
>
__device__
scalar_t
get_coordinate_weight
(
scalar_t
argmax_h
,
scalar_t
argmax_w
,
const
int
height
,
const
int
width
,
const
scalar_t
*
im_data
,
const
int
data_width
,
const
int
bp_dir
)
{
if
(
argmax_h
<=
-
1
||
argmax_h
>=
height
||
argmax_w
<=
-
1
||
argmax_w
>=
width
)
{
// empty
return
0
;
}
int
argmax_h_low
=
floor
(
argmax_h
);
int
argmax_w_low
=
floor
(
argmax_w
);
int
argmax_h_high
=
argmax_h_low
+
1
;
int
argmax_w_high
=
argmax_w_low
+
1
;
scalar_t
weight
=
0
;
if
(
bp_dir
==
0
)
{
if
(
argmax_h_low
>=
0
&&
argmax_w_low
>=
0
)
weight
+=
-
1
*
(
argmax_w_low
+
1
-
argmax_w
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_low
];
if
(
argmax_h_low
>=
0
&&
argmax_w_high
<=
width
-
1
)
weight
+=
-
1
*
(
argmax_w
-
argmax_w_low
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_high
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_low
>=
0
)
weight
+=
(
argmax_w_low
+
1
-
argmax_w
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_low
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_high
<=
width
-
1
)
weight
+=
(
argmax_w
-
argmax_w_low
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_high
];
}
else
if
(
bp_dir
==
1
)
{
if
(
argmax_h_low
>=
0
&&
argmax_w_low
>=
0
)
weight
+=
-
1
*
(
argmax_h_low
+
1
-
argmax_h
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_low
];
if
(
argmax_h_low
>=
0
&&
argmax_w_high
<=
width
-
1
)
weight
+=
(
argmax_h_low
+
1
-
argmax_h
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_high
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_low
>=
0
)
weight
+=
-
1
*
(
argmax_h
-
argmax_h_low
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_low
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_high
<=
width
-
1
)
weight
+=
(
argmax_h
-
argmax_h_low
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_high
];
}
return
weight
;
}
template
<
typename
scalar_t
>
__global__
void
deformable_im2col_gpu_kernel
(
const
int
n
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
num_channels
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
scalar_t
*
data_col
)
{
CUDA_KERNEL_LOOP
(
index
,
n
)
{
// index index of output matrix
const
int
w_col
=
index
%
width_col
;
const
int
h_col
=
(
index
/
width_col
)
%
height_col
;
const
int
b_col
=
(
index
/
width_col
/
height_col
)
%
batch_size
;
const
int
c_im
=
(
index
/
width_col
/
height_col
)
/
batch_size
;
const
int
c_col
=
c_im
*
kernel_h
*
kernel_w
;
// compute deformable group index
const
int
deformable_group_index
=
c_im
/
channel_per_deformable_group
;
const
int
h_in
=
h_col
*
stride_h
-
pad_h
;
const
int
w_in
=
w_col
*
stride_w
-
pad_w
;
scalar_t
*
data_col_ptr
=
data_col
+
((
c_col
*
batch_size
+
b_col
)
*
height_col
+
h_col
)
*
width_col
+
w_col
;
// const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
// height + h_in) * width + w_in;
const
scalar_t
*
data_im_ptr
=
data_im
+
(
b_col
*
num_channels
+
c_im
)
*
height
*
width
;
const
scalar_t
*
data_offset_ptr
=
data_offset
+
(
b_col
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
for
(
int
i
=
0
;
i
<
kernel_h
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_w
;
++
j
)
{
const
int
data_offset_h_ptr
=
((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
int
data_offset_w_ptr
=
((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
scalar_t
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
scalar_t
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
scalar_t
val
=
static_cast
<
scalar_t
>
(
0
);
const
scalar_t
h_im
=
h_in
+
i
*
dilation_h
+
offset_h
;
const
scalar_t
w_im
=
w_in
+
j
*
dilation_w
+
offset_w
;
if
(
h_im
>
-
1
&&
w_im
>
-
1
&&
h_im
<
height
&&
w_im
<
width
)
{
// const scalar_t map_h = i * dilation_h + offset_h;
// const scalar_t map_w = j * dilation_w + offset_w;
// const int cur_height = height - h_in;
// const int cur_width = width - w_in;
// val = deformable_im2col_bilinear(data_im_ptr, width, cur_height,
// cur_width, map_h, map_w);
val
=
deformable_im2col_bilinear
(
data_im_ptr
,
width
,
height
,
width
,
h_im
,
w_im
);
}
*
data_col_ptr
=
val
;
data_col_ptr
+=
batch_size
*
height_col
*
width_col
;
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
deformable_col2im_gpu_kernel
(
const
int
n
,
const
scalar_t
*
data_col
,
const
scalar_t
*
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
scalar_t
*
grad_im
)
{
CUDA_KERNEL_LOOP
(
index
,
n
)
{
const
int
j
=
(
index
/
width_col
/
height_col
/
batch_size
)
%
kernel_w
;
const
int
i
=
(
index
/
width_col
/
height_col
/
batch_size
/
kernel_w
)
%
kernel_h
;
const
int
c
=
index
/
width_col
/
height_col
/
batch_size
/
kernel_w
/
kernel_h
;
// compute the start and end of the output
const
int
deformable_group_index
=
c
/
channel_per_deformable_group
;
int
w_out
=
index
%
width_col
;
int
h_out
=
(
index
/
width_col
)
%
height_col
;
int
b
=
(
index
/
width_col
/
height_col
)
%
batch_size
;
int
w_in
=
w_out
*
stride_w
-
pad_w
;
int
h_in
=
h_out
*
stride_h
-
pad_h
;
const
scalar_t
*
data_offset_ptr
=
data_offset
+
(
b
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
int
data_offset_h_ptr
=
((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_out
)
*
width_col
+
w_out
;
const
int
data_offset_w_ptr
=
((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_out
)
*
width_col
+
w_out
;
const
scalar_t
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
scalar_t
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
const
scalar_t
cur_inv_h_data
=
h_in
+
i
*
dilation_h
+
offset_h
;
const
scalar_t
cur_inv_w_data
=
w_in
+
j
*
dilation_w
+
offset_w
;
const
scalar_t
cur_top_grad
=
data_col
[
index
];
const
int
cur_h
=
(
int
)
cur_inv_h_data
;
const
int
cur_w
=
(
int
)
cur_inv_w_data
;
for
(
int
dy
=
-
2
;
dy
<=
2
;
dy
++
)
{
for
(
int
dx
=
-
2
;
dx
<=
2
;
dx
++
)
{
if
(
cur_h
+
dy
>=
0
&&
cur_h
+
dy
<
height
&&
cur_w
+
dx
>=
0
&&
cur_w
+
dx
<
width
&&
abs
(
cur_inv_h_data
-
(
cur_h
+
dy
))
<
1
&&
abs
(
cur_inv_w_data
-
(
cur_w
+
dx
))
<
1
)
{
int
cur_bottom_grad_pos
=
((
b
*
channels
+
c
)
*
height
+
cur_h
+
dy
)
*
width
+
cur_w
+
dx
;
scalar_t
weight
=
get_gradient_weight
(
cur_inv_h_data
,
cur_inv_w_data
,
cur_h
+
dy
,
cur_w
+
dx
,
height
,
width
);
atomicAdd
(
grad_im
+
cur_bottom_grad_pos
,
weight
*
cur_top_grad
);
}
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
deformable_col2im_coord_gpu_kernel
(
const
int
n
,
const
scalar_t
*
data_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
offset_channels
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
scalar_t
*
grad_offset
)
{
CUDA_KERNEL_LOOP
(
index
,
n
)
{
scalar_t
val
=
0
;
int
w
=
index
%
width_col
;
int
h
=
(
index
/
width_col
)
%
height_col
;
int
c
=
(
index
/
width_col
/
height_col
)
%
offset_channels
;
int
b
=
(
index
/
width_col
/
height_col
)
/
offset_channels
;
// compute the start and end of the output
const
int
deformable_group_index
=
c
/
(
2
*
kernel_h
*
kernel_w
);
const
int
col_step
=
kernel_h
*
kernel_w
;
int
cnt
=
0
;
const
scalar_t
*
data_col_ptr
=
data_col
+
deformable_group_index
*
channel_per_deformable_group
*
batch_size
*
width_col
*
height_col
;
const
scalar_t
*
data_im_ptr
=
data_im
+
(
b
*
deformable_group
+
deformable_group_index
)
*
channel_per_deformable_group
/
kernel_h
/
kernel_w
*
height
*
width
;
const
scalar_t
*
data_offset_ptr
=
data_offset
+
(
b
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
int
offset_c
=
c
-
deformable_group_index
*
2
*
kernel_h
*
kernel_w
;
for
(
int
col_c
=
(
offset_c
/
2
);
col_c
<
channel_per_deformable_group
;
col_c
+=
col_step
)
{
const
int
col_pos
=
(((
col_c
*
batch_size
+
b
)
*
height_col
)
+
h
)
*
width_col
+
w
;
const
int
bp_dir
=
offset_c
%
2
;
int
j
=
(
col_pos
/
width_col
/
height_col
/
batch_size
)
%
kernel_w
;
int
i
=
(
col_pos
/
width_col
/
height_col
/
batch_size
/
kernel_w
)
%
kernel_h
;
int
w_out
=
col_pos
%
width_col
;
int
h_out
=
(
col_pos
/
width_col
)
%
height_col
;
int
w_in
=
w_out
*
stride_w
-
pad_w
;
int
h_in
=
h_out
*
stride_h
-
pad_h
;
const
int
data_offset_h_ptr
=
(((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_out
)
*
width_col
+
w_out
);
const
int
data_offset_w_ptr
=
(((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_out
)
*
width_col
+
w_out
);
const
scalar_t
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
scalar_t
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
scalar_t
inv_h
=
h_in
+
i
*
dilation_h
+
offset_h
;
scalar_t
inv_w
=
w_in
+
j
*
dilation_w
+
offset_w
;
if
(
inv_h
<=
-
1
||
inv_w
<=
-
1
||
inv_h
>=
height
||
inv_w
>=
width
)
{
inv_h
=
inv_w
=
-
2
;
}
const
scalar_t
weight
=
get_coordinate_weight
(
inv_h
,
inv_w
,
height
,
width
,
data_im_ptr
+
cnt
*
height
*
width
,
width
,
bp_dir
);
val
+=
weight
*
data_col_ptr
[
col_pos
];
cnt
+=
1
;
}
grad_offset
[
index
]
=
val
;
}
}
namespace
detectron2
{
void
deformable_im2col
(
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
ksize_h
,
const
int
ksize_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
parallel_imgs
,
const
int
deformable_group
,
at
::
Tensor
data_col
)
{
// num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in
int
height_col
=
(
height
+
2
*
pad_h
-
(
dilation_h
*
(
ksize_h
-
1
)
+
1
))
/
stride_h
+
1
;
int
width_col
=
(
width
+
2
*
pad_w
-
(
dilation_w
*
(
ksize_w
-
1
)
+
1
))
/
stride_w
+
1
;
int
num_kernels
=
channels
*
height_col
*
width_col
*
parallel_imgs
;
int
channel_per_deformable_group
=
channels
/
deformable_group
;
at
::
cuda
::
CUDAGuard
device_guard
(
data_im
.
device
());
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
data_im
.
scalar_type
(),
"deformable_im2col_gpu"
,
([
&
]
{
const
scalar_t
*
data_im_
=
data_im
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_offset_
=
data_offset
.
data_ptr
<
scalar_t
>
();
scalar_t
*
data_col_
=
data_col
.
data_ptr
<
scalar_t
>
();
deformable_im2col_gpu_kernel
<<<
GET_BLOCKS
(
num_kernels
),
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
num_kernels
,
data_im_
,
data_offset_
,
height
,
width
,
ksize_h
,
ksize_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channel_per_deformable_group
,
parallel_imgs
,
channels
,
deformable_group
,
height_col
,
width_col
,
data_col_
);
}));
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in deformable_im2col: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
void
deformable_col2im
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
ksize_h
,
const
int
ksize_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
parallel_imgs
,
const
int
deformable_group
,
at
::
Tensor
grad_im
)
{
// todo: make sure parallel_imgs is passed in correctly
int
height_col
=
(
height
+
2
*
pad_h
-
(
dilation_h
*
(
ksize_h
-
1
)
+
1
))
/
stride_h
+
1
;
int
width_col
=
(
width
+
2
*
pad_w
-
(
dilation_w
*
(
ksize_w
-
1
)
+
1
))
/
stride_w
+
1
;
int
num_kernels
=
channels
*
ksize_h
*
ksize_w
*
height_col
*
width_col
*
parallel_imgs
;
int
channel_per_deformable_group
=
channels
/
deformable_group
;
at
::
cuda
::
CUDAGuard
device_guard
(
data_col
.
device
());
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
data_col
.
scalar_type
(),
"deformable_col2im_gpu"
,
([
&
]
{
const
scalar_t
*
data_col_
=
data_col
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_offset_
=
data_offset
.
data_ptr
<
scalar_t
>
();
scalar_t
*
grad_im_
=
grad_im
.
data_ptr
<
scalar_t
>
();
deformable_col2im_gpu_kernel
<<<
GET_BLOCKS
(
num_kernels
),
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
num_kernels
,
data_col_
,
data_offset_
,
channels
,
height
,
width
,
ksize_h
,
ksize_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channel_per_deformable_group
,
parallel_imgs
,
deformable_group
,
height_col
,
width_col
,
grad_im_
);
}));
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in deformable_col2im: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
void
deformable_col2im_coord
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
ksize_h
,
const
int
ksize_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
parallel_imgs
,
const
int
deformable_group
,
at
::
Tensor
grad_offset
)
{
int
height_col
=
(
height
+
2
*
pad_h
-
(
dilation_h
*
(
ksize_h
-
1
)
+
1
))
/
stride_h
+
1
;
int
width_col
=
(
width
+
2
*
pad_w
-
(
dilation_w
*
(
ksize_w
-
1
)
+
1
))
/
stride_w
+
1
;
int
num_kernels
=
height_col
*
width_col
*
2
*
ksize_h
*
ksize_w
*
deformable_group
*
parallel_imgs
;
int
channel_per_deformable_group
=
channels
*
ksize_h
*
ksize_w
/
deformable_group
;
at
::
cuda
::
CUDAGuard
device_guard
(
data_col
.
device
());
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
data_col
.
scalar_type
(),
"deformable_col2im_coord_gpu"
,
([
&
]
{
const
scalar_t
*
data_col_
=
data_col
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_im_
=
data_im
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_offset_
=
data_offset
.
data_ptr
<
scalar_t
>
();
scalar_t
*
grad_offset_
=
grad_offset
.
data_ptr
<
scalar_t
>
();
deformable_col2im_coord_gpu_kernel
<<<
GET_BLOCKS
(
num_kernels
),
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
num_kernels
,
data_col_
,
data_im_
,
data_offset_
,
channels
,
height
,
width
,
ksize_h
,
ksize_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channel_per_deformable_group
,
parallel_imgs
,
2
*
ksize_h
*
ksize_w
*
deformable_group
,
deformable_group
,
height_col
,
width_col
,
grad_offset_
);
}));
}
}
// namespace detectron2
template
<
typename
scalar_t
>
__device__
scalar_t
dmcn_im2col_bilinear
(
const
scalar_t
*
bottom_data
,
const
int
data_width
,
const
int
height
,
const
int
width
,
scalar_t
h
,
scalar_t
w
)
{
int
h_low
=
floor
(
h
);
int
w_low
=
floor
(
w
);
int
h_high
=
h_low
+
1
;
int
w_high
=
w_low
+
1
;
scalar_t
lh
=
h
-
h_low
;
scalar_t
lw
=
w
-
w_low
;
scalar_t
hh
=
1
-
lh
,
hw
=
1
-
lw
;
scalar_t
v1
=
0
;
if
(
h_low
>=
0
&&
w_low
>=
0
)
v1
=
bottom_data
[
h_low
*
data_width
+
w_low
];
scalar_t
v2
=
0
;
if
(
h_low
>=
0
&&
w_high
<=
width
-
1
)
v2
=
bottom_data
[
h_low
*
data_width
+
w_high
];
scalar_t
v3
=
0
;
if
(
h_high
<=
height
-
1
&&
w_low
>=
0
)
v3
=
bottom_data
[
h_high
*
data_width
+
w_low
];
scalar_t
v4
=
0
;
if
(
h_high
<=
height
-
1
&&
w_high
<=
width
-
1
)
v4
=
bottom_data
[
h_high
*
data_width
+
w_high
];
scalar_t
w1
=
hh
*
hw
,
w2
=
hh
*
lw
,
w3
=
lh
*
hw
,
w4
=
lh
*
lw
;
scalar_t
val
=
(
w1
*
v1
+
w2
*
v2
+
w3
*
v3
+
w4
*
v4
);
return
val
;
}
template
<
typename
scalar_t
>
__device__
scalar_t
dmcn_get_gradient_weight
(
scalar_t
argmax_h
,
scalar_t
argmax_w
,
const
int
h
,
const
int
w
,
const
int
height
,
const
int
width
)
{
if
(
argmax_h
<=
-
1
||
argmax_h
>=
height
||
argmax_w
<=
-
1
||
argmax_w
>=
width
)
{
// empty
return
0
;
}
int
argmax_h_low
=
floor
(
argmax_h
);
int
argmax_w_low
=
floor
(
argmax_w
);
int
argmax_h_high
=
argmax_h_low
+
1
;
int
argmax_w_high
=
argmax_w_low
+
1
;
scalar_t
weight
=
0
;
if
(
h
==
argmax_h_low
&&
w
==
argmax_w_low
)
weight
=
(
h
+
1
-
argmax_h
)
*
(
w
+
1
-
argmax_w
);
if
(
h
==
argmax_h_low
&&
w
==
argmax_w_high
)
weight
=
(
h
+
1
-
argmax_h
)
*
(
argmax_w
+
1
-
w
);
if
(
h
==
argmax_h_high
&&
w
==
argmax_w_low
)
weight
=
(
argmax_h
+
1
-
h
)
*
(
w
+
1
-
argmax_w
);
if
(
h
==
argmax_h_high
&&
w
==
argmax_w_high
)
weight
=
(
argmax_h
+
1
-
h
)
*
(
argmax_w
+
1
-
w
);
return
weight
;
}
template
<
typename
scalar_t
>
__device__
scalar_t
dmcn_get_coordinate_weight
(
scalar_t
argmax_h
,
scalar_t
argmax_w
,
const
int
height
,
const
int
width
,
const
scalar_t
*
im_data
,
const
int
data_width
,
const
int
bp_dir
)
{
if
(
argmax_h
<=
-
1
||
argmax_h
>=
height
||
argmax_w
<=
-
1
||
argmax_w
>=
width
)
{
// empty
return
0
;
}
int
argmax_h_low
=
floor
(
argmax_h
);
int
argmax_w_low
=
floor
(
argmax_w
);
int
argmax_h_high
=
argmax_h_low
+
1
;
int
argmax_w_high
=
argmax_w_low
+
1
;
scalar_t
weight
=
0
;
if
(
bp_dir
==
0
)
{
if
(
argmax_h_low
>=
0
&&
argmax_w_low
>=
0
)
weight
+=
-
1
*
(
argmax_w_low
+
1
-
argmax_w
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_low
];
if
(
argmax_h_low
>=
0
&&
argmax_w_high
<=
width
-
1
)
weight
+=
-
1
*
(
argmax_w
-
argmax_w_low
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_high
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_low
>=
0
)
weight
+=
(
argmax_w_low
+
1
-
argmax_w
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_low
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_high
<=
width
-
1
)
weight
+=
(
argmax_w
-
argmax_w_low
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_high
];
}
else
if
(
bp_dir
==
1
)
{
if
(
argmax_h_low
>=
0
&&
argmax_w_low
>=
0
)
weight
+=
-
1
*
(
argmax_h_low
+
1
-
argmax_h
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_low
];
if
(
argmax_h_low
>=
0
&&
argmax_w_high
<=
width
-
1
)
weight
+=
(
argmax_h_low
+
1
-
argmax_h
)
*
im_data
[
argmax_h_low
*
data_width
+
argmax_w_high
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_low
>=
0
)
weight
+=
-
1
*
(
argmax_h
-
argmax_h_low
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_low
];
if
(
argmax_h_high
<=
height
-
1
&&
argmax_w_high
<=
width
-
1
)
weight
+=
(
argmax_h
-
argmax_h_low
)
*
im_data
[
argmax_h_high
*
data_width
+
argmax_w_high
];
}
return
weight
;
}
template
<
typename
scalar_t
>
__global__
void
modulated_deformable_im2col_gpu_kernel
(
const
int
n
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
num_channels
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
scalar_t
*
data_col
)
{
CUDA_KERNEL_LOOP
(
index
,
n
)
{
// index index of output matrix
const
int
w_col
=
index
%
width_col
;
const
int
h_col
=
(
index
/
width_col
)
%
height_col
;
const
int
b_col
=
(
index
/
width_col
/
height_col
)
%
batch_size
;
const
int
c_im
=
(
index
/
width_col
/
height_col
)
/
batch_size
;
const
int
c_col
=
c_im
*
kernel_h
*
kernel_w
;
// compute deformable group index
const
int
deformable_group_index
=
c_im
/
channel_per_deformable_group
;
const
int
h_in
=
h_col
*
stride_h
-
pad_h
;
const
int
w_in
=
w_col
*
stride_w
-
pad_w
;
scalar_t
*
data_col_ptr
=
data_col
+
((
c_col
*
batch_size
+
b_col
)
*
height_col
+
h_col
)
*
width_col
+
w_col
;
// const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
// height + h_in) * width + w_in;
const
scalar_t
*
data_im_ptr
=
data_im
+
(
b_col
*
num_channels
+
c_im
)
*
height
*
width
;
const
scalar_t
*
data_offset_ptr
=
data_offset
+
(
b_col
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
scalar_t
*
data_mask_ptr
=
data_mask
+
(
b_col
*
deformable_group
+
deformable_group_index
)
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
for
(
int
i
=
0
;
i
<
kernel_h
;
++
i
)
{
for
(
int
j
=
0
;
j
<
kernel_w
;
++
j
)
{
const
int
data_offset_h_ptr
=
((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
int
data_offset_w_ptr
=
((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
int
data_mask_hw_ptr
=
((
i
*
kernel_w
+
j
)
*
height_col
+
h_col
)
*
width_col
+
w_col
;
const
scalar_t
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
scalar_t
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
const
scalar_t
mask
=
data_mask_ptr
[
data_mask_hw_ptr
];
scalar_t
val
=
static_cast
<
scalar_t
>
(
0
);
const
scalar_t
h_im
=
h_in
+
i
*
dilation_h
+
offset_h
;
const
scalar_t
w_im
=
w_in
+
j
*
dilation_w
+
offset_w
;
// if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
if
(
h_im
>
-
1
&&
w_im
>
-
1
&&
h_im
<
height
&&
w_im
<
width
)
{
// const float map_h = i * dilation_h + offset_h;
// const float map_w = j * dilation_w + offset_w;
// const int cur_height = height - h_in;
// const int cur_width = width - w_in;
// val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height,
// cur_width, map_h, map_w);
val
=
dmcn_im2col_bilinear
(
data_im_ptr
,
width
,
height
,
width
,
h_im
,
w_im
);
}
*
data_col_ptr
=
val
*
mask
;
data_col_ptr
+=
batch_size
*
height_col
*
width_col
;
// data_col_ptr += height_col * width_col;
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
modulated_deformable_col2im_gpu_kernel
(
const
int
n
,
const
scalar_t
*
data_col
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
scalar_t
*
grad_im
)
{
CUDA_KERNEL_LOOP
(
index
,
n
)
{
const
int
j
=
(
index
/
width_col
/
height_col
/
batch_size
)
%
kernel_w
;
const
int
i
=
(
index
/
width_col
/
height_col
/
batch_size
/
kernel_w
)
%
kernel_h
;
const
int
c
=
index
/
width_col
/
height_col
/
batch_size
/
kernel_w
/
kernel_h
;
// compute the start and end of the output
const
int
deformable_group_index
=
c
/
channel_per_deformable_group
;
int
w_out
=
index
%
width_col
;
int
h_out
=
(
index
/
width_col
)
%
height_col
;
int
b
=
(
index
/
width_col
/
height_col
)
%
batch_size
;
int
w_in
=
w_out
*
stride_w
-
pad_w
;
int
h_in
=
h_out
*
stride_h
-
pad_h
;
const
scalar_t
*
data_offset_ptr
=
data_offset
+
(
b
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
scalar_t
*
data_mask_ptr
=
data_mask
+
(
b
*
deformable_group
+
deformable_group_index
)
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
int
data_offset_h_ptr
=
((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_out
)
*
width_col
+
w_out
;
const
int
data_offset_w_ptr
=
((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_out
)
*
width_col
+
w_out
;
const
int
data_mask_hw_ptr
=
((
i
*
kernel_w
+
j
)
*
height_col
+
h_out
)
*
width_col
+
w_out
;
const
scalar_t
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
scalar_t
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
const
scalar_t
mask
=
data_mask_ptr
[
data_mask_hw_ptr
];
const
scalar_t
cur_inv_h_data
=
h_in
+
i
*
dilation_h
+
offset_h
;
const
scalar_t
cur_inv_w_data
=
w_in
+
j
*
dilation_w
+
offset_w
;
const
scalar_t
cur_top_grad
=
data_col
[
index
]
*
mask
;
const
int
cur_h
=
(
int
)
cur_inv_h_data
;
const
int
cur_w
=
(
int
)
cur_inv_w_data
;
for
(
int
dy
=
-
2
;
dy
<=
2
;
dy
++
)
{
for
(
int
dx
=
-
2
;
dx
<=
2
;
dx
++
)
{
if
(
cur_h
+
dy
>=
0
&&
cur_h
+
dy
<
height
&&
cur_w
+
dx
>=
0
&&
cur_w
+
dx
<
width
&&
abs
(
cur_inv_h_data
-
(
cur_h
+
dy
))
<
1
&&
abs
(
cur_inv_w_data
-
(
cur_w
+
dx
))
<
1
)
{
int
cur_bottom_grad_pos
=
((
b
*
channels
+
c
)
*
height
+
cur_h
+
dy
)
*
width
+
cur_w
+
dx
;
scalar_t
weight
=
dmcn_get_gradient_weight
(
cur_inv_h_data
,
cur_inv_w_data
,
cur_h
+
dy
,
cur_w
+
dx
,
height
,
width
);
atomicAdd
(
grad_im
+
cur_bottom_grad_pos
,
weight
*
cur_top_grad
);
}
}
}
}
}
template
<
typename
scalar_t
>
__global__
void
modulated_deformable_col2im_coord_gpu_kernel
(
const
int
n
,
const
scalar_t
*
data_col
,
const
scalar_t
*
data_im
,
const
scalar_t
*
data_offset
,
const
scalar_t
*
data_mask
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
channel_per_deformable_group
,
const
int
batch_size
,
const
int
offset_channels
,
const
int
deformable_group
,
const
int
height_col
,
const
int
width_col
,
scalar_t
*
grad_offset
,
scalar_t
*
grad_mask
)
{
CUDA_KERNEL_LOOP
(
index
,
n
)
{
scalar_t
val
=
0
,
mval
=
0
;
int
w
=
index
%
width_col
;
int
h
=
(
index
/
width_col
)
%
height_col
;
int
c
=
(
index
/
width_col
/
height_col
)
%
offset_channels
;
int
b
=
(
index
/
width_col
/
height_col
)
/
offset_channels
;
// compute the start and end of the output
const
int
deformable_group_index
=
c
/
(
2
*
kernel_h
*
kernel_w
);
const
int
col_step
=
kernel_h
*
kernel_w
;
int
cnt
=
0
;
const
scalar_t
*
data_col_ptr
=
data_col
+
deformable_group_index
*
channel_per_deformable_group
*
batch_size
*
width_col
*
height_col
;
const
scalar_t
*
data_im_ptr
=
data_im
+
(
b
*
deformable_group
+
deformable_group_index
)
*
channel_per_deformable_group
/
kernel_h
/
kernel_w
*
height
*
width
;
const
scalar_t
*
data_offset_ptr
=
data_offset
+
(
b
*
deformable_group
+
deformable_group_index
)
*
2
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
scalar_t
*
data_mask_ptr
=
data_mask
+
(
b
*
deformable_group
+
deformable_group_index
)
*
kernel_h
*
kernel_w
*
height_col
*
width_col
;
const
int
offset_c
=
c
-
deformable_group_index
*
2
*
kernel_h
*
kernel_w
;
for
(
int
col_c
=
(
offset_c
/
2
);
col_c
<
channel_per_deformable_group
;
col_c
+=
col_step
)
{
const
int
col_pos
=
(((
col_c
*
batch_size
+
b
)
*
height_col
)
+
h
)
*
width_col
+
w
;
const
int
bp_dir
=
offset_c
%
2
;
int
j
=
(
col_pos
/
width_col
/
height_col
/
batch_size
)
%
kernel_w
;
int
i
=
(
col_pos
/
width_col
/
height_col
/
batch_size
/
kernel_w
)
%
kernel_h
;
int
w_out
=
col_pos
%
width_col
;
int
h_out
=
(
col_pos
/
width_col
)
%
height_col
;
int
w_in
=
w_out
*
stride_w
-
pad_w
;
int
h_in
=
h_out
*
stride_h
-
pad_h
;
const
int
data_offset_h_ptr
=
(((
2
*
(
i
*
kernel_w
+
j
))
*
height_col
+
h_out
)
*
width_col
+
w_out
);
const
int
data_offset_w_ptr
=
(((
2
*
(
i
*
kernel_w
+
j
)
+
1
)
*
height_col
+
h_out
)
*
width_col
+
w_out
);
const
int
data_mask_hw_ptr
=
(((
i
*
kernel_w
+
j
)
*
height_col
+
h_out
)
*
width_col
+
w_out
);
const
scalar_t
offset_h
=
data_offset_ptr
[
data_offset_h_ptr
];
const
scalar_t
offset_w
=
data_offset_ptr
[
data_offset_w_ptr
];
const
scalar_t
mask
=
data_mask_ptr
[
data_mask_hw_ptr
];
scalar_t
inv_h
=
h_in
+
i
*
dilation_h
+
offset_h
;
scalar_t
inv_w
=
w_in
+
j
*
dilation_w
+
offset_w
;
if
(
inv_h
<=
-
1
||
inv_w
<=
-
1
||
inv_h
>=
height
||
inv_w
>=
width
)
{
inv_h
=
inv_w
=
-
2
;
}
else
{
mval
+=
data_col_ptr
[
col_pos
]
*
dmcn_im2col_bilinear
(
data_im_ptr
+
cnt
*
height
*
width
,
width
,
height
,
width
,
inv_h
,
inv_w
);
}
const
scalar_t
weight
=
dmcn_get_coordinate_weight
(
inv_h
,
inv_w
,
height
,
width
,
data_im_ptr
+
cnt
*
height
*
width
,
width
,
bp_dir
);
val
+=
weight
*
data_col_ptr
[
col_pos
]
*
mask
;
cnt
+=
1
;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset
[
index
]
=
val
;
if
(
offset_c
%
2
==
0
)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask
[(((
b
*
deformable_group
+
deformable_group_index
)
*
kernel_h
*
kernel_w
+
offset_c
/
2
)
*
height_col
+
h
)
*
width_col
+
w
]
=
mval
;
}
}
namespace
detectron2
{
void
modulated_deformable_im2col_cuda
(
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
at
::
Tensor
data_mask
,
const
int
batch_size
,
const
int
channels
,
const
int
height_im
,
const
int
width_im
,
const
int
height_col
,
const
int
width_col
,
const
int
kernel_h
,
const
int
kenerl_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
deformable_group
,
at
::
Tensor
data_col
)
{
// num_axes should be smaller than block size
const
int
channel_per_deformable_group
=
channels
/
deformable_group
;
const
int
num_kernels
=
channels
*
batch_size
*
height_col
*
width_col
;
at
::
cuda
::
CUDAGuard
device_guard
(
data_im
.
device
());
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
data_im
.
scalar_type
(),
"modulated_deformable_im2col_gpu"
,
([
&
]
{
const
scalar_t
*
data_im_
=
data_im
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_offset_
=
data_offset
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_mask_
=
data_mask
.
data_ptr
<
scalar_t
>
();
scalar_t
*
data_col_
=
data_col
.
data_ptr
<
scalar_t
>
();
modulated_deformable_im2col_gpu_kernel
<<<
GET_BLOCKS
(
num_kernels
),
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
num_kernels
,
data_im_
,
data_offset_
,
data_mask_
,
height_im
,
width_im
,
kernel_h
,
kenerl_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channel_per_deformable_group
,
batch_size
,
channels
,
deformable_group
,
height_col
,
width_col
,
data_col_
);
}));
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in modulated_deformable_im2col_cuda: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
void
modulated_deformable_col2im_cuda
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_offset
,
const
at
::
Tensor
data_mask
,
const
int
batch_size
,
const
int
channels
,
const
int
height_im
,
const
int
width_im
,
const
int
height_col
,
const
int
width_col
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
deformable_group
,
at
::
Tensor
grad_im
)
{
const
int
channel_per_deformable_group
=
channels
/
deformable_group
;
const
int
num_kernels
=
channels
*
kernel_h
*
kernel_w
*
batch_size
*
height_col
*
width_col
;
at
::
cuda
::
CUDAGuard
device_guard
(
data_col
.
device
());
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
data_col
.
scalar_type
(),
"modulated_deformable_col2im_gpu"
,
([
&
]
{
const
scalar_t
*
data_col_
=
data_col
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_offset_
=
data_offset
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_mask_
=
data_mask
.
data_ptr
<
scalar_t
>
();
scalar_t
*
grad_im_
=
grad_im
.
data_ptr
<
scalar_t
>
();
modulated_deformable_col2im_gpu_kernel
<<<
GET_BLOCKS
(
num_kernels
),
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
num_kernels
,
data_col_
,
data_offset_
,
data_mask_
,
channels
,
height_im
,
width_im
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channel_per_deformable_group
,
batch_size
,
deformable_group
,
height_col
,
width_col
,
grad_im_
);
}));
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in modulated_deformable_col2im_cuda: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
void
modulated_deformable_col2im_coord_cuda
(
const
at
::
Tensor
data_col
,
const
at
::
Tensor
data_im
,
const
at
::
Tensor
data_offset
,
const
at
::
Tensor
data_mask
,
const
int
batch_size
,
const
int
channels
,
const
int
height_im
,
const
int
width_im
,
const
int
height_col
,
const
int
width_col
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
dilation_h
,
const
int
dilation_w
,
const
int
deformable_group
,
at
::
Tensor
grad_offset
,
at
::
Tensor
grad_mask
)
{
const
int
num_kernels
=
batch_size
*
height_col
*
width_col
*
2
*
kernel_h
*
kernel_w
*
deformable_group
;
const
int
channel_per_deformable_group
=
channels
*
kernel_h
*
kernel_w
/
deformable_group
;
at
::
cuda
::
CUDAGuard
device_guard
(
data_col
.
device
());
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
AT_DISPATCH_FLOATING_TYPES_AND_HALF
(
data_col
.
scalar_type
(),
"modulated_deformable_col2im_coord_gpu"
,
([
&
]
{
const
scalar_t
*
data_col_
=
data_col
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_im_
=
data_im
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_offset_
=
data_offset
.
data_ptr
<
scalar_t
>
();
const
scalar_t
*
data_mask_
=
data_mask
.
data_ptr
<
scalar_t
>
();
scalar_t
*
grad_offset_
=
grad_offset
.
data_ptr
<
scalar_t
>
();
scalar_t
*
grad_mask_
=
grad_mask
.
data_ptr
<
scalar_t
>
();
modulated_deformable_col2im_coord_gpu_kernel
<<<
GET_BLOCKS
(
num_kernels
),
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
num_kernels
,
data_col_
,
data_im_
,
data_offset_
,
data_mask_
,
channels
,
height_im
,
width_im
,
kernel_h
,
kernel_w
,
pad_h
,
pad_w
,
stride_h
,
stride_w
,
dilation_h
,
dilation_w
,
channel_per_deformable_group
,
batch_size
,
2
*
kernel_h
*
kernel_w
*
deformable_group
,
deformable_group
,
height_col
,
width_col
,
grad_offset_
,
grad_mask_
);
}));
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
{
printf
(
"error in modulated_deformable_col2im_coord_cuda: %s
\n
"
,
cudaGetErrorString
(
err
));
}
}
}
// namespace detectron2
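All of the kernels in this file sample the input with the same bilinear rule: the fractional sampling location is split into its four integer neighbours and their values are blended, with out-of-range neighbours contributing zero. Below is an illustrative plain-Python sketch of that rule (not part of the commit); the helper name and the nested-list image representation are assumptions made for the example.

import math

def bilinear_sample(img, h, w):
    # img is indexed img[y][x]; (h, w) may be fractional.
    height, width = len(img), len(img[0])
    h_low, w_low = math.floor(h), math.floor(w)
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low
    hh, hw = 1 - lh, 1 - lw

    def at(y, x):
        # Out-of-range corners contribute zero, mirroring deformable_im2col_bilinear.
        return img[y][x] if 0 <= y < height and 0 <= x < width else 0.0

    return (hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_high)
            + lh * hw * at(h_high, w_low) + lh * lw * at(h_high, w_high))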
detectron2/layers/csrc/nms_rotated/nms_rotated.h
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>
namespace detectron2 {

at::Tensor nms_rotated_cpu(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const float iou_threshold);

#ifdef WITH_CUDA
at::Tensor nms_rotated_cuda(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const float iou_threshold);
#endif

// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
inline at::Tensor nms_rotated(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const float iou_threshold) {
  assert(dets.device().is_cuda() == scores.device().is_cuda());
  if (dets.device().is_cuda()) {
#ifdef WITH_CUDA
    return nms_rotated_cuda(
        dets.contiguous(), scores.contiguous(), iou_threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);
}

} // namespace detectron2
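From Python, this entry point is reached through the compiled extension that vision.cpp registers further below. An illustrative call (not part of the commit; the box and score values are made up) could look like the following, using the 5-value box layout (x_center, y_center, width, height, angle_degrees) described in nms_rotated_cuda.cu:

import torch
from detectron2 import _C  # the extension module built from these sources

boxes = torch.tensor([[50., 50., 20., 10., 0.],
                      [50., 50., 20., 10., 5.],
                      [90., 90., 10., 10., 45.]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep = _C.nms_rotated(boxes, scores, 0.5)  # indices of the boxes that survive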
detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "../box_iou_rotated/box_iou_rotated_utils.h"
#include "nms_rotated.h"
namespace detectron2 {

template <typename scalar_t>
at::Tensor nms_rotated_cpu_kernel(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const float iou_threshold) {
  // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
  // however, the code in this function is much shorter because
  // we delegate the IoU computation for rotated boxes to
  // the single_box_iou_rotated function in box_iou_rotated_utils.h
  AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor");
  AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor");
  AT_ASSERTM(
      dets.scalar_type() == scores.scalar_type(),
      "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong));
  }

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();

  int64_t num_to_keep = 0;

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1) {
      continue;
    }

    keep[num_to_keep++] = i;

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1) {
        continue;
      }

      auto ovr = single_box_iou_rotated<scalar_t>(
          dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
      if (ovr >= iou_threshold) {
        suppressed[j] = 1;
      }
    }
  }
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}

at::Tensor nms_rotated_cpu(
    // input must be contiguous
    const at::Tensor& dets,
    const at::Tensor& scores,
    const float iou_threshold) {
  auto result = at::empty({0}, dets.options());

  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
    result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
  });
  return result;
}

} // namespace detectron2
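The CPU kernel above is a plain greedy suppression over score-sorted boxes. The following plain-Python sketch (illustrative, not part of the commit) mirrors that loop; `iou_fn` stands in for single_box_iou_rotated and is assumed to be supplied by the caller.

def greedy_nms(boxes, scores, iou_threshold, iou_fn):
    # Visit boxes from highest to lowest score; each kept box suppresses any
    # later box whose IoU with it reaches the threshold.
    order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)
    suppressed = [False] * len(boxes)
    keep = []
    for idx, i in enumerate(order):
        if suppressed[i]:
            continue
        keep.append(i)
        for j in order[idx + 1:]:
            if not suppressed[j] and iou_fn(boxes[i], boxes[j]) >= iou_threshold:
                suppressed[j] = True
    return keep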
detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include "../box_iou_rotated/box_iou_rotated_utils.h"
using
namespace
detectron2
;
namespace
{
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
}
template
<
typename
T
>
__global__
void
nms_rotated_cuda_kernel
(
const
int
n_boxes
,
const
float
iou_threshold
,
const
T
*
dev_boxes
,
unsigned
long
long
*
dev_mask
)
{
// nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
// if (row_start > col_start) return;
const
int
row_size
=
min
(
n_boxes
-
row_start
*
threadsPerBlock
,
threadsPerBlock
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
threadsPerBlock
,
threadsPerBlock
);
// Compared to nms_cuda_kernel, where each box is represented with 4 values
// (x1, y1, x2, y2), each rotated box is represented with 5 values
// (x_center, y_center, width, height, angle_degrees) here.
__shared__
T
block_boxes
[
threadsPerBlock
*
5
];
if
(
threadIdx
.
x
<
col_size
)
{
block_boxes
[
threadIdx
.
x
*
5
+
0
]
=
dev_boxes
[(
threadsPerBlock
*
col_start
+
threadIdx
.
x
)
*
5
+
0
];
block_boxes
[
threadIdx
.
x
*
5
+
1
]
=
dev_boxes
[(
threadsPerBlock
*
col_start
+
threadIdx
.
x
)
*
5
+
1
];
block_boxes
[
threadIdx
.
x
*
5
+
2
]
=
dev_boxes
[(
threadsPerBlock
*
col_start
+
threadIdx
.
x
)
*
5
+
2
];
block_boxes
[
threadIdx
.
x
*
5
+
3
]
=
dev_boxes
[(
threadsPerBlock
*
col_start
+
threadIdx
.
x
)
*
5
+
3
];
block_boxes
[
threadIdx
.
x
*
5
+
4
]
=
dev_boxes
[(
threadsPerBlock
*
col_start
+
threadIdx
.
x
)
*
5
+
4
];
}
__syncthreads
();
if
(
threadIdx
.
x
<
row_size
)
{
const
int
cur_box_idx
=
threadsPerBlock
*
row_start
+
threadIdx
.
x
;
const
T
*
cur_box
=
dev_boxes
+
cur_box_idx
*
5
;
int
i
=
0
;
unsigned
long
long
t
=
0
;
int
start
=
0
;
if
(
row_start
==
col_start
)
{
start
=
threadIdx
.
x
+
1
;
}
for
(
i
=
start
;
i
<
col_size
;
i
++
)
{
// Instead of devIoU used by original horizontal nms, here
// we use the single_box_iou_rotated function from box_iou_rotated_utils.h
if
(
single_box_iou_rotated
<
T
>
(
cur_box
,
block_boxes
+
i
*
5
)
>
iou_threshold
)
{
t
|=
1ULL
<<
i
;
}
}
const
int
col_blocks
=
at
::
cuda
::
ATenCeilDiv
(
n_boxes
,
threadsPerBlock
);
dev_mask
[
cur_box_idx
*
col_blocks
+
col_start
]
=
t
;
}
}
namespace
detectron2
{
at
::
Tensor
nms_rotated_cuda
(
// input must be contiguous
const
at
::
Tensor
&
dets
,
const
at
::
Tensor
&
scores
,
float
iou_threshold
)
{
// using scalar_t = float;
AT_ASSERTM
(
dets
.
is_cuda
(),
"dets must be a CUDA tensor"
);
AT_ASSERTM
(
scores
.
is_cuda
(),
"scores must be a CUDA tensor"
);
at
::
cuda
::
CUDAGuard
device_guard
(
dets
.
device
());
auto
order_t
=
std
::
get
<
1
>
(
scores
.
sort
(
0
,
/* descending=*/
true
));
auto
dets_sorted
=
dets
.
index_select
(
0
,
order_t
);
auto
dets_num
=
dets
.
size
(
0
);
const
int
col_blocks
=
at
::
cuda
::
ATenCeilDiv
(
static_cast
<
int
>
(
dets_num
),
threadsPerBlock
);
at
::
Tensor
mask
=
at
::
empty
({
dets_num
*
col_blocks
},
dets
.
options
().
dtype
(
at
::
kLong
));
dim3
blocks
(
col_blocks
,
col_blocks
);
dim3
threads
(
threadsPerBlock
);
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
AT_DISPATCH_FLOATING_TYPES
(
dets_sorted
.
scalar_type
(),
"nms_rotated_kernel_cuda"
,
[
&
]
{
nms_rotated_cuda_kernel
<
scalar_t
><<<
blocks
,
threads
,
0
,
stream
>>>
(
dets_num
,
iou_threshold
,
dets_sorted
.
data_ptr
<
scalar_t
>
(),
(
unsigned
long
long
*
)
mask
.
data_ptr
<
int64_t
>
());
});
at
::
Tensor
mask_cpu
=
mask
.
to
(
at
::
kCPU
);
unsigned
long
long
*
mask_host
=
(
unsigned
long
long
*
)
mask_cpu
.
data_ptr
<
int64_t
>
();
std
::
vector
<
unsigned
long
long
>
remv
(
col_blocks
);
memset
(
&
remv
[
0
],
0
,
sizeof
(
unsigned
long
long
)
*
col_blocks
);
at
::
Tensor
keep
=
at
::
empty
({
dets_num
},
dets
.
options
().
dtype
(
at
::
kLong
).
device
(
at
::
kCPU
));
int64_t
*
keep_out
=
keep
.
data_ptr
<
int64_t
>
();
int
num_to_keep
=
0
;
for
(
int
i
=
0
;
i
<
dets_num
;
i
++
)
{
int
nblock
=
i
/
threadsPerBlock
;
int
inblock
=
i
%
threadsPerBlock
;
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
{
keep_out
[
num_to_keep
++
]
=
i
;
unsigned
long
long
*
p
=
mask_host
+
i
*
col_blocks
;
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
{
remv
[
j
]
|=
p
[
j
];
}
}
}
AT_CUDA_CHECK
(
cudaGetLastError
());
return
order_t
.
index
(
{
keep
.
narrow
(
/*dim=*/
0
,
/*start=*/
0
,
/*length=*/
num_to_keep
)
.
to
(
order_t
.
device
(),
keep
.
scalar_type
())});
}
}
// namespace detectron2
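The CUDA path computes one 64-bit suppression word per (box, column-block) pair on the device and resolves the final keep list on the host. The host-side scan is sketched below in plain Python (illustrative, not part of the commit); `mask[i][b]` stands for the word the kernel wrote for box i against column block b, and the real code additionally maps the kept positions back through the score ordering (order_t).

def reduce_mask(mask, n_boxes, threads_per_block=64):
    # threads_per_block matches sizeof(unsigned long long) * 8 in the .cu file.
    col_blocks = (n_boxes + threads_per_block - 1) // threads_per_block
    remv = [0] * col_blocks
    keep = []
    for i in range(n_boxes):
        nblock, inblock = divmod(i, threads_per_block)
        if not (remv[nblock] >> inblock) & 1:
            keep.append(i)  # the i-th highest-scoring box survives
            for j in range(nblock, col_blocks):
                remv[j] |= mask[i][j]
    return keep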
detectron2/layers/csrc/vision.cpp
0 → 100644
View file @
c732df65
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <torch/extension.h>
#include "ROIAlign/ROIAlign.h"
#include "ROIAlignRotated/ROIAlignRotated.h"
#include "box_iou_rotated/box_iou_rotated.h"
#include "deformable/deform_conv.h"
#include "nms_rotated/nms_rotated.h"
namespace detectron2 {

#ifdef WITH_CUDA
extern int get_cudart_version();
#endif

std::string get_cuda_version() {
#ifdef WITH_CUDA
  std::ostringstream oss;

  // copied from
  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
  auto printCudaStyleVersion = [&](int v) {
    oss << (v / 1000) << "." << (v / 10 % 100);
    if (v % 10 != 0) {
      oss << "." << (v % 10);
    }
  };
  printCudaStyleVersion(get_cudart_version());
  return oss.str();
#else
  return std::string("not available");
#endif
}

// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
  std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
#if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
#error "GCC >= 4.9 is required!"
#endif
  { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif

#if defined(__clang_major__)
  { ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; }
#endif

#if defined(_MSC_VER)
  { ss << "MSVC " << _MSC_FULL_VER; }
#endif
  return ss.str();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
  m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");

  m.def("box_iou_rotated", &box_iou_rotated, "IoU for rotated boxes");

  m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
  m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input");
  m.def("deform_conv_backward_filter", &deform_conv_backward_filter, "deform_conv_backward_filter");
  m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward");
  m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward");

  m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes");

  m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
  m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");

  m.def("roi_align_rotated_forward", &ROIAlignRotated_forward, "Forward pass for Rotated ROI-Align Operator");
  m.def("roi_align_rotated_backward", &ROIAlignRotated_backward, "Backward pass for Rotated ROI-Align Operator");
}

} // namespace detectron2
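Once this extension is built, the bindings registered above are importable as detectron2._C. The two version helpers are convenient when debugging build or CUDA mismatches; an illustrative session (not part of the commit; the printed strings are examples only):

from detectron2 import _C

print(_C.get_compiler_version())  # e.g. "GCC 9.4"
print(_C.get_cuda_version())      # e.g. "11.3", or "not available" on CPU-only builds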
detectron2/layers/deform_conv.py
0 → 100644
View file @
c732df65
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import
math
from
functools
import
lru_cache
import
torch
from
torch
import
nn
from
torch.autograd
import
Function
from
torch.autograd.function
import
once_differentiable
from
torch.nn.modules.utils
import
_pair
from
detectron2
import
_C
from
.wrappers
import
_NewEmptyTensorOp
class
_DeformConv
(
Function
):
@
staticmethod
def
forward
(
ctx
,
input
,
offset
,
weight
,
stride
=
1
,
padding
=
0
,
dilation
=
1
,
groups
=
1
,
deformable_groups
=
1
,
im2col_step
=
64
,
):
if
input
is
not
None
and
input
.
dim
()
!=
4
:
raise
ValueError
(
"Expected 4D tensor as input, got {}D tensor instead."
.
format
(
input
.
dim
())
)
ctx
.
stride
=
_pair
(
stride
)
ctx
.
padding
=
_pair
(
padding
)
ctx
.
dilation
=
_pair
(
dilation
)
ctx
.
groups
=
groups
ctx
.
deformable_groups
=
deformable_groups
ctx
.
im2col_step
=
im2col_step
ctx
.
save_for_backward
(
input
,
offset
,
weight
)
output
=
input
.
new_empty
(
_DeformConv
.
_output_size
(
input
,
weight
,
ctx
.
padding
,
ctx
.
dilation
,
ctx
.
stride
)
)
ctx
.
bufs_
=
[
input
.
new_empty
(
0
),
input
.
new_empty
(
0
)]
# columns, ones
if
not
input
.
is_cuda
:
raise
NotImplementedError
else
:
cur_im2col_step
=
_DeformConv
.
_cal_im2col_step
(
input
.
shape
[
0
],
ctx
.
im2col_step
)
assert
(
input
.
shape
[
0
]
%
cur_im2col_step
)
==
0
,
"im2col step must divide batchsize"
_C
.
deform_conv_forward
(
input
,
weight
,
offset
,
output
,
ctx
.
bufs_
[
0
],
ctx
.
bufs_
[
1
],
weight
.
size
(
3
),
weight
.
size
(
2
),
ctx
.
stride
[
1
],
ctx
.
stride
[
0
],
ctx
.
padding
[
1
],
ctx
.
padding
[
0
],
ctx
.
dilation
[
1
],
ctx
.
dilation
[
0
],
ctx
.
groups
,
ctx
.
deformable_groups
,
cur_im2col_step
,
)
return
output
@
staticmethod
@
once_differentiable
def
backward
(
ctx
,
grad_output
):
input
,
offset
,
weight
=
ctx
.
saved_tensors
grad_input
=
grad_offset
=
grad_weight
=
None
if
not
grad_output
.
is_cuda
:
raise
NotImplementedError
else
:
cur_im2col_step
=
_DeformConv
.
_cal_im2col_step
(
input
.
shape
[
0
],
ctx
.
im2col_step
)
assert
(
input
.
shape
[
0
]
%
cur_im2col_step
)
==
0
,
"im2col step must divide batchsize"
if
ctx
.
needs_input_grad
[
0
]
or
ctx
.
needs_input_grad
[
1
]:
grad_input
=
torch
.
zeros_like
(
input
)
grad_offset
=
torch
.
zeros_like
(
offset
)
_C
.
deform_conv_backward_input
(
input
,
offset
,
grad_output
,
grad_input
,
grad_offset
,
weight
,
ctx
.
bufs_
[
0
],
weight
.
size
(
3
),
weight
.
size
(
2
),
ctx
.
stride
[
1
],
ctx
.
stride
[
0
],
ctx
.
padding
[
1
],
ctx
.
padding
[
0
],
ctx
.
dilation
[
1
],
ctx
.
dilation
[
0
],
ctx
.
groups
,
ctx
.
deformable_groups
,
cur_im2col_step
,
)
if
ctx
.
needs_input_grad
[
2
]:
grad_weight
=
torch
.
zeros_like
(
weight
)
_C
.
deform_conv_backward_filter
(
input
,
offset
,
grad_output
,
grad_weight
,
ctx
.
bufs_
[
0
],
ctx
.
bufs_
[
1
],
weight
.
size
(
3
),
weight
.
size
(
2
),
ctx
.
stride
[
1
],
ctx
.
stride
[
0
],
ctx
.
padding
[
1
],
ctx
.
padding
[
0
],
ctx
.
dilation
[
1
],
ctx
.
dilation
[
0
],
ctx
.
groups
,
ctx
.
deformable_groups
,
1
,
cur_im2col_step
,
)
return
grad_input
,
grad_offset
,
grad_weight
,
None
,
None
,
None
,
None
,
None
,
None
@
staticmethod
def
_output_size
(
input
,
weight
,
padding
,
dilation
,
stride
):
channels
=
weight
.
size
(
0
)
output_size
=
(
input
.
size
(
0
),
channels
)
for
d
in
range
(
input
.
dim
()
-
2
):
in_size
=
input
.
size
(
d
+
2
)
pad
=
padding
[
d
]
kernel
=
dilation
[
d
]
*
(
weight
.
size
(
d
+
2
)
-
1
)
+
1
stride_
=
stride
[
d
]
output_size
+=
((
in_size
+
(
2
*
pad
)
-
kernel
)
//
stride_
+
1
,)
if
not
all
(
map
(
lambda
s
:
s
>
0
,
output_size
)):
raise
ValueError
(
"convolution input is too small (output would be {})"
.
format
(
"x"
.
join
(
map
(
str
,
output_size
))
)
)
return
output_size
    @staticmethod
    @lru_cache(maxsize=128)
    def _cal_im2col_step(input_size, default_size):
        """
        Calculate a proper im2col step size, which must divide input_size and must not be
        larger than default_size. The step should also be as large as possible for efficiency,
        so we choose the largest divisor of input_size that does not exceed default_size.
        :param input_size: input batch size.
        :param default_size: default preferred im2col step size.
        :return: the largest proper step size.
        """
        if input_size <= default_size:
            return input_size
        best_step = 1
        for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)):
            if input_size % step == 0:
                if input_size // step <= default_size:
                    return input_size // step
                best_step = step
        return best_step
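    # Illustrative check of the step selection above (not part of the commit):
    # a batch of 20 fits under the preferred step of 64, so it is used whole;
    # a batch of 192 yields its largest divisor not exceeding 64; a batch of
    # 100 falls back to 100 // 2. For example:
    #
    #   assert _DeformConv._cal_im2col_step(20, 64) == 20
    #   assert _DeformConv._cal_im2col_step(192, 64) == 64
    #   assert _DeformConv._cal_im2col_step(100, 64) == 50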
class _ModulatedDeformConv(Function):
    @staticmethod
    def forward(
        ctx,
        input,
        offset,
        mask,
        weight,
        bias=None,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        deformable_groups=1,
    ):
        ctx.stride = stride
        ctx.padding = padding
        ctx.dilation = dilation
        ctx.groups = groups
        ctx.deformable_groups = deformable_groups
        ctx.with_bias = bias is not None
        if not ctx.with_bias:
            bias = input.new_empty(1)  # fake tensor
        if not input.is_cuda:
            raise NotImplementedError
        if (
            weight.requires_grad
            or mask.requires_grad
            or offset.requires_grad
            or input.requires_grad
        ):
            ctx.save_for_backward(input, offset, mask, weight, bias)
        output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
        _C.modulated_deform_conv_forward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            output,
            ctx._bufs[1],
            weight.shape[2],
            weight.shape[3],
            ctx.stride,
            ctx.stride,
            ctx.padding,
            ctx.padding,
            ctx.dilation,
            ctx.dilation,
            ctx.groups,
            ctx.deformable_groups,
            ctx.with_bias,
        )
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        if not grad_output.is_cuda:
            raise NotImplementedError
        input, offset, mask, weight, bias = ctx.saved_tensors
        grad_input = torch.zeros_like(input)
        grad_offset = torch.zeros_like(offset)
        grad_mask = torch.zeros_like(mask)
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros_like(bias)
        _C.modulated_deform_conv_backward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            ctx._bufs[1],
            grad_input,
            grad_weight,
            grad_bias,
            grad_offset,
            grad_mask,
            grad_output,
            weight.shape[2],
            weight.shape[3],
            ctx.stride,
            ctx.stride,
            ctx.padding,
            ctx.padding,
            ctx.dilation,
            ctx.dilation,
            ctx.groups,
            ctx.deformable_groups,
            ctx.with_bias,
        )
        if not ctx.with_bias:
            grad_bias = None

        return (
            grad_input,
            grad_offset,
            grad_mask,
            grad_weight,
            grad_bias,
            None,
            None,
            None,
            None,
            None,
        )

    @staticmethod
    def _infer_shape(ctx, input, weight):
        n = input.size(0)
        channels_out = weight.size(0)
        height, width = input.shape[2:4]
        kernel_h, kernel_w = weight.shape[2:4]
        height_out = (
            height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
        ) // ctx.stride + 1
        width_out = (
            width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
        ) // ctx.stride + 1
        return n, channels_out, height_out, width_out

deform_conv = _DeformConv.apply
modulated_deform_conv = _ModulatedDeformConv.apply

class DeformConv(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        deformable_groups=1,
        bias=False,
        norm=None,
        activation=None,
    ):
        """
        Deformable convolution from :paper:`deformconv`.

        Arguments are similar to :class:`Conv2D`. Extra arguments:

        Args:
            deformable_groups (int): number of groups used in deformable convolution.
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function
        """
        super(DeformConv, self).__init__()

        assert not bias
        assert in_channels % groups == 0, "in_channels {} is not divisible by groups {}".format(
            in_channels, groups
        )
        assert (
            out_channels % groups == 0
        ), "out_channels {} is not divisible by groups {}".format(out_channels, groups)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.deformable_groups = deformable_groups
        self.norm = norm
        self.activation = activation

        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)
        )
        self.bias = None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")

    def forward(self, x, offset):
        if x.numel() == 0:
            # When the input is empty, we want to return an empty tensor with the "correct"
            # shape, so that the following operations will not panic
            # if they check for the shape of the tensor.
            # This computes the height and width of the output tensor.
            output_shape = [
                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
                for i, p, di, k, s in zip(
                    x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
                )
            ]
            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
            return _NewEmptyTensorOp.apply(x, output_shape)

        x = deform_conv(
            x,
            offset,
            self.weight,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
            self.deformable_groups,
        )
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x

    def extra_repr(self):
        tmpstr = "in_channels=" + str(self.in_channels)
        tmpstr += ", out_channels=" + str(self.out_channels)
        tmpstr += ", kernel_size=" + str(self.kernel_size)
        tmpstr += ", stride=" + str(self.stride)
        tmpstr += ", padding=" + str(self.padding)
        tmpstr += ", dilation=" + str(self.dilation)
        tmpstr += ", groups=" + str(self.groups)
        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
        tmpstr += ", bias=False"
        return tmpstr

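# Editor's usage sketch (not part of the original detectron2 source). In
# practice the offsets are predicted by an ordinary convolution (the hypothetical
# `offset_pred` below); the offset tensor needs 2 * deformable_groups * kH * kW
# channels and the same spatial size as the deformable convolution's output.
# Assuming a CUDA build of the _C extension is available:
#
#   conv = DeformConv(16, 32, kernel_size=3, padding=1).cuda()
#   offset_pred = nn.Conv2d(16, 2 * 1 * 3 * 3, kernel_size=3, padding=1).cuda()
#   x = torch.randn(2, 16, 64, 64, device="cuda")
#   out = conv(x, offset_pred(x))   # -> shape (2, 32, 64, 64)
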
class ModulatedDeformConv(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        deformable_groups=1,
        bias=True,
        norm=None,
        activation=None,
    ):
        """
        Modulated deformable convolution from :paper:`deformconv2`.

        Arguments are similar to :class:`Conv2D`. Extra arguments:

        Args:
            deformable_groups (int): number of groups used in deformable convolution.
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function
        """
        super(ModulatedDeformConv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.deformable_groups = deformable_groups
        self.with_bias = bias
        self.norm = norm
        self.activation = activation

        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
        )
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        else:
            self.bias = None

        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, x, offset, mask):
        if x.numel() == 0:
            output_shape = [
                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
                for i, p, di, k, s in zip(
                    x.shape[-2:],
                    _pair(self.padding),
                    _pair(self.dilation),
                    self.kernel_size,
                    _pair(self.stride),
                )
            ]
            output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
            return _NewEmptyTensorOp.apply(x, output_shape)

        x = modulated_deform_conv(
            x,
            offset,
            mask,
            self.weight,
            self.bias,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
            self.deformable_groups,
        )
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x

    def extra_repr(self):
        tmpstr = "in_channels=" + str(self.in_channels)
        tmpstr += ", out_channels=" + str(self.out_channels)
        tmpstr += ", kernel_size=" + str(self.kernel_size)
        tmpstr += ", stride=" + str(self.stride)
        tmpstr += ", padding=" + str(self.padding)
        tmpstr += ", dilation=" + str(self.dilation)
        tmpstr += ", groups=" + str(self.groups)
        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
        tmpstr += ", bias=" + str(self.with_bias)
        return tmpstr

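# Editor's usage sketch (not part of the original detectron2 source).
# Modulated deformable convolution additionally takes a per-location mask in
# [0, 1]; offsets need 2 * deformable_groups * kH * kW channels and masks need
# deformable_groups * kH * kW channels, both at the output resolution.
# Assuming a CUDA build of the _C extension is available:
#
#   conv = ModulatedDeformConv(16, 32, kernel_size=3, padding=1).cuda()
#   x = torch.randn(2, 16, 64, 64, device="cuda")
#   offset = torch.randn(2, 18, 64, 64, device="cuda")
#   mask = torch.rand(2, 9, 64, 64, device="cuda")
#   out = conv(x, offset, mask)     # -> shape (2, 32, 64, 64)
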
detectron2/layers/mask_ops.py
0 → 100644
View file @
c732df65
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import torch
from PIL import Image
from torch.nn import functional as F

__all__ = ["paste_masks_in_image"]


BYTES_PER_FLOAT = 4
# TODO: This memory limit may be too much or too little. It would be better to
# determine it based on available resources.
GPU_MEM_LIMIT = 1024 ** 3  # 1 GB memory limit


def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
    """
    Args:
        masks: N, 1, H, W
        boxes: N, 4
        img_h, img_w (int):
        skip_empty (bool): only paste masks within the region that
            tightly bounds all boxes, and return the results for this region only.
            An important optimization for CPU.

    Returns:
        if skip_empty == False, a mask of shape (N, img_h, img_w)
        if skip_empty == True, a mask of shape (N, h', w'), and the slice
            object for the corresponding region.
    """
    # On GPU, paste all masks together (up to chunk size)
    # by using the entire image to sample the masks.
    # Compared to pasting them one by one,
    # this has more operations but is faster on COCO-scale datasets.
    device = masks.device
    if skip_empty:
        x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(
            dtype=torch.int32
        )
        x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
        y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
    else:
        x0_int, y0_int = 0, 0
        x1_int, y1_int = img_w, img_h
    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1

    N = masks.shape[0]

    img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
    img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
    img_y = (img_y - y0) / (y1 - y0) * 2 - 1
    img_x = (img_x - x0) / (x1 - x0) * 2 - 1
    # img_x, img_y have shapes (N, w), (N, h)

    gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
    gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
    grid = torch.stack([gx, gy], dim=3)

    img_masks = F.grid_sample(masks.to(dtype=torch.float32), grid, align_corners=False)

    if skip_empty:
        return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
    else:
        return img_masks[:, 0], ()

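# Editor's note, illustrating the coordinate normalization above (not part of
# the original file): for a single box spanning x0=10, x1=20 on a 30-pixel-wide
# image, the pixel centers inside the box map into grid_sample's [-1, 1] range:
#
#   x0, x1, img_w = 10.0, 20.0, 30
#   img_x = torch.arange(0, img_w, dtype=torch.float32) + 0.5
#   img_x = (img_x - x0) / (x1 - x0) * 2 - 1
#   # img_x[10] == -0.9 and img_x[19] == 0.9, i.e. the box interior covers
#   # roughly [-1, 1]; pixels outside the box fall outside [-1, 1] and are
#   # sampled as zeros by grid_sample.
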
def paste_masks_in_image(masks, boxes, image_shape, threshold=0.5):
    """
    Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
    The location, height, and width for pasting each mask are determined by the
    corresponding bounding boxes in boxes.

    Note:
        This is a complicated but more accurate implementation. In actual deployment, it is
        often enough to use a faster but less accurate implementation.
        See :func:`paste_mask_in_image_old` in this file for an alternative implementation.

    Args:
        masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
            detected object instances in the image and Hmask, Wmask are the mask height and
            mask width of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
        boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4).
            boxes[i] and masks[i] correspond to the same object instance.
        image_shape (tuple): height, width
        threshold (float): A threshold in [0, 1] for converting the (soft) masks to
            binary masks.

    Returns:
        img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
            number of detected object instances and Himage, Wimage are the image height
            and width. img_masks[i] is a binary mask for object instance i.
    """

    assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
    N = len(masks)
    if N == 0:
        return masks.new_empty((0,) + image_shape, dtype=torch.uint8)
    if not isinstance(boxes, torch.Tensor):
        boxes = boxes.tensor
    device = boxes.device
    assert len(boxes) == N, boxes.shape

    img_h, img_w = image_shape

    # The actual implementation splits the input into chunks,
    # and pastes them chunk by chunk.
    if device.type == "cpu":
        # CPU is most efficient when the masks are pasted one by one with skip_empty=True,
        # so that it performs a minimal number of operations.
        num_chunks = N
    else:
        # GPU benefits from parallelism for larger chunks, but may have memory issues;
        # int(img_h) because the shape may be tensors in tracing
        num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
        assert (
            num_chunks <= N
        ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
    chunks = torch.chunk(torch.arange(N, device=device), num_chunks)

    img_masks = torch.zeros(
        N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
    )
    for inds in chunks:
        masks_chunk, spatial_inds = _do_paste_mask(
            masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
        )

        if threshold >= 0:
            masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
        else:
            # for visualization and debugging
            masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)

        img_masks[(inds,) + spatial_inds] = masks_chunk
    return img_masks

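# Editor's usage sketch (not part of the original detectron2 source): pasting
# two 28x28 soft masks into a 480x640 image given their boxes.
#
#   masks = torch.rand(2, 28, 28)                        # per-instance soft masks
#   boxes = torch.tensor([[10.0, 20.0, 110.0, 220.0],
#                         [300.0, 40.0, 420.0, 160.0]])  # x0, y0, x1, y1
#   out = paste_masks_in_image(masks, boxes, (480, 640), threshold=0.5)
#   # out: torch.bool tensor of shape (2, 480, 640)
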
# Below is the original paste function (from Detectron1), which has
# a larger quantization error.
# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.


def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
    """
    Paste a single mask in an image.
    This is a per-box implementation of :func:`paste_masks_in_image`.
    This function has a larger quantization error due to incorrect pixel
    modeling and is not used any more.

    Args:
        mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
            object instance. Values are in [0, 1].
        box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
            of the object instance.
        img_h, img_w (int): Image height and width.
        threshold (float): Mask binarization threshold in [0, 1].

    Returns:
        im_mask (Tensor):
            The resized and binarized object mask pasted into the original
            image plane (a tensor of shape (img_h, img_w)).
    """
    # Conversion from continuous box coordinates to discrete pixel coordinates
    # via truncation (cast to int32). This determines which pixels to paste the
    # mask onto.
    box = box.to(dtype=torch.int32)  # Continuous to discrete coordinate conversion
    # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
    # discrete coordinates (x0=0, x1=4). Note that the box is mapped to 5 = x1 - x0 + 1
    # pixels (not x1 - x0 pixels).
    samples_w = box[2] - box[0] + 1  # Number of pixel samples, *not* geometric width
    samples_h = box[3] - box[1] + 1  # Number of pixel samples, *not* geometric height

    # Resample the mask from its original grid to the new samples_w x samples_h grid
    mask = Image.fromarray(mask.cpu().numpy())
    mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
    mask = np.array(mask, copy=False)

    if threshold >= 0:
        mask = np.array(mask > threshold, dtype=np.uint8)
        mask = torch.from_numpy(mask)
    else:
        # for visualization and debugging, we also
        # allow it to return an unmodified mask
        mask = torch.from_numpy(mask * 255).to(torch.uint8)

    im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, img_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, img_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[
        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
    ]
    return im_mask

# Our pixel modeling requires extrapolation for any continuous
# coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
# we would like this extrapolation to be an interpolation between boundary values and zero,
# instead of using absolute zero or boundary values.
# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
# masks, scale = pad_masks(masks[:, 0, :, :], 1)
# boxes = scale_boxes(boxes.tensor, scale)
def pad_masks(masks, padding):
    """
    Args:
        masks (tensor): A tensor of shape (B, M, M) representing B masks.
        padding (int): Number of cells to pad on all sides.

    Returns:
        The padded masks and the scale factor of the padding size / original size.
    """
    B = masks.shape[0]
    M = masks.shape[-1]
    pad2 = 2 * padding
    scale = float(M + pad2) / M
    padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
    padded_masks[:, padding:-padding, padding:-padding] = masks
    return padded_masks, scale

def scale_boxes(boxes, scale):
    """
    Args:
        boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
            coords representing the corners x0, y0, x1, y1.
        scale (float): The box scaling factor.

    Returns:
        Scaled boxes.
    """
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half *= scale
    h_half *= scale

    scaled_boxes = torch.zeros_like(boxes)
    scaled_boxes[:, 0] = x_c - w_half
    scaled_boxes[:, 2] = x_c + w_half
    scaled_boxes[:, 1] = y_c - h_half
    scaled_boxes[:, 3] = y_c + h_half
    return scaled_boxes
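
# Editor's usage sketch (not part of the original detectron2 source): padding a
# batch of masks by one cell and enlarging the boxes by the same relative
# amount, as described in the comment above pad_masks.
#
#   masks = torch.rand(3, 1, 28, 28)
#   boxes = torch.tensor([[5.0, 5.0, 33.0, 33.0]] * 3)
#   padded, scale = pad_masks(masks[:, 0, :, :], 1)   # padded: (3, 30, 30), scale = 30 / 28
#   boxes = scale_boxes(boxes, scale)                 # each box grows around its center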