ModelZoo / CatVTON_pytorch · Commits · 3144257c

Commit 3144257c, authored Oct 11, 2024 by mashun1
Commit message: catvton
Changes: 471
Showing 11 changed files with 2748 additions and 0 deletions (+2748, -0)
detectron2/modeling/box_regression.py (+369, -0)
detectron2/modeling/matcher.py (+127, -0)
detectron2/modeling/meta_arch/__init__.py (+16, -0)
detectron2/modeling/meta_arch/build.py (+25, -0)
detectron2/modeling/meta_arch/dense_detector.py (+294, -0)
detectron2/modeling/meta_arch/fcos.py (+328, -0)
detectron2/modeling/meta_arch/panoptic_fpn.py (+269, -0)
detectron2/modeling/meta_arch/rcnn.py (+341, -0)
detectron2/modeling/meta_arch/retinanet.py (+439, -0)
detectron2/modeling/meta_arch/semantic_seg.py (+267, -0)
detectron2/modeling/mmdet_wrapper.py (+273, -0)

detectron2/modeling/box_regression.py (new file, 0 → 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import math
from typing import List, Tuple, Union

import torch
from fvcore.nn import giou_loss, smooth_l1_loss
from torch.nn import functional as F

from detectron2.layers import cat, ciou_loss, diou_loss
from detectron2.structures import Boxes

# Value for clamping large dw and dh predictions. The heuristic is that we clamp
# such that dw and dh are no larger than what would transform a 16px box into a
# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)


__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated", "Box2BoxTransformLinear"]


@torch.jit.script
class Box2BoxTransform:
    """
    The box-to-box transform defined in R-CNN. The transformation is parameterized
    by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
    by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
    """

    def __init__(
        self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
    ):
        """
        Args:
            weights (4-element tuple): Scaling factors that are applied to the
                (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
                such that the deltas have unit variance; now they are treated as
                hyperparameters of the system.
            scale_clamp (float): When predicting deltas, the predicted box scaling
                factors (dw and dh) are clamped such that they are <= scale_clamp.
        """
        self.weights = weights
        self.scale_clamp = scale_clamp

    def get_deltas(self, src_boxes, target_boxes):
        """
        Get box regression transformation deltas (dx, dy, dw, dh) that can be used
        to transform the `src_boxes` into the `target_boxes`. That is, the relation
        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
        any delta is too large and is clamped).

        Args:
            src_boxes (Tensor): source boxes, e.g., object proposals
            target_boxes (Tensor): target of the transformation, e.g., ground-truth
                boxes.
        """
        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)

        src_widths = src_boxes[:, 2] - src_boxes[:, 0]
        src_heights = src_boxes[:, 3] - src_boxes[:, 1]
        src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
        src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights

        target_widths = target_boxes[:, 2] - target_boxes[:, 0]
        target_heights = target_boxes[:, 3] - target_boxes[:, 1]
        target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
        target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights

        wx, wy, ww, wh = self.weights
        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
        dw = ww * torch.log(target_widths / src_widths)
        dh = wh * torch.log(target_heights / src_heights)

        deltas = torch.stack((dx, dy, dw, dh), dim=1)
        assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
        return deltas

    def apply_deltas(self, deltas, boxes):
        """
        Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.

        Args:
            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
                deltas[i] represents k potentially different class-specific
                box transformations for the single box boxes[i].
            boxes (Tensor): boxes to transform, of shape (N, 4)
        """
        deltas = deltas.float()  # ensure fp32 for decoding precision
        boxes = boxes.to(deltas.dtype)

        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        ctr_x = boxes[:, 0] + 0.5 * widths
        ctr_y = boxes[:, 1] + 0.5 * heights

        wx, wy, ww, wh = self.weights
        dx = deltas[:, 0::4] / wx
        dy = deltas[:, 1::4] / wy
        dw = deltas[:, 2::4] / ww
        dh = deltas[:, 3::4] / wh

        # Prevent sending too large values into torch.exp()
        dw = torch.clamp(dw, max=self.scale_clamp)
        dh = torch.clamp(dh, max=self.scale_clamp)

        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
        pred_w = torch.exp(dw) * widths[:, None]
        pred_h = torch.exp(dh) * heights[:, None]

        x1 = pred_ctr_x - 0.5 * pred_w
        y1 = pred_ctr_y - 0.5 * pred_h
        x2 = pred_ctr_x + 0.5 * pred_w
        y2 = pred_ctr_y + 0.5 * pred_h
        pred_boxes = torch.stack((x1, y1, x2, y2), dim=-1)
        return pred_boxes.reshape(deltas.shape)


@torch.jit.script
class Box2BoxTransformRotated:
    """
    The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
    by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
    by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
    and rotates a box's angle by da (radians).
    Note: angles of deltas are in radians while angles of boxes are in degrees.
    """

    def __init__(
        self,
        weights: Tuple[float, float, float, float, float],
        scale_clamp: float = _DEFAULT_SCALE_CLAMP,
    ):
        """
        Args:
            weights (5-element tuple): Scaling factors that are applied to the
                (dx, dy, dw, dh, da) deltas. These are treated as
                hyperparameters of the system.
            scale_clamp (float): When predicting deltas, the predicted box scaling
                factors (dw and dh) are clamped such that they are <= scale_clamp.
        """
        self.weights = weights
        self.scale_clamp = scale_clamp

    def get_deltas(self, src_boxes, target_boxes):
        """
        Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
        to transform the `src_boxes` into the `target_boxes`. That is, the relation
        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
        any delta is too large and is clamped).

        Args:
            src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
            target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
                boxes.
        """
        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)

        src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)

        target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
            target_boxes, dim=1
        )

        wx, wy, ww, wh, wa = self.weights
        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
        dw = ww * torch.log(target_widths / src_widths)
        dh = wh * torch.log(target_heights / src_heights)
        # Angles of deltas are in radians while angles of boxes are in degrees.
        # The conversion to radians serves as a way to normalize the values.
        da = target_angles - src_angles
        da = (da + 180.0) % 360.0 - 180.0  # make it in [-180, 180)
        da *= wa * math.pi / 180.0

        deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
        assert (
            (src_widths > 0).all().item()
        ), "Input boxes to Box2BoxTransformRotated are not valid!"
        return deltas

    def apply_deltas(self, deltas, boxes):
        """
        Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`.

        Args:
            deltas (Tensor): transformation deltas of shape (N, k*5).
                deltas[i] represents box transformation for the single box boxes[i].
            boxes (Tensor): boxes to transform, of shape (N, 5)
        """
        assert deltas.shape[1] % 5 == 0 and boxes.shape[1] == 5

        boxes = boxes.to(deltas.dtype).unsqueeze(2)

        ctr_x = boxes[:, 0]
        ctr_y = boxes[:, 1]
        widths = boxes[:, 2]
        heights = boxes[:, 3]
        angles = boxes[:, 4]

        wx, wy, ww, wh, wa = self.weights

        dx = deltas[:, 0::5] / wx
        dy = deltas[:, 1::5] / wy
        dw = deltas[:, 2::5] / ww
        dh = deltas[:, 3::5] / wh
        da = deltas[:, 4::5] / wa

        # Prevent sending too large values into torch.exp()
        dw = torch.clamp(dw, max=self.scale_clamp)
        dh = torch.clamp(dh, max=self.scale_clamp)

        pred_boxes = torch.zeros_like(deltas)
        pred_boxes[:, 0::5] = dx * widths + ctr_x  # x_ctr
        pred_boxes[:, 1::5] = dy * heights + ctr_y  # y_ctr
        pred_boxes[:, 2::5] = torch.exp(dw) * widths  # width
        pred_boxes[:, 3::5] = torch.exp(dh) * heights  # height

        # Following original RRPN implementation,
        # angles of deltas are in radians while angles of boxes are in degrees.
        pred_angle = da * 180.0 / math.pi + angles
        pred_angle = (pred_angle + 180.0) % 360.0 - 180.0  # make it in [-180, 180)

        pred_boxes[:, 4::5] = pred_angle

        return pred_boxes


class Box2BoxTransformLinear:
    """
    The linear box-to-box transform defined in FCOS. The transformation is parameterized
    by the distance from the center of (square) src box to 4 edges of the target box.
    """

    def __init__(self, normalize_by_size=True):
        """
        Args:
            normalize_by_size: normalize deltas by the size of src (anchor) boxes.
        """
        self.normalize_by_size = normalize_by_size

    def get_deltas(self, src_boxes, target_boxes):
        """
        Get box regression transformation deltas (dx1, dy1, dx2, dy2) that can be used
        to transform the `src_boxes` into the `target_boxes`. That is, the relation
        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true.
        The center of src must be inside target boxes.

        Args:
            src_boxes (Tensor): square source boxes, e.g., anchors
            target_boxes (Tensor): target of the transformation, e.g., ground-truth
                boxes.
        """
        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)

        src_ctr_x = 0.5 * (src_boxes[:, 0] + src_boxes[:, 2])
        src_ctr_y = 0.5 * (src_boxes[:, 1] + src_boxes[:, 3])

        target_l = src_ctr_x - target_boxes[:, 0]
        target_t = src_ctr_y - target_boxes[:, 1]
        target_r = target_boxes[:, 2] - src_ctr_x
        target_b = target_boxes[:, 3] - src_ctr_y

        deltas = torch.stack((target_l, target_t, target_r, target_b), dim=1)
        if self.normalize_by_size:
            stride_w = src_boxes[:, 2] - src_boxes[:, 0]
            stride_h = src_boxes[:, 3] - src_boxes[:, 1]
            strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
            deltas = deltas / strides

        return deltas

    def apply_deltas(self, deltas, boxes):
        """
        Apply transformation `deltas` (dx1, dy1, dx2, dy2) to `boxes`.

        Args:
            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
                deltas[i] represents k potentially different class-specific
                box transformations for the single box boxes[i].
            boxes (Tensor): boxes to transform, of shape (N, 4)
        """
        # Ensure the output is a valid box. See Sec 2.1 of https://arxiv.org/abs/2006.09214
        deltas = F.relu(deltas)
        boxes = boxes.to(deltas.dtype)

        ctr_x = 0.5 * (boxes[:, 0] + boxes[:, 2])
        ctr_y = 0.5 * (boxes[:, 1] + boxes[:, 3])
        if self.normalize_by_size:
            stride_w = boxes[:, 2] - boxes[:, 0]
            stride_h = boxes[:, 3] - boxes[:, 1]
            strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
            deltas = deltas * strides

        l = deltas[:, 0::4]
        t = deltas[:, 1::4]
        r = deltas[:, 2::4]
        b = deltas[:, 3::4]

        pred_boxes = torch.zeros_like(deltas)
        pred_boxes[:, 0::4] = ctr_x[:, None] - l  # x1
        pred_boxes[:, 1::4] = ctr_y[:, None] - t  # y1
        pred_boxes[:, 2::4] = ctr_x[:, None] + r  # x2
        pred_boxes[:, 3::4] = ctr_y[:, None] + b  # y2
        return pred_boxes


def _dense_box_regression_loss(
    anchors: List[Union[Boxes, torch.Tensor]],
    box2box_transform: Box2BoxTransform,
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
    fg_mask: torch.Tensor,
    box_reg_loss_type="smooth_l1",
    smooth_l1_beta=0.0,
):
    """
    Compute loss for dense multi-level box regression.
    Loss is accumulated over ``fg_mask``.

    Args:
        anchors: #lvl anchor boxes, each is (HixWixA, 4)
        pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
        gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
        fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
        box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou",
            "diou", "ciou".
        smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
            use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
    """
    if isinstance(anchors[0], Boxes):
        anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    else:
        anchors = cat(anchors)
    if box_reg_loss_type == "smooth_l1":
        gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[fg_mask],
            gt_anchor_deltas[fg_mask],
            beta=smooth_l1_beta,
            reduction="sum",
        )
    elif box_reg_loss_type == "giou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = giou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "diou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = diou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    elif box_reg_loss_type == "ciou":
        pred_boxes = [
            box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
        ]
        loss_box_reg = ciou_loss(
            torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
        )
    else:
        raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
    return loss_box_reg
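A minimal usage sketch (not part of the committed file), assuming the detectron2 package in this repository is importable: it round-trips a box through get_deltas/apply_deltas, which is the invariant documented in the docstrings above. The weights (10, 10, 5, 5) are only an illustrative choice.

    import torch
    from detectron2.modeling.box_regression import Box2BoxTransform

    transform = Box2BoxTransform(weights=(10.0, 10.0, 5.0, 5.0))
    src = torch.tensor([[10.0, 10.0, 50.0, 60.0]])    # e.g. a proposal (x1, y1, x2, y2)
    target = torch.tensor([[12.0, 8.0, 55.0, 64.0]])  # e.g. a ground-truth box

    deltas = transform.get_deltas(src, target)        # (1, 4) regression targets
    decoded = transform.apply_deltas(deltas, src)     # recovers `target` up to float error
    print(torch.allclose(decoded, target, atol=1e-4))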

detectron2/modeling/matcher.py (new file, 0 → 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
from typing import List
import torch

from detectron2.layers import nonzero_tuple


# TODO: the name is too general
class Matcher:
    """
    This class assigns to each predicted "element" (e.g., a box) a ground-truth
    element. Each predicted element will have exactly zero or one matches; each
    ground-truth element may be matched to zero or more predicted elements.

    The matching is determined by the MxN match_quality_matrix, that characterizes
    how well each (ground-truth, prediction)-pair match each other. For example,
    if the elements are boxes, this matrix may contain box intersection-over-union
    overlap values.

    The matcher returns (a) a vector of length N containing the index of the
    ground-truth element m in [0, M) that matches to prediction n in [0, N).
    (b) a vector of length N containing the labels for each prediction.
    """

    def __init__(
        self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
    ):
        """
        Args:
            thresholds (list): a list of thresholds used to stratify predictions
                into levels.
            labels (list): a list of values to label predictions belonging at
                each level. A label can be one of {-1, 0, 1} signifying
                {ignore, negative class, positive class}, respectively.
            allow_low_quality_matches (bool): if True, produce additional matches
                for predictions with maximum match quality lower than high_threshold.
                See set_low_quality_matches_ for more details.

            For example,
                thresholds = [0.3, 0.5]
                labels = [0, -1, 1]
                All predictions with iou < 0.3 will be marked with 0 and
                thus will be considered as false positives while training.
                All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
                thus will be ignored.
                All predictions with 0.5 <= iou will be marked with 1 and
                thus will be considered as true positives.
        """
        # Add -inf and +inf to first and last position in thresholds
        thresholds = thresholds[:]
        assert thresholds[0] > 0
        thresholds.insert(0, -float("inf"))
        thresholds.append(float("inf"))
        # Currently torchscript does not support all + generator
        assert all([low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])])
        assert all([l in [-1, 0, 1] for l in labels])
        assert len(labels) == len(thresholds) - 1
        self.thresholds = thresholds
        self.labels = labels
        self.allow_low_quality_matches = allow_low_quality_matches

    def __call__(self, match_quality_matrix):
        """
        Args:
            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
                pairwise quality between M ground-truth elements and N predicted
                elements. All elements must be >= 0 (due to the use of `torch.nonzero`
                for selecting indices in :meth:`set_low_quality_matches_`).

        Returns:
            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
                ground-truth index in [0, M)
            match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
                whether a prediction is a true or false positive or ignored
        """
        assert match_quality_matrix.dim() == 2
        if match_quality_matrix.numel() == 0:
            default_matches = match_quality_matrix.new_full(
                (match_quality_matrix.size(1),), 0, dtype=torch.int64
            )
            # When no gt boxes exist, we define IOU = 0 and therefore set labels
            # to `self.labels[0]`, which usually defaults to background class 0
            # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
            default_match_labels = match_quality_matrix.new_full(
                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
            )
            return default_matches, default_match_labels

        assert torch.all(match_quality_matrix >= 0)

        # match_quality_matrix is M (gt) x N (predicted)
        # Max over gt elements (dim 0) to find best gt candidate for each prediction
        matched_vals, matches = match_quality_matrix.max(dim=0)

        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)

        for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
            low_high = (matched_vals >= low) & (matched_vals < high)
            match_labels[low_high] = l

        if self.allow_low_quality_matches:
            self.set_low_quality_matches_(match_labels, match_quality_matrix)

        return matches, match_labels

    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
        """
        Produce additional matches for predictions that have only low-quality matches.
        Specifically, for each ground-truth G find the set of predictions that have
        maximum overlap with it (including ties); for each prediction in that set, if
        it is unmatched, then match it to the ground-truth G.

        This function implements the RPN assignment case (i) in Sec. 3.1.2 of
        :paper:`Faster R-CNN`.
        """
        # For each gt, find the prediction with which it has highest quality
        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
        # Find the highest quality match available, even if it is low, including ties.
        # Note that the matches qualities must be positive due to the use of
        # `torch.nonzero`.
        _, pred_inds_with_highest_quality = nonzero_tuple(
            match_quality_matrix == highest_quality_foreach_gt[:, None]
        )
        # If an anchor was labeled positive only due to a low-quality match
        # with gt_A, but it has larger overlap with gt_B, its matched index will still be gt_B.
        # This follows the implementation in Detectron, and is found to have no significant impact.
        match_labels[pred_inds_with_highest_quality] = 1
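A minimal usage sketch (not part of the committed file), assuming detectron2 is importable: it matches four predictions to two ground-truth boxes by IoU, using RPN-style thresholds in the spirit of the docstring above. The IoU values are made up for illustration.

    import torch
    from detectron2.modeling.matcher import Matcher

    matcher = Matcher(thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True)
    # 2 ground-truth rows x 4 prediction columns of pairwise IoU.
    iou = torch.tensor([
        [0.75, 0.20, 0.55, 0.05],
        [0.10, 0.80, 0.45, 0.25],
    ])
    matches, match_labels = matcher(iou)
    # matches[i]: index of the best ground-truth for prediction i.
    # match_labels[i]: 1 = positive, 0 = negative, -1 = ignored.
    print(matches.tolist(), match_labels.tolist())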

detectron2/modeling/meta_arch/__init__.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

from .build import META_ARCH_REGISTRY, build_model  # isort:skip

from .panoptic_fpn import PanopticFPN

# import all the meta_arch, so they will be registered
from .rcnn import GeneralizedRCNN, ProposalNetwork
from .dense_detector import DenseDetector
from .retinanet import RetinaNet
from .fcos import FCOS
from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head


__all__ = list(globals().keys())

detectron2/modeling/meta_arch/build.py (new file, 0 → 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import torch

from detectron2.utils.logger import _log_api_usage
from detectron2.utils.registry import Registry

META_ARCH_REGISTRY = Registry("META_ARCH")  # noqa F401 isort:skip
META_ARCH_REGISTRY.__doc__ = """
Registry for meta-architectures, i.e. the whole model.

The registered object will be called with `obj(cfg)`
and expected to return a `nn.Module` object.
"""


def build_model(cfg):
    """
    Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
    Note that it does not load any weights from ``cfg``.
    """
    meta_arch = cfg.MODEL.META_ARCHITECTURE
    model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
    model.to(torch.device(cfg.MODEL.DEVICE))
    _log_api_usage("modeling.meta_arch." + meta_arch)
    return model
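A minimal usage sketch (not part of the committed file), assuming detectron2's standard `get_cfg` helper is available: it registers a toy meta-architecture (a hypothetical name chosen here) and builds it through the registry, which is the flow build_model implements.

    import torch
    from torch import nn
    from detectron2.config import get_cfg
    from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY, build_model


    @META_ARCH_REGISTRY.register()
    class ToyMetaArch(nn.Module):
        def __init__(self, cfg):
            super().__init__()
            self.linear = nn.Linear(4, 2)

        def forward(self, batched_inputs):
            return self.linear(torch.stack([x["image"] for x in batched_inputs]))


    cfg = get_cfg()
    cfg.MODEL.META_ARCHITECTURE = "ToyMetaArch"
    cfg.MODEL.DEVICE = "cpu"  # avoid requiring a GPU for this sketch
    model = build_model(cfg)  # looks up "ToyMetaArch" in META_ARCH_REGISTRY and calls it with cfg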

detectron2/modeling/meta_arch/dense_detector.py (new file, 0 → 100644)
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import Tensor, nn

from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.layers import move_device_like
from detectron2.modeling import Backbone
from detectron2.structures import Boxes, ImageList, Instances
from detectron2.utils.events import get_event_storage

from ..postprocessing import detector_postprocess


def permute_to_N_HWA_K(tensor, K: int):
    """
    Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K)
    """
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)
    tensor = tensor.reshape(N, -1, K)  # Size=(N,HWA,K)
    return tensor


class DenseDetector(nn.Module):
    """
    Base class for dense detector. We define a dense detector as a fully-convolutional model that
    makes per-pixel (i.e. dense) predictions.
    """

    def __init__(
        self,
        backbone: Backbone,
        head: nn.Module,
        head_in_features: Optional[List[str]] = None,
        *,
        pixel_mean,
        pixel_std,
    ):
        """
        Args:
            backbone: backbone module
            head: head module
            head_in_features: backbone features to use in head. Default to all backbone features.
            pixel_mean (Tuple[float]):
                Values to be used for image normalization (BGR order).
                To train on images of different number of channels, set different mean & std.
                Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
            pixel_std (Tuple[float]):
                When using pre-trained models in Detectron1 or any MSRA models,
                std has been absorbed into its conv1 weights, so the std needs to be set 1.
                Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
        """
        super().__init__()

        self.backbone = backbone
        self.head = head
        if head_in_features is None:
            shapes = self.backbone.output_shape()
            self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride)
        else:
            self.head_in_features = head_in_features
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)

    def forward(self, batched_inputs: List[Dict[str, Tensor]]):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
            loss. Used during training only. In inference, the standard output format, described
            in :doc:`/tutorials/models`.
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        features = [features[f] for f in self.head_in_features]
        predictions = self.head(features)

        if self.training:
            assert not torch.jit.is_scripting(), "Not supported"
            assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
            return self.forward_training(images, features, predictions, gt_instances)
        else:
            results = self.forward_inference(images, features, predictions)
            if torch.jit.is_scripting():
                return results

            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes
            ):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results

    def forward_training(self, images, features, predictions, gt_instances):
        raise NotImplementedError()

    def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        return images

    def _transpose_dense_predictions(
        self, predictions: List[List[Tensor]], dims_per_anchor: List[int]
    ) -> List[List[Tensor]]:
        """
        Transpose the dense per-level predictions.

        Args:
            predictions: a list of outputs, each is a list of per-level
                predictions with shape (N, Ai x K, Hi, Wi), where N is the
                number of images, Ai is the number of anchors per location on
                level i, K is the dimension of predictions per anchor.
            dims_per_anchor: the value of K for each predictions. e.g. 4 for
                box prediction, #classes for classification prediction.

        Returns:
            List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K).
        """
        assert len(predictions) == len(dims_per_anchor)
        res: List[List[Tensor]] = []
        for pred, dim_per_anchor in zip(predictions, dims_per_anchor):
            pred = [permute_to_N_HWA_K(x, dim_per_anchor) for x in pred]
            res.append(pred)
        return res

    def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9):
        """
        Apply EMA update to `self.name` using `value`.

        This is mainly used for loss normalizer. In Detectron1, loss is normalized by number
        of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a
        large variance and using it leads to lower performance. Therefore we maintain an EMA of
        #foreground to stabilize the normalizer.

        Args:
            name: name of the normalizer
            value: the new value to update
            initial_value: the initial value to start with
            momentum: momentum of EMA

        Returns:
            float: the updated EMA value
        """
        if hasattr(self, name):
            old = getattr(self, name)
        else:
            old = initial_value
        new = old * momentum + value * (1 - momentum)
        setattr(self, name, new)
        return new

    def _decode_per_level_predictions(
        self,
        anchors: Boxes,
        pred_scores: Tensor,
        pred_deltas: Tensor,
        score_thresh: float,
        topk_candidates: int,
        image_size: Tuple[int, int],
    ) -> Instances:
        """
        Decode boxes and classification predictions of one feature level, by
        the following steps:
        1. filter the predictions based on score threshold and top K scores.
        2. transform the box regression outputs
        3. return the predicted scores, classes and boxes

        Args:
            anchors: Boxes, anchor for this feature level
            pred_scores: HxWxA,K
            pred_deltas: HxWxA,4

        Returns:
            Instances: with field "scores", "pred_boxes", "pred_classes".
        """
        # Apply two filtering steps to make NMS faster.
        # 1. Keep boxes with confidence score higher than threshold
        keep_idxs = pred_scores > score_thresh
        pred_scores = pred_scores[keep_idxs]
        topk_idxs = torch.nonzero(keep_idxs)  # Kx2

        # 2. Keep top k top scoring boxes only
        topk_idxs_size = topk_idxs.shape[0]
        if isinstance(topk_idxs_size, Tensor):
            # It's a tensor in tracing
            num_topk = torch.clamp(topk_idxs_size, max=topk_candidates)
        else:
            num_topk = min(topk_idxs_size, topk_candidates)
        pred_scores, idxs = pred_scores.topk(num_topk)
        topk_idxs = topk_idxs[idxs]

        anchor_idxs, classes_idxs = topk_idxs.unbind(dim=1)

        pred_boxes = self.box2box_transform.apply_deltas(
            pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs]
        )
        return Instances(
            image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs
        )

    def _decode_multi_level_predictions(
        self,
        anchors: List[Boxes],
        pred_scores: List[Tensor],
        pred_deltas: List[Tensor],
        score_thresh: float,
        topk_candidates: int,
        image_size: Tuple[int, int],
    ) -> Instances:
        """
        Run `_decode_per_level_predictions` for all feature levels and concat the results.
        """
        predictions = [
            self._decode_per_level_predictions(
                anchors_i,
                box_cls_i,
                box_reg_i,
                score_thresh,
                topk_candidates,
                image_size,
            )
            # Iterate over every feature level
            for box_cls_i, box_reg_i, anchors_i in zip(pred_scores, pred_deltas, anchors)
        ]
        return predictions[0].cat(predictions)  # 'Instances.cat' is not scriptable but this is

    def visualize_training(self, batched_inputs, results):
        """
        A function used to visualize ground truth images and final network predictions.
        It shows ground truth bounding boxes on the original image and up to 20
        predicted object bounding boxes on the original image.

        Args:
            batched_inputs (list): a list that contains input to the model.
            results (List[Instances]): a list of #images elements returned by forward_inference().
        """
        from detectron2.utils.visualizer import Visualizer

        assert len(batched_inputs) == len(
            results
        ), "Cannot visualize inputs and results of different sizes"
        storage = get_event_storage()
        max_boxes = 20

        image_index = 0  # only visualize a single image
        img = batched_inputs[image_index]["image"]
        img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
        v_gt = Visualizer(img, None)
        v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
        anno_img = v_gt.get_image()
        processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
        predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()

        v_pred = Visualizer(img, None)
        v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
        prop_img = v_pred.get_image()
        vis_img = np.vstack((anno_img, prop_img))
        vis_img = vis_img.transpose(2, 0, 1)
        vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
        storage.put_image(vis_name, vis_img)
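A minimal usage sketch (not part of the committed file), assuming detectron2 is importable: it shows how permute_to_N_HWA_K reshapes a raw head output of shape (N, A*K, H, W) into the (N, H*W*A, K) layout that the dense losses above consume. The sizes are arbitrary.

    import torch
    from detectron2.modeling.meta_arch.dense_detector import permute_to_N_HWA_K

    N, A, K, H, W = 2, 9, 80, 32, 32       # e.g. 9 anchors, 80 classes on a 32x32 level
    raw = torch.randn(N, A * K, H, W)      # what a classification conv head emits
    flat = permute_to_N_HWA_K(raw, K)
    print(flat.shape)                      # torch.Size([2, 9216, 80]) == (N, H*W*A, K)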

detectron2/modeling/meta_arch/fcos.py (new file, 0 → 100644)
# Copyright (c) Facebook, Inc. and its affiliates.

import logging
from typing import List, Optional, Tuple
import torch
from fvcore.nn import sigmoid_focal_loss_jit
from torch import nn
from torch.nn import functional as F

from detectron2.layers import ShapeSpec, batched_nms
from detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance
from detectron2.utils.events import get_event_storage

from ..anchor_generator import DefaultAnchorGenerator
from ..backbone import Backbone
from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss
from .dense_detector import DenseDetector
from .retinanet import RetinaNetHead

__all__ = ["FCOS"]

logger = logging.getLogger(__name__)


class FCOS(DenseDetector):
    """
    Implement FCOS in :paper:`fcos`.
    """

    def __init__(
        self,
        *,
        backbone: Backbone,
        head: nn.Module,
        head_in_features: Optional[List[str]] = None,
        box2box_transform=None,
        num_classes,
        center_sampling_radius: float = 1.5,
        focal_loss_alpha=0.25,
        focal_loss_gamma=2.0,
        test_score_thresh=0.2,
        test_topk_candidates=1000,
        test_nms_thresh=0.6,
        max_detections_per_image=100,
        pixel_mean,
        pixel_std,
    ):
        """
        Args:
            center_sampling_radius: radius of the "center" of a groundtruth box,
                within which all anchor points are labeled positive.
            Other arguments mean the same as in :class:`RetinaNet`.
        """
        super().__init__(
            backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
        )

        self.num_classes = num_classes

        # FCOS uses one anchor point per location.
        # We represent the anchor point by a box whose size equals the anchor stride.
        feature_shapes = backbone.output_shape()
        fpn_strides = [feature_shapes[k].stride for k in self.head_in_features]
        self.anchor_generator = DefaultAnchorGenerator(
            sizes=[[k] for k in fpn_strides], aspect_ratios=[1.0], strides=fpn_strides
        )

        # FCOS parameterizes box regression by a linear transform,
        # where predictions are normalized by anchor stride (equal to anchor size).
        if box2box_transform is None:
            box2box_transform = Box2BoxTransformLinear(normalize_by_size=True)
        self.box2box_transform = box2box_transform

        self.center_sampling_radius = float(center_sampling_radius)

        # Loss parameters:
        self.focal_loss_alpha = focal_loss_alpha
        self.focal_loss_gamma = focal_loss_gamma

        # Inference parameters:
        self.test_score_thresh = test_score_thresh
        self.test_topk_candidates = test_topk_candidates
        self.test_nms_thresh = test_nms_thresh
        self.max_detections_per_image = max_detections_per_image

    def forward_training(self, images, features, predictions, gt_instances):
        # Transpose the Hi*Wi*A dimension to the middle:
        pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4, 1]
        )
        anchors = self.anchor_generator(features)
        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
        return self.losses(
            anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
        )

    @torch.no_grad()
    def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
        """
        Match ground-truth boxes to a set of multi-level anchors.

        Args:
            gt_boxes: Ground-truth boxes from instances of an image.
            anchors: List of anchors for each feature map (of different scales).

        Returns:
            torch.Tensor
                A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
                `R` anchor points from all feature levels, indicating the quality
                of match between m-th box and r-th anchor. Higher value indicates
                better match.
        """
        # Naming convention: (M = ground-truth boxes, R = anchor points)
        # Anchor points are represented as square boxes of size = stride.
        num_anchors_per_level = [len(x) for x in anchors]
        anchors = Boxes.cat(anchors)  # (R, 4)
        anchor_centers = anchors.get_centers()  # (R, 2)
        anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # (R, )

        lower_bound = anchor_sizes * 4
        lower_bound[: num_anchors_per_level[0]] = 0
        upper_bound = anchor_sizes * 8
        upper_bound[-num_anchors_per_level[-1] :] = float("inf")

        gt_centers = gt_boxes.get_centers()

        # FCOS with center sampling: anchor point must be close enough to
        # ground-truth box center.
        center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_()
        sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]

        match_quality_matrix = center_dists.max(dim=2).values < sampling_regions

        pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
        pairwise_dist = pairwise_dist.permute(1, 0, 2)  # (M, R, 4)

        # The original FCOS anchor matching rule: anchor point must be inside GT.
        match_quality_matrix &= pairwise_dist.min(dim=2).values > 0

        # Multilevel anchor matching in FCOS: each anchor is only responsible
        # for certain scale range.
        pairwise_dist = pairwise_dist.max(dim=2).values
        match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
            pairwise_dist < upper_bound[None, :]
        )
        # Match the GT box with minimum area, if there are multiple GT matches.
        gt_areas = gt_boxes.area()  # (M, )

        match_quality_matrix = match_quality_matrix.to(torch.float32)
        match_quality_matrix *= 1e8 - gt_areas[:, None]
        return match_quality_matrix  # (M, R)

    @torch.no_grad()
    def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
        """
        Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS
        anchor matching rule.

        Unlike RetinaNet, there are no ignored anchors.
        """

        gt_labels, matched_gt_boxes = [], []

        for inst in gt_instances:
            if len(inst) > 0:
                match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors)

                # Find matched ground-truth box per anchor. Un-matched anchors are
                # assigned -1. This is equivalent to using an anchor matcher as used
                # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])`
                match_quality, matched_idxs = match_quality_matrix.max(dim=0)
                matched_idxs[match_quality < 1e-5] = -1

                matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)]
                gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)]

                # Anchors with matched_idxs = -1 are labeled background.
                gt_labels_i[matched_idxs < 0] = self.num_classes
            else:
                matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor)
                gt_labels_i = torch.full(
                    (len(matched_gt_boxes_i),),
                    fill_value=self.num_classes,
                    dtype=torch.long,
                    device=matched_gt_boxes_i.device,
                )

            gt_labels.append(gt_labels_i)
            matched_gt_boxes.append(matched_gt_boxes_i)

        return gt_labels, matched_gt_boxes

    def losses(
        self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
    ):
        """
        This method is almost identical to :meth:`RetinaNet.losses`, with an extra
        "loss_centerness" in the returned dict.
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (M, R)

        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
        num_pos_anchors = pos_mask.sum().item()
        get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
        normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300)

        # classification and regression loss
        gt_labels_target = F.one_hot(gt_labels, num_classes=self.num_classes + 1)[
            :, :, :-1
        ]  # no loss for the last (background) class
        loss_cls = sigmoid_focal_loss_jit(
            torch.cat(pred_logits, dim=1),
            gt_labels_target.to(pred_logits[0].dtype),
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )

        loss_box_reg = _dense_box_regression_loss(
            anchors,
            self.box2box_transform,
            pred_anchor_deltas,
            gt_boxes,
            pos_mask,
            box_reg_loss_type="giou",
        )

        ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes)  # (M, R)
        pred_centerness = torch.cat(pred_centerness, dim=1).squeeze(dim=2)  # (M, R)
        ctrness_loss = F.binary_cross_entropy_with_logits(
            pred_centerness[pos_mask], ctrness_targets[pos_mask], reduction="sum"
        )
        return {
            "loss_fcos_cls": loss_cls / normalizer,
            "loss_fcos_loc": loss_box_reg / normalizer,
            "loss_fcos_ctr": ctrness_loss / normalizer,
        }

    def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
        anchors = Boxes.cat(anchors).tensor  # Rx4
        reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes]
        reg_targets = torch.stack(reg_targets, dim=0)  # NxRx4
        if len(reg_targets) == 0:
            return reg_targets.new_zeros(len(reg_targets))
        left_right = reg_targets[:, :, [0, 2]]
        top_bottom = reg_targets[:, :, [1, 3]]
        ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
            top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
        )
        return torch.sqrt(ctrness)

    def forward_inference(
        self,
        images: ImageList,
        features: List[torch.Tensor],
        predictions: List[List[torch.Tensor]],
    ):
        pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4, 1]
        )
        anchors = self.anchor_generator(features)

        results: List[Instances] = []
        for img_idx, image_size in enumerate(images.image_sizes):
            scores_per_image = [
                # Multiply and sqrt centerness & classification scores
                # (See eqn. 4 in https://arxiv.org/abs/2006.09214)
                torch.sqrt(x[img_idx].sigmoid_() * y[img_idx].sigmoid_())
                for x, y in zip(pred_logits, pred_centerness)
            ]
            deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
            results_per_image = self.inference_single_image(
                anchors, scores_per_image, deltas_per_image, image_size
            )
            results.append(results_per_image)
        return results

    def inference_single_image(
        self,
        anchors: List[Boxes],
        box_cls: List[torch.Tensor],
        box_delta: List[torch.Tensor],
        image_size: Tuple[int, int],
    ):
        """
        Identical to :meth:`RetinaNet.inference_single_image`.
        """
        pred = self._decode_multi_level_predictions(
            anchors,
            box_cls,
            box_delta,
            self.test_score_thresh,
            self.test_topk_candidates,
            image_size,
        )
        keep = batched_nms(
            pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
        )
        return pred[keep[: self.max_detections_per_image]]


class FCOSHead(RetinaNetHead):
    """
    The head used in :paper:`fcos`. It adds an additional centerness
    prediction branch on top of :class:`RetinaNetHead`.
    """

    def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs):
        super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs)
        # Unlike original FCOS, we do not add an additional learnable scale layer
        # because it's found to have no benefits after normalizing regression targets by stride.
        self._num_features = len(input_shape)
        self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1)
        torch.nn.init.normal_(self.ctrness.weight, std=0.01)
        torch.nn.init.constant_(self.ctrness.bias, 0)

    def forward(self, features):
        assert len(features) == self._num_features
        logits = []
        bbox_reg = []
        ctrness = []
        for feature in features:
            logits.append(self.cls_score(self.cls_subnet(feature)))
            bbox_feature = self.bbox_subnet(feature)
            bbox_reg.append(self.bbox_pred(bbox_feature))
            ctrness.append(self.ctrness(bbox_feature))
        return logits, bbox_reg, ctrness
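A minimal sketch (not part of the committed file) of the centerness target that compute_ctrness_targets produces, computed directly from (l, t, r, b) distances as sqrt(min(l,r)/max(l,r) * min(t,b)/max(t,b)); the helper name and sample values below are illustrative only.

    import torch

    def centerness(ltrb: torch.Tensor) -> torch.Tensor:
        # Same formula as FCOS.compute_ctrness_targets, applied to (N, 4) distances.
        left_right = ltrb[:, [0, 2]]
        top_bottom = ltrb[:, [1, 3]]
        return torch.sqrt(
            (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0])
            * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
        )

    # A point at the exact box center gets 1.0; a point near a corner gets a small value.
    print(centerness(torch.tensor([[5.0, 5.0, 5.0, 5.0], [1.0, 1.0, 9.0, 9.0]])))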

detectron2/modeling/meta_arch/panoptic_fpn.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

import logging
from typing import Dict, List
import torch
from torch import nn

from detectron2.config import configurable
from detectron2.structures import ImageList

from ..postprocessing import detector_postprocess, sem_seg_postprocess
from .build import META_ARCH_REGISTRY
from .rcnn import GeneralizedRCNN
from .semantic_seg import build_sem_seg_head

__all__ = ["PanopticFPN"]


@META_ARCH_REGISTRY.register()
class PanopticFPN(GeneralizedRCNN):
    """
    Implement the paper :paper:`PanopticFPN`.
    """

    @configurable
    def __init__(
        self,
        *,
        sem_seg_head: nn.Module,
        combine_overlap_thresh: float = 0.5,
        combine_stuff_area_thresh: float = 4096,
        combine_instances_score_thresh: float = 0.5,
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            sem_seg_head: a module for the semantic segmentation head.
            combine_overlap_thresh: combine masks into one instance if
                they have enough overlap
            combine_stuff_area_thresh: ignore stuff areas smaller than this threshold
            combine_instances_score_thresh: ignore instances whose score is
                smaller than this threshold

        Other arguments are the same as :class:`GeneralizedRCNN`.
        """
        super().__init__(**kwargs)
        self.sem_seg_head = sem_seg_head
        # options when combining instance & semantic outputs
        self.combine_overlap_thresh = combine_overlap_thresh
        self.combine_stuff_area_thresh = combine_stuff_area_thresh
        self.combine_instances_score_thresh = combine_instances_score_thresh

    @classmethod
    def from_config(cls, cfg):
        ret = super().from_config(cfg)
        ret.update(
            {
                "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH,
                "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT,
                "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH,  # noqa
            }
        )
        ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape())
        logger = logging.getLogger(__name__)
        if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED:
            logger.warning(
                "PANOPTIC_FPN.COMBINED.ENABLED is no longer used. "
                " model.inference(do_postprocess=) should be used to toggle postprocessing."
            )
        if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0:
            w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
            logger.warning(
                "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head."
            )

            def update_weight(x):
                if isinstance(x, dict):
                    return {k: v * w for k, v in x.items()}
                else:
                    return x * w

            roi_heads = ret["roi_heads"]
            roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight)
            roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight)
        return ret

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                * "image": Tensor, image in (C, H, W) format.
                * "instances": Instances
                * "sem_seg": semantic segmentation ground truth.
                * Other information that's included in the original dicts, such as:
                  "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                each dict has the results for one image. The dict contains the following keys:

                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
                * "panoptic_seg": See the return value of
                  :func:`combine_semantic_and_instance_outputs` for its format.
        """
        if not self.training:
            return self.inference(batched_inputs)
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        assert "sem_seg" in batched_inputs[0]
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg,
            self.backbone.size_divisibility,
            self.sem_seg_head.ignore_value,
            self.backbone.padding_constraints,
        ).tensor
        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)

        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        detector_results, detector_losses = self.roi_heads(
            images, features, proposals, gt_instances
        )

        losses = sem_seg_losses
        losses.update(proposal_losses)
        losses.update(detector_losses)
        return losses

    def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True):
        """
        Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`
            do_postprocess (bool): whether to apply post-processing on the outputs.

        Returns:
            When do_postprocess=True, see docs in :meth:`forward`.
            Otherwise, returns a (list[Instances], list[Tensor]) that contains
            the raw detector outputs, and raw semantic segmentation outputs.
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, None)
        proposals, _ = self.proposal_generator(images, features, None)
        detector_results, _ = self.roi_heads(images, features, proposals, None)

        if do_postprocess:
            processed_results = []
            for sem_seg_result, detector_result, input_per_image, image_size in zip(
                sem_seg_results, detector_results, batched_inputs, images.image_sizes
            ):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
                detector_r = detector_postprocess(detector_result, height, width)

                processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})

                panoptic_r = combine_semantic_and_instance_outputs(
                    detector_r,
                    sem_seg_r.argmax(dim=0),
                    self.combine_overlap_thresh,
                    self.combine_stuff_area_thresh,
                    self.combine_instances_score_thresh,
                )
                processed_results[-1]["panoptic_seg"] = panoptic_r
            return processed_results
        else:
            return detector_results, sem_seg_results


def combine_semantic_and_instance_outputs(
    instance_results,
    semantic_results,
    overlap_threshold,
    stuff_area_thresh,
    instances_score_thresh,
):
    """
    Implement a simple combining logic following
    "combine_semantic_and_instance_predictions.py" in panopticapi
    to produce panoptic segmentation outputs.

    Args:
        instance_results: output of :func:`detector_postprocess`.
        semantic_results: an (H, W) tensor, each element is the contiguous semantic
            category id

    Returns:
        panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
        segments_info (list[dict]): Describe each segment in `panoptic_seg`.
            Each dict contains keys "id", "category_id", "isthing".
    """
    panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)

    # sort instance outputs by scores
    sorted_inds = torch.argsort(-instance_results.scores)

    current_segment_id = 0
    segments_info = []

    instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)

    # Add instances one-by-one, check for overlaps with existing ones
    for inst_id in sorted_inds:
        score = instance_results.scores[inst_id].item()
        if score < instances_score_thresh:
            break
        mask = instance_masks[inst_id]  # H,W
        mask_area = mask.sum().item()

        if mask_area == 0:
            continue

        intersect = (mask > 0) & (panoptic_seg > 0)
        intersect_area = intersect.sum().item()

        if intersect_area * 1.0 / mask_area > overlap_threshold:
            continue

        if intersect_area > 0:
            mask = mask & (panoptic_seg == 0)

        current_segment_id += 1
        panoptic_seg[mask] = current_segment_id
        segments_info.append(
            {
                "id": current_segment_id,
                "isthing": True,
                "score": score,
                "category_id": instance_results.pred_classes[inst_id].item(),
                "instance_id": inst_id.item(),
            }
        )

    # Add semantic results to remaining empty areas
    semantic_labels = torch.unique(semantic_results).cpu().tolist()
    for semantic_label in semantic_labels:
        if semantic_label == 0:  # 0 is a special "thing" class
            continue
        mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
        mask_area = mask.sum().item()
        if mask_area < stuff_area_thresh:
            continue

        current_segment_id += 1
        panoptic_seg[mask] = current_segment_id
        segments_info.append(
            {
                "id": current_segment_id,
                "isthing": False,
                "category_id": semantic_label,
                "area": mask_area,
            }
        )

    return panoptic_seg, segments_info
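A minimal sketch (not part of the committed file), assuming detectron2's Instances structure is importable: it feeds one fabricated instance mask and a uniform "stuff" map through combine_semantic_and_instance_outputs to show how segment ids are assigned. All values below are made up for illustration.

    import torch
    from detectron2.structures import Instances
    from detectron2.modeling.meta_arch.panoptic_fpn import combine_semantic_and_instance_outputs

    H, W = 8, 8
    inst = Instances((H, W))
    inst.scores = torch.tensor([0.9])
    inst.pred_classes = torch.tensor([3])
    masks = torch.zeros(1, H, W, dtype=torch.bool)
    masks[0, 2:6, 2:6] = True                      # one 4x4 "thing" mask
    inst.pred_masks = masks

    semantic = torch.ones(H, W, dtype=torch.long)  # every pixel predicted as stuff class 1

    panoptic_seg, segments_info = combine_semantic_and_instance_outputs(
        inst, semantic, overlap_threshold=0.5, stuff_area_thresh=10, instances_score_thresh=0.5
    )
    print(panoptic_seg.unique().tolist(), [s["id"] for s in segments_info])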

detectron2/modeling/meta_arch/rcnn.py (new file, 0 → 100644)
# Copyright (c) Facebook, Inc. and its affiliates.
import
logging
import
numpy
as
np
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
torch
from
torch
import
nn
from
detectron2.config
import
configurable
from
detectron2.data.detection_utils
import
convert_image_to_rgb
from
detectron2.layers
import
move_device_like
from
detectron2.structures
import
ImageList
,
Instances
from
detectron2.utils.events
import
get_event_storage
from
detectron2.utils.logger
import
log_first_n
from
..backbone
import
Backbone
,
build_backbone
from
..postprocessing
import
detector_postprocess
from
..proposal_generator
import
build_proposal_generator
from
..roi_heads
import
build_roi_heads
from
.build
import
META_ARCH_REGISTRY
__all__
=
[
"GeneralizedRCNN"
,
"ProposalNetwork"
]
@META_ARCH_REGISTRY.register()
class GeneralizedRCNN(nn.Module):
    """
    Generalized R-CNN. Any model that contains the following three components:
    1. Per-image feature extraction (aka backbone)
    2. Region proposal generation
    3. Per-region feature extraction and prediction
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        roi_heads: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
        input_format: Optional[str] = None,
        vis_period: int = 0,
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            roi_heads: a ROI head that performs per-region computation
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
            input_format: describe the meaning of channels of input. Needed by visualization
            vis_period: the period to run visualization. Set to 0 to disable.
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads

        self.input_format = input_format
        self.vis_period = vis_period
        if vis_period > 0:
            assert input_format is not None, "input_format is required for visualization!"

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "roi_heads": build_roi_heads(cfg, backbone.output_shape()),
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)

    def visualize_training(self, batched_inputs, proposals):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image and up to 20 top-scoring predicted
        object proposals on the original image. Users can implement different
        visualization functions for different models.

        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list that contains predicted proposals. Both
                batched_inputs and proposals should have the same length.
        """
        from detectron2.utils.visualizer import Visualizer

        storage = get_event_storage()
        max_vis_prop = 20

        for input, prop in zip(batched_inputs, proposals):
            img = input["image"]
            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            v_gt = Visualizer(img, None)
            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
            anno_img = v_gt.get_image()
            box_size = min(len(prop.proposal_boxes), max_vis_prop)
            v_pred = Visualizer(img, None)
            v_pred = v_pred.overlay_instances(
                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
            )
            prop_img = v_pred.get_image()
            vis_img = np.concatenate((anno_img, prop_img), axis=1)
            vis_img = vis_img.transpose(2, 0, 1)
            vis_name = "Left: GT bounding boxes; Right: Predicted proposals"
            storage.put_image(vis_name, vis_img)
            break  # only visualize one image in a batch

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances (optional): groundtruth :class:`Instances`
                * proposals (optional): :class:`Instances`, precomputed proposals.

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "instances" whose value is a :class:`Instances`.
                The :class:`Instances` object has the following keys:
                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)

        if self.proposal_generator is not None:
            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            proposal_losses = {}

        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        """
        Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`
            detected_instances (None or list[Instances]): if not None, it
                contains an `Instances` object per image. The `Instances`
                object contains "pred_boxes" and "pred_classes" which are
                known boxes in the image.
                The inference will then skip the detection of bounding boxes,
                and only predict other per-ROI outputs.
            do_postprocess (bool): whether to apply post-processing on the outputs.

        Returns:
            When do_postprocess=True, same as in :meth:`forward`.
            Otherwise, a list[Instances] containing raw network outputs.
        """
        assert not self.training

        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        if detected_instances is None:
            if self.proposal_generator is not None:
                proposals, _ = self.proposal_generator(images, features, None)
            else:
                assert "proposals" in batched_inputs[0]
                proposals = [x["proposals"].to(self.device) for x in batched_inputs]

            results, _ = self.roi_heads(images, features, proposals, None)
        else:
            detected_instances = [x.to(self.device) for x in detected_instances]
            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        return results

    def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        return images

    @staticmethod
    def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes):
        """
        Rescale the output instances to the target size.
        """
        # note: private function; subject to changes
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            instances, batched_inputs, image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results
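
A minimal usage sketch for the class above (editor's addition, not part of rcnn.py): it assumes a detectron2 config `cfg` whose `MODEL.META_ARCHITECTURE` is `"GeneralizedRCNN"` and an image tensor `img` in `cfg.INPUT.FORMAT` channel order; the variable names and the 800x1333 output size are illustrative.

import torch
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer

model = build_model(cfg)  # resolved through META_ARCH_REGISTRY
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)  # load pretrained weights, if configured
model.eval()
with torch.no_grad():
    # "height"/"width" request the output resolution used by _postprocess
    outputs = model([{"image": img, "height": 800, "width": 1333}])
instances = outputs[0]["instances"]  # pred_boxes, scores, pred_classes, ...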
@META_ARCH_REGISTRY.register()
class ProposalNetwork(nn.Module):
    """
    A meta architecture that only predicts object proposals.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)

    def forward(self, batched_inputs):
        """
        Args:
            Same as in :class:`GeneralizedRCNN.forward`

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "proposals" whose value is a
                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
        """
        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        features = self.backbone(images.tensor)

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
            )
            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        # In training, the proposals are not useful at all but we generate them anyway.
        # This makes RPN-only models about 5% slower.
        if self.training:
            return proposal_losses

        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            proposals, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"proposals": r})
        return processed_results
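
A short sketch of how the proposal-only output can be consumed (editor's addition; `model`, `img`, `h`, `w` are assumed to come from a config with `MODEL.META_ARCHITECTURE = "ProposalNetwork"` and are not defined in this file):

import torch

model.eval()
with torch.no_grad():
    outputs = model([{"image": img, "height": h, "width": w}])
props = outputs[0]["proposals"]  # Instances with proposal_boxes and objectness_logits
# keep the 100 highest-scoring proposals, already rescaled to (h, w) by detector_postprocess
top = props[props.objectness_logits.argsort(descending=True)[:100]]
print(top.proposal_boxes.tensor.shape)  # (<=100, 4)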
detectron2/modeling/meta_arch/retinanet.py
0 → 100644
View file @
3144257c
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import math
from typing import List, Tuple
import torch
from fvcore.nn import sigmoid_focal_loss_jit
from torch import Tensor, nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm
from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
from detectron2.utils.events import get_event_storage

from ..anchor_generator import build_anchor_generator
from ..backbone import Backbone, build_backbone
from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
from ..matcher import Matcher
from .build import META_ARCH_REGISTRY
from .dense_detector import DenseDetector, permute_to_N_HWA_K  # noqa

__all__ = ["RetinaNet"]


logger = logging.getLogger(__name__)


@META_ARCH_REGISTRY.register()
class RetinaNet(DenseDetector):
    """
    Implement RetinaNet in :paper:`RetinaNet`.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        head: nn.Module,
        head_in_features,
        anchor_generator,
        box2box_transform,
        anchor_matcher,
        num_classes,
        focal_loss_alpha=0.25,
        focal_loss_gamma=2.0,
        smooth_l1_beta=0.0,
        box_reg_loss_type="smooth_l1",
        test_score_thresh=0.05,
        test_topk_candidates=1000,
        test_nms_thresh=0.5,
        max_detections_per_image=100,
        pixel_mean,
        pixel_std,
        vis_period=0,
        input_format="BGR",
    ):
        """
        NOTE: this interface is experimental.

        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            head (nn.Module): a module that predicts logits and regression deltas
                for each level from a list of per-level features
            head_in_features (Tuple[str]): Names of the input feature maps to be used in head
            anchor_generator (nn.Module): a module that creates anchors from a
                list of features. Usually an instance of :class:`AnchorGenerator`
            box2box_transform (Box2BoxTransform): defines the transform from anchor boxes to
                instance boxes
            anchor_matcher (Matcher): label the anchors by matching them with ground truth.
            num_classes (int): number of classes. Used to label background proposals.

            # Loss parameters:
            focal_loss_alpha (float): focal_loss_alpha
            focal_loss_gamma (float): focal_loss_gamma
            smooth_l1_beta (float): smooth_l1_beta
            box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou"

            # Inference parameters:
            test_score_thresh (float): Inference cls score threshold, only anchors with
                score > INFERENCE_TH are considered for inference (to improve speed)
            test_topk_candidates (int): Select topk candidates before NMS
            test_nms_thresh (float): Overlap threshold used for non-maximum suppression
                (suppress boxes with IoU >= this threshold)
            max_detections_per_image (int):
                Maximum number of detections to return per image during inference
                (100 is based on the limit established for the COCO dataset).

            pixel_mean, pixel_std: see :class:`DenseDetector`.
        """
        super().__init__(
            backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
        )
        self.num_classes = num_classes

        # Anchors
        self.anchor_generator = anchor_generator
        self.box2box_transform = box2box_transform
        self.anchor_matcher = anchor_matcher

        # Loss parameters:
        self.focal_loss_alpha = focal_loss_alpha
        self.focal_loss_gamma = focal_loss_gamma
        self.smooth_l1_beta = smooth_l1_beta
        self.box_reg_loss_type = box_reg_loss_type
        # Inference parameters:
        self.test_score_thresh = test_score_thresh
        self.test_topk_candidates = test_topk_candidates
        self.test_nms_thresh = test_nms_thresh
        self.max_detections_per_image = max_detections_per_image
        # Vis parameters
        self.vis_period = vis_period
        self.input_format = input_format

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        backbone_shape = backbone.output_shape()
        feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
        head = RetinaNetHead(cfg, feature_shapes)
        anchor_generator = build_anchor_generator(cfg, feature_shapes)
        return {
            "backbone": backbone,
            "head": head,
            "anchor_generator": anchor_generator,
            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS),
            "anchor_matcher": Matcher(
                cfg.MODEL.RETINANET.IOU_THRESHOLDS,
                cfg.MODEL.RETINANET.IOU_LABELS,
                allow_low_quality_matches=True,
            ),
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
            "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
            "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES,
            # Loss parameters:
            "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA,
            "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA,
            "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA,
            "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE,
            # Inference parameters:
            "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST,
            "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST,
            "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST,
            "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
            # Vis parameters
            "vis_period": cfg.VIS_PERIOD,
            "input_format": cfg.INPUT.FORMAT,
        }

    def forward_training(self, images, features, predictions, gt_instances):
        # Transpose the Hi*Wi*A dimension to the middle:
        pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4]
        )
        anchors = self.anchor_generator(features)
        gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
        return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)

    def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
        """
        Args:
            anchors (list[Boxes]): a list of #feature level Boxes
            gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
                Their shapes are (N, R) and (N, R, 4), respectively, where R is
                the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
            pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
                list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4),
                where K is the number of classes used in `pred_logits`.

        Returns:
            dict[str, Tensor]:
                mapping from a named loss to a scalar tensor storing the loss.
                Used during training only. The dict keys are: "loss_cls" and "loss_box_reg"
        """
        num_images = len(gt_labels)
        gt_labels = torch.stack(gt_labels)  # (N, R)

        valid_mask = gt_labels >= 0
        pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
        num_pos_anchors = pos_mask.sum().item()
        get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
        normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100)

        # classification and regression loss
        gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
            :, :-1
        ]  # no loss for the last (background) class
        loss_cls = sigmoid_focal_loss_jit(
            cat(pred_logits, dim=1)[valid_mask],
            gt_labels_target.to(pred_logits[0].dtype),
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        )

        loss_box_reg = _dense_box_regression_loss(
            anchors,
            self.box2box_transform,
            pred_anchor_deltas,
            gt_boxes,
            pos_mask,
            box_reg_loss_type=self.box_reg_loss_type,
            smooth_l1_beta=self.smooth_l1_beta,
        )

        return {
            "loss_cls": loss_cls / normalizer,
            "loss_box_reg": loss_box_reg / normalizer,
        }
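
The classification target construction above is easy to miss: labels are one-hot encoded over `num_classes + 1` values and the background column is dropped, so background anchors get an all-zero target, while ignored anchors are filtered out beforehand by `valid_mask`. A tiny self-contained illustration (editor's addition; `K = 3` is an arbitrary assumption):

import torch
import torch.nn.functional as F

K = 3                                    # assumed number of foreground classes
gt_labels = torch.tensor([0, 2, K, -1])  # foreground, foreground, background (==K), ignored (-1)
valid_mask = gt_labels >= 0
pos_mask = (gt_labels >= 0) & (gt_labels != K)  # only these anchors contribute to the box loss
targets = F.one_hot(gt_labels[valid_mask], num_classes=K + 1)[:, :-1]
print(targets)
# tensor([[1, 0, 0],
#         [0, 0, 1],
#         [0, 0, 0]])   <- background row is all zeros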
    @torch.no_grad()
    def label_anchors(self, anchors, gt_instances):
        """
        Args:
            anchors (list[Boxes]): A list of #feature level Boxes.
                The Boxes contains anchors of this image on the specific feature level.
            gt_instances (list[Instances]): a list of N `Instances`s. The i-th
                `Instances` contains the ground-truth per-instance annotations
                for the i-th input image.

        Returns:
            list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is
            the total number of anchors across all feature maps (sum(Hi * Wi * A)).
            Label values are in {-1, 0, ..., K}, where -1 means ignore and K means background.

            list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors
            across feature maps. The values are the matched gt boxes for each anchor.
            Values are undefined for those anchors not labeled as foreground.
        """
        anchors = Boxes.cat(anchors)  # Rx4

        gt_labels = []
        matched_gt_boxes = []
        for gt_per_image in gt_instances:
            match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
            matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
            del match_quality_matrix

            if len(gt_per_image) > 0:
                matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]

                gt_labels_i = gt_per_image.gt_classes[matched_idxs]
                # Anchors with label 0 are treated as background.
                gt_labels_i[anchor_labels == 0] = self.num_classes
                # Anchors with label -1 are ignored.
                gt_labels_i[anchor_labels == -1] = -1
            else:
                matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
                gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes

            gt_labels.append(gt_labels_i)
            matched_gt_boxes.append(matched_gt_boxes_i)

        return gt_labels, matched_gt_boxes

    def forward_inference(
        self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]]
    ):
        pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
            predictions, [self.num_classes, 4]
        )
        anchors = self.anchor_generator(features)

        results: List[Instances] = []
        for img_idx, image_size in enumerate(images.image_sizes):
            scores_per_image = [x[img_idx].sigmoid_() for x in pred_logits]
            deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
            results_per_image = self.inference_single_image(
                anchors, scores_per_image, deltas_per_image, image_size
            )
            results.append(results_per_image)
        return results

    def inference_single_image(
        self,
        anchors: List[Boxes],
        box_cls: List[Tensor],
        box_delta: List[Tensor],
        image_size: Tuple[int, int],
    ):
        """
        Single-image inference. Return bounding-box detection results by thresholding
        on scores and applying non-maximum suppression (NMS).

        Arguments:
            anchors (list[Boxes]): list of #feature levels. Each entry contains
                a Boxes object, which contains all the anchors in that feature level.
            box_cls (list[Tensor]): list of #feature levels. Each entry contains
                tensor of size (H x W x A, K)
            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
            image_size (tuple(H, W)): a tuple of the image height and width.

        Returns:
            Same as `inference`, but for only one image.
        """
        pred = self._decode_multi_level_predictions(
            anchors,
            box_cls,
            box_delta,
            self.test_score_thresh,
            self.test_topk_candidates,
            image_size,
        )
        keep = batched_nms(  # per-class NMS
            pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
        )
        return pred[keep[: self.max_detections_per_image]]


class RetinaNetHead(nn.Module):
    """
    The head used in RetinaNet for object classification and box regression.
    It has two subnets for the two tasks, with a common structure but separate parameters.
    """

    @configurable
    def __init__(
        self,
        *,
        input_shape: List[ShapeSpec],
        num_classes,
        num_anchors,
        conv_dims: List[int],
        norm="",
        prior_prob=0.01,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (List[ShapeSpec]): input shape
            num_classes (int): number of classes. Used to label background proposals.
            num_anchors (int): number of generated anchors
            conv_dims (List[int]): dimensions for each convolution layer
            norm (str or callable):
                Normalization for conv layers except for the two output layers.
                See :func:`detectron2.layers.get_norm` for supported types.
            prior_prob (float): Prior weight for computing bias
        """
        super().__init__()
        self._num_features = len(input_shape)
        if norm == "BN" or norm == "SyncBN":
            logger.info(
                f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}."
            )
            bn_class = nn.BatchNorm2d if norm == "BN" else nn.SyncBatchNorm

            def norm(c):
                return CycleBatchNormList(
                    length=self._num_features, bn_class=bn_class, num_features=c
                )

        else:
            norm_name = str(type(get_norm(norm, 32)))
            if "BN" in norm_name:
                logger.warning(
                    f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead."
                )

        cls_subnet = []
        bbox_subnet = []
        for in_channels, out_channels in zip(
            [input_shape[0].channels] + list(conv_dims), conv_dims
        ):
            cls_subnet.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            if norm:
                cls_subnet.append(get_norm(norm, out_channels))
            cls_subnet.append(nn.ReLU())
            bbox_subnet.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            if norm:
                bbox_subnet.append(get_norm(norm, out_channels))
            bbox_subnet.append(nn.ReLU())

        self.cls_subnet = nn.Sequential(*cls_subnet)
        self.bbox_subnet = nn.Sequential(*bbox_subnet)
        self.cls_score = nn.Conv2d(
            conv_dims[-1], num_anchors * num_classes, kernel_size=3, stride=1, padding=1
        )
        self.bbox_pred = nn.Conv2d(
            conv_dims[-1], num_anchors * 4, kernel_size=3, stride=1, padding=1
        )

        # Initialization
        for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
            for layer in modules.modules():
                if isinstance(layer, nn.Conv2d):
                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                    torch.nn.init.constant_(layer.bias, 0)

        # Use prior in model initialization to improve stability
        bias_value = -(math.log((1 - prior_prob) / prior_prob))
        torch.nn.init.constant_(self.cls_score.bias, bias_value)

    @classmethod
    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
        num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
        assert (
            len(set(num_anchors)) == 1
        ), "Using different number of anchors between levels is not currently supported!"
        num_anchors = num_anchors[0]

        return {
            "input_shape": input_shape,
            "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
            "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS,
            "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB,
            "norm": cfg.MODEL.RETINANET.NORM,
            "num_anchors": num_anchors,
        }

    def forward(self, features: List[Tensor]):
        """
        Arguments:
            features (list[Tensor]): FPN feature map tensors in high to low resolution.
                Each tensor in the list corresponds to a different feature level.

        Returns:
            logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
                The tensor predicts the classification probability
                at each spatial position for each of the A anchors and K object
                classes.
            bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
                The tensor predicts 4-vector (dx,dy,dw,dh) box
                regression values for every anchor. These values are the
                relative offset between the anchor and the ground truth box.
        """
        assert len(features) == self._num_features
        logits = []
        bbox_reg = []
        for feature in features:
            logits.append(self.cls_score(self.cls_subnet(feature)))
            bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
        return logits, bbox_reg
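
A shape sanity-check sketch for the head defined above (editor's addition; the feature sizes and anchor/class counts are illustrative assumptions, and `@configurable` allows constructing the head directly from keyword arguments without a cfg):

import torch
from detectron2.layers import ShapeSpec

shapes = [ShapeSpec(channels=256, stride=8), ShapeSpec(channels=256, stride=16)]
head = RetinaNetHead(input_shape=shapes, num_classes=80, num_anchors=9, conv_dims=[256] * 4)

feats = [torch.randn(2, 256, 64, 64), torch.randn(2, 256, 32, 32)]
logits, bbox_reg = head(feats)
print(logits[0].shape)    # torch.Size([2, 720, 64, 64])  -> (N, A*K, Hi, Wi)
print(bbox_reg[1].shape)  # torch.Size([2, 36, 32, 32])   -> (N, A*4, Hi, Wi)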
detectron2/modeling/meta_arch/semantic_seg.py
0 → 100644
View file @
3144257c
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
from typing import Callable, Dict, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.structures import ImageList
from detectron2.utils.registry import Registry

from ..backbone import Backbone, build_backbone
from ..postprocessing import sem_seg_postprocess
from .build import META_ARCH_REGISTRY

__all__ = [
    "SemanticSegmentor",
    "SEM_SEG_HEADS_REGISTRY",
    "SemSegFPNHead",
    "build_sem_seg_head",
]


SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
SEM_SEG_HEADS_REGISTRY.__doc__ = """
Registry for semantic segmentation heads, which make semantic segmentation predictions
from feature maps.
"""


@META_ARCH_REGISTRY.register()
class SemanticSegmentor(nn.Module):
    """
    Main class for semantic segmentation architectures.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.sem_seg_head = sem_seg_head
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
        return {
            "backbone": backbone,
            "sem_seg_head": sem_seg_head,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        return self.pixel_mean.device

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                * "image": Tensor, image in (C, H, W) format.
                * "sem_seg": semantic segmentation ground truth
                * Other information that's included in the original dicts, such as:
                  "height", "width" (int): the output resolution of the model (may be different
                  from input resolution), used in inference.

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "sem_seg" whose value is a
                Tensor that represents the per-pixel segmentation predicted by the head.
                The prediction has shape KxHxW that represents the logits of
                each class for each pixel.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(
            images,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )

        features = self.backbone(images.tensor)

        if "sem_seg" in batched_inputs[0]:
            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
            targets = ImageList.from_tensors(
                targets,
                self.backbone.size_divisibility,
                self.sem_seg_head.ignore_value,
                self.backbone.padding_constraints,
            ).tensor
        else:
            targets = None
        results, losses = self.sem_seg_head(features, targets)

        if self.training:
            return losses

        processed_results = []
        for result, input_per_image, image_size in zip(
            results, batched_inputs, images.image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = sem_seg_postprocess(result, image_size, height, width)
            processed_results.append({"sem_seg": r})
        return processed_results


def build_sem_seg_head(cfg, input_shape):
    """
    Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
    """
    name = cfg.MODEL.SEM_SEG_HEAD.NAME
    return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
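
Because `build_sem_seg_head` resolves heads by name through `SEM_SEG_HEADS_REGISTRY`, a project can plug in its own head without touching this file. A hedged sketch (editor's addition; `MyCustomHead` is a hypothetical name, not something defined in this repository):

from torch import nn

@SEM_SEG_HEADS_REGISTRY.register()
class MyCustomHead(nn.Module):  # picked when cfg.MODEL.SEM_SEG_HEAD.NAME == "MyCustomHead"
    def __init__(self, cfg, input_shape):
        super().__init__()
        self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
        # ... build layers from input_shape (dict of ShapeSpec) ...

    def forward(self, features, targets=None):
        # must mirror SemSegFPNHead's contract: (None, losses) in training, (logits, {}) in inference
        raise NotImplementedError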
@SEM_SEG_HEADS_REGISTRY.register()
class SemSegFPNHead(nn.Module):
    """
    A semantic segmentation head described in :paper:`PanopticFPN`.
    It takes a list of FPN features as input, and applies a sequence of
    3x3 convs and upsampling to scale all of them to the stride defined by
    ``common_stride``. Then these features are added and used to make final
    predictions by another 1x1 conv layer.
    """

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        conv_dims: int,
        common_stride: int,
        loss_weight: float = 1.0,
        norm: Optional[Union[str, Callable]] = None,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict
            conv_dims: number of output channels for the intermediate conv layers.
            common_stride: the common stride that all features will be upscaled to
            loss_weight: loss weight
            norm (str or callable): normalization for all conv layers
            ignore_value: category id to be ignored during training.
        """
        super().__init__()
        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
        if not len(input_shape):
            raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!")
        self.in_features = [k for k, v in input_shape]
        feature_strides = [v.stride for k, v in input_shape]
        feature_channels = [v.channels for k, v in input_shape]

        self.ignore_value = ignore_value
        self.common_stride = common_stride
        self.loss_weight = loss_weight

        self.scale_heads = []
        for in_feature, stride, channels in zip(
            self.in_features, feature_strides, feature_channels
        ):
            head_ops = []
            head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride)))
            for k in range(head_length):
                norm_module = get_norm(norm, conv_dims)
                conv = Conv2d(
                    channels if k == 0 else conv_dims,
                    conv_dims,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=not norm,
                    norm=norm_module,
                    activation=F.relu,
                )
                weight_init.c2_msra_fill(conv)
                head_ops.append(conv)
                if stride != self.common_stride:
                    head_ops.append(
                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
                    )
            self.scale_heads.append(nn.Sequential(*head_ops))
            self.add_module(in_feature, self.scale_heads[-1])
        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
        weight_init.c2_msra_fill(self.predictor)

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        return {
            "input_shape": {
                k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
            },
            "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
            "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
            "conv_dims": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM,
            "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE,
            "norm": cfg.MODEL.SEM_SEG_HEAD.NORM,
            "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
        }

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        x = self.layers(features)
        if self.training:
            return None, self.losses(x, targets)
        else:
            x = F.interpolate(
                x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
            )
            return x, {}

    def layers(self, features):
        for i, f in enumerate(self.in_features):
            if i == 0:
                x = self.scale_heads[i](features[f])
            else:
                x = x + self.scale_heads[i](features[f])
        x = self.predictor(x)
        return x

    def losses(self, predictions, targets):
        predictions = predictions.float()  # https://github.com/pytorch/pytorch/issues/48163
        predictions = F.interpolate(
            predictions,
            scale_factor=self.common_stride,
            mode="bilinear",
            align_corners=False,
        )
        loss = F.cross_entropy(
            predictions, targets, reduction="mean", ignore_index=self.ignore_value
        )
        losses = {"loss_sem_seg": loss * self.loss_weight}
        return losses
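
For reference, the number of 3x3 conv (+ 2x upsample) stages built per input feature in the head above follows `max(1, log2(stride) - log2(common_stride))`, so each scale head brings its feature down to the common stride by repeated doubling. A small arithmetic check (editor's addition; the stride values mirror a typical FPN p2-p5 setup and are assumptions):

import numpy as np

common_stride = 4
for stride in (4, 8, 16, 32):
    head_length = max(1, int(np.log2(stride) - np.log2(common_stride)))
    print(stride, head_length)  # 4 -> 1, 8 -> 1, 16 -> 2, 32 -> 3 stages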
detectron2/modeling/mmdet_wrapper.py
0 → 100644
View file @
3144257c
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import numpy as np
from collections import OrderedDict
from collections.abc import Mapping
from typing import Dict, List, Optional, Tuple, Union
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor, nn

from detectron2.layers import ShapeSpec
from detectron2.structures import BitMasks, Boxes, ImageList, Instances
from detectron2.utils.events import get_event_storage

from .backbone import Backbone

logger = logging.getLogger(__name__)


def _to_container(cfg):
    """
    mmdet will assert the type of dict/list.
    So convert omegaconf objects to dict/list.
    """
    if isinstance(cfg, DictConfig):
        cfg = OmegaConf.to_container(cfg, resolve=True)
    from mmcv.utils import ConfigDict

    return ConfigDict(cfg)


class MMDetBackbone(Backbone):
    """
    Wrapper of mmdetection backbones to use in detectron2.

    mmdet backbones produce list/tuple of tensors, while detectron2 backbones
    produce a dict of tensors. This class wraps the given backbone to produce
    output in detectron2's convention, so it can be used in place of detectron2
    backbones.
    """

    def __init__(
        self,
        backbone: Union[nn.Module, Mapping],
        neck: Union[nn.Module, Mapping, None] = None,
        *,
        output_shapes: List[ShapeSpec],
        output_names: Optional[List[str]] = None,
    ):
        """
        Args:
            backbone: either a backbone module or a mmdet config dict that defines a
                backbone. The backbone takes a 4D image tensor and returns a
                sequence of tensors.
            neck: either a backbone module or a mmdet config dict that defines a
                neck. The neck takes outputs of backbone and returns a
                sequence of tensors. If None, no neck is used.
            output_shapes: shape for every output of the backbone (or neck, if given).
                stride and channels are often needed.
            output_names: names for every output of the backbone (or neck, if given).
                By default, will use "out0", "out1", ...
        """
        super().__init__()
        if isinstance(backbone, Mapping):
            from mmdet.models import build_backbone

            backbone = build_backbone(_to_container(backbone))
        self.backbone = backbone

        if isinstance(neck, Mapping):
            from mmdet.models import build_neck

            neck = build_neck(_to_container(neck))
        self.neck = neck

        # "Neck" weights, if any, are part of neck itself. This is the interface
        # of mmdet so we follow it. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
        logger.info("Initializing mmdet backbone weights...")
        self.backbone.init_weights()
        # train() in mmdet modules is non-trivial, and has to be explicitly
        # called. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
        self.backbone.train()
        if self.neck is not None:
            logger.info("Initializing mmdet neck weights ...")
            if isinstance(self.neck, nn.Sequential):
                for m in self.neck:
                    m.init_weights()
            else:
                self.neck.init_weights()
            self.neck.train()

        self._output_shapes = output_shapes
        if not output_names:
            output_names = [f"out{i}" for i in range(len(output_shapes))]
        self._output_names = output_names

    def forward(self, x) -> Dict[str, Tensor]:
        outs = self.backbone(x)
        if self.neck is not None:
            outs = self.neck(outs)
        assert isinstance(
            outs, (list, tuple)
        ), "mmdet backbone should return a list/tuple of tensors!"
        if len(outs) != len(self._output_shapes):
            raise ValueError(
                "Length of output_shapes does not match outputs from the mmdet backbone: "
                f"{len(outs)} != {len(self._output_shapes)}"
            )
        return {k: v for k, v in zip(self._output_names, outs)}

    def output_shape(self) -> Dict[str, ShapeSpec]:
        return {k: v for k, v in zip(self._output_names, self._output_shapes)}
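
A construction sketch for the wrapper above (editor's addition): it assumes mmdetection 2.x is installed and that the backbone config dict follows mmdet's ResNet keys; the channel/stride values below are the usual ResNet-50 stage outputs and are stated as assumptions, not read from this repository.

from detectron2.layers import ShapeSpec

backbone = MMDetBackbone(
    backbone=dict(type="ResNet", depth=50, num_stages=4, out_indices=(0, 1, 2, 3)),
    output_shapes=[
        ShapeSpec(channels=c, stride=s)
        for c, s in zip((256, 512, 1024, 2048), (4, 8, 16, 32))
    ],
    output_names=["res2", "res3", "res4", "res5"],
)
# backbone(images) now returns {"res2": ..., ..., "res5": ...} in detectron2's dict convention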
class MMDetDetector(nn.Module):
    """
    Wrapper of a mmdetection detector model, for detection and instance segmentation.
    Input/output formats of this class follow detectron2's convention, so a
    mmdetection model can be trained and evaluated in detectron2.
    """

    def __init__(
        self,
        detector: Union[nn.Module, Mapping],
        *,
        # Default is 32 regardless of model:
        # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
        size_divisibility=32,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            detector: a mmdet detector, or a mmdet config dict that defines a detector.
            size_divisibility: pad input images to multiple of this number
            pixel_mean: per-channel mean to normalize input image
            pixel_std: per-channel stddev to normalize input image
        """
        super().__init__()
        if isinstance(detector, Mapping):
            from mmdet.models import build_detector

            detector = build_detector(_to_container(detector))
        self.detector = detector
        self.detector.init_weights()
        self.size_divisibility = size_divisibility

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor
        metas = []
        rescale = {"height" in x for x in batched_inputs}
        if len(rescale) != 1:
            raise ValueError("Some inputs have original height/width, but some don't!")
        rescale = list(rescale)[0]
        output_shapes = []
        for input in batched_inputs:
            meta = {}
            c, h, w = input["image"].shape
            meta["img_shape"] = meta["ori_shape"] = (h, w, c)
            if rescale:
                scale_factor = np.array(
                    [w / input["width"], h / input["height"]] * 2, dtype="float32"
                )
                ori_shape = (input["height"], input["width"])
                output_shapes.append(ori_shape)
                meta["ori_shape"] = ori_shape + (c,)
            else:
                scale_factor = 1.0
                output_shapes.append((h, w))
            meta["scale_factor"] = scale_factor
            meta["flip"] = False
            padh, padw = images.shape[-2:]
            meta["pad_shape"] = (padh, padw, c)
            metas.append(meta)

        if self.training:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
            if gt_instances[0].has("gt_masks"):
                from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks

                def convert_mask(m, shape):
                    # mmdet mask format
                    if isinstance(m, BitMasks):
                        return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
                    else:
                        return mm_PolygonMasks(m.polygons, shape[0], shape[1])

                gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
                losses_and_metrics = self.detector.forward_train(
                    images,
                    metas,
                    [x.gt_boxes.tensor for x in gt_instances],
                    [x.gt_classes for x in gt_instances],
                    gt_masks=gt_masks,
                )
            else:
                losses_and_metrics = self.detector.forward_train(
                    images,
                    metas,
                    [x.gt_boxes.tensor for x in gt_instances],
                    [x.gt_classes for x in gt_instances],
                )
            return _parse_losses(losses_and_metrics)
        else:
            results = self.detector.simple_test(images, metas, rescale=rescale)
            results = [
                {"instances": _convert_mmdet_result(r, shape)}
                for r, shape in zip(results, output_shapes)
            ]
            return results

    @property
    def device(self):
        return self.pixel_mean.device


# Reference: show_result() in
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
    if isinstance(result, tuple):
        bbox_result, segm_result = result
        if isinstance(segm_result, tuple):
            segm_result = segm_result[0]
    else:
        bbox_result, segm_result = result, None

    bboxes = torch.from_numpy(np.vstack(bbox_result))  # Nx5
    bboxes, scores = bboxes[:, :4], bboxes[:, -1]
    labels = [
        torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result)
    ]
    labels = torch.cat(labels)
    inst = Instances(shape)
    inst.pred_boxes = Boxes(bboxes)
    inst.scores = scores
    inst.pred_classes = labels

    if segm_result is not None and len(labels) > 0:
        segm_result = list(itertools.chain(*segm_result))
        segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result]
        segm_result = torch.stack(segm_result, dim=0)
        inst.pred_masks = segm_result
    return inst


# reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
    log_vars = OrderedDict()
    for loss_name, loss_value in losses.items():
        if isinstance(loss_value, torch.Tensor):
            log_vars[loss_name] = loss_value.mean()
        elif isinstance(loss_value, list):
            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
        else:
            raise TypeError(f"{loss_name} is not a tensor or list of tensors")

        if "loss" not in loss_name:
            # put metrics to storage; don't return them
            storage = get_event_storage()
            value = log_vars.pop(loss_name).cpu().item()
            storage.put_scalar(loss_name, value)
    return log_vars
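
A small sketch of what `_parse_losses` returns (editor's addition): keys containing "loss" are kept (lists are summed over their per-level means), while pure metrics are logged to the event storage and dropped from the returned dict. The numbers below are made up.

import torch
from detectron2.utils.events import EventStorage

raw = {
    "loss_cls": [torch.tensor(0.5), torch.tensor(0.7)],  # per-level losses -> summed means
    "loss_bbox": torch.tensor(0.3),
    "acc": torch.tensor(91.0),                            # metric: logged, not returned
}
with EventStorage(0):
    out = _parse_losses(raw)
print(dict(out))  # {'loss_cls': tensor(1.2000), 'loss_bbox': tensor(0.3000)}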