dcuai / dlexamples · Commits

Commit 142dcf29, authored Apr 15, 2022 by hepj
Commit message: Add conformer code
Parent: 7f99c1c3

Changes: 317 files. Showing 20 changed files with 3995 additions and 0 deletions (+3995 -0).
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/ssd_head.py            +265  -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/transformer_head.py    +654  -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/vfnet_head.py          +794  -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/yolact_head.py         +942  -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/yolo_head.py           +536  -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/__init__.py              +36   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/atss.py                  +17   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/base.py                  +362  -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/cascade_rcnn.py          +37   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/cornernet.py             +95   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/detr.py                  +46   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fast_rcnn.py             +52   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/faster_rcnn.py           +24   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fcos.py                  +17   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fovea.py                 +17   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fsaf.py                  +17   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/gfl.py                   +16   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/grid_rcnn.py             +29   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/htc.py                   +15   -0
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/mask_rcnn.py             +24   -0

Too many changes to show: only a subset of the 317+ changed files is displayed below.
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/ssd_head.py (new file, mode 100644)
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
from mmcv.runner import force_fp32

from mmdet.core import (build_anchor_generator, build_assigner,
                        build_bbox_coder, build_sampler, multi_apply)
from ..builder import HEADS
from ..losses import smooth_l1_loss
from .anchor_head import AnchorHead


# TODO: add loss evaluator for SSD
@HEADS.register_module()
class SSDHead(AnchorHead):
    """SSD head used in https://arxiv.org/abs/1512.02325.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        anchor_generator (dict): Config dict for the anchor generator.
        bbox_coder (dict): Config of bounding box coder.
        reg_decoded_bbox (bool): If true, the regression loss would be
            applied directly on decoded bounding boxes, converting both
            the predicted boxes and regression targets to absolute
            coordinates format. Default False. It should be `True` when
            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
        train_cfg (dict): Training config of anchor head.
        test_cfg (dict): Testing config of anchor head.
    """  # noqa: W605

    def __init__(self,
                 num_classes=80,
                 in_channels=(512, 1024, 512, 256, 256, 256),
                 anchor_generator=dict(
                     type='SSDAnchorGenerator',
                     scale_major=False,
                     input_size=300,
                     strides=[8, 16, 32, 64, 100, 300],
                     ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
                     basesize_ratio_range=(0.1, 0.9)),
                 bbox_coder=dict(
                     type='DeltaXYWHBBoxCoder',
                     clip_border=True,
                     target_means=[.0, .0, .0, .0],
                     target_stds=[1.0, 1.0, 1.0, 1.0],
                 ),
                 reg_decoded_bbox=False,
                 train_cfg=None,
                 test_cfg=None):
        super(AnchorHead, self).__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.cls_out_channels = num_classes + 1  # add background class
        self.anchor_generator = build_anchor_generator(anchor_generator)
        num_anchors = self.anchor_generator.num_base_anchors

        reg_convs = []
        cls_convs = []
        for i in range(len(in_channels)):
            reg_convs.append(
                nn.Conv2d(
                    in_channels[i],
                    num_anchors[i] * 4,
                    kernel_size=3,
                    padding=1))
            cls_convs.append(
                nn.Conv2d(
                    in_channels[i],
                    num_anchors[i] * (num_classes + 1),
                    kernel_size=3,
                    padding=1))
        self.reg_convs = nn.ModuleList(reg_convs)
        self.cls_convs = nn.ModuleList(cls_convs)

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.reg_decoded_bbox = reg_decoded_bbox
        self.use_sigmoid_cls = False
        self.cls_focal_loss = False
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        # set sampling=False for anchor_target
        self.sampling = False
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            # SSD sampling=False so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.fp16_enabled = False

    def init_weights(self):
        """Initialize weights of the head."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution='uniform', bias=0)

    def forward(self, feats):
        """Forward features from the upstream network.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.

        Returns:
            tuple:
                cls_scores (list[Tensor]): Classification scores for all scale
                    levels, each is a 4D-tensor, the number of channels is
                    num_anchors * num_classes.
                bbox_preds (list[Tensor]): Box energies / deltas for all scale
                    levels, each is a 4D-tensor, the number of channels is
                    num_anchors * 4.
        """
        cls_scores = []
        bbox_preds = []
        for feat, reg_conv, cls_conv in zip(feats, self.reg_convs,
                                            self.cls_convs):
            cls_scores.append(cls_conv(feat))
            bbox_preds.append(reg_conv(feat))
        return cls_scores, bbox_preds

    def loss_single(self, cls_score, bbox_pred, anchor, labels, label_weights,
                    bbox_targets, bbox_weights, num_total_samples):
        """Compute loss of a single image.

        Args:
            cls_score (Tensor): Box scores for each image.
                Has shape (num_total_anchors, num_classes).
            bbox_pred (Tensor): Box energies / deltas for each image
                level with shape (num_total_anchors, 4).
            anchor (Tensor): Box reference for each scale level with shape
                (num_total_anchors, 4).
            labels (Tensor): Labels of each anchor with shape
                (num_total_anchors,).
            label_weights (Tensor): Label weights of each anchor with shape
                (num_total_anchors,)
            bbox_targets (Tensor): BBox regression targets of each anchor with
                shape (num_total_anchors, 4).
            bbox_weights (Tensor): BBox regression loss weights of each anchor
                with shape (num_total_anchors, 4).
            num_total_samples (int): If sampling, num total samples equal to
                the number of total anchors; Otherwise, it is the number of
                positive anchors.

        Returns:
            tuple[Tensor, Tensor]: The classification loss and regression
                loss of a single image.
        """
        loss_cls_all = F.cross_entropy(
            cls_score, labels, reduction='none') * label_weights
        # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
        pos_inds = ((labels >= 0) &
                    (labels < self.num_classes)).nonzero().reshape(-1)
        neg_inds = (labels == self.num_classes).nonzero().view(-1)

        num_pos_samples = pos_inds.size(0)
        num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples
        if num_neg_samples > neg_inds.size(0):
            num_neg_samples = neg_inds.size(0)
        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
        loss_cls_pos = loss_cls_all[pos_inds].sum()
        loss_cls_neg = topk_loss_cls_neg.sum()
        loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples

        if self.reg_decoded_bbox:
            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
            # is applied directly on the decoded bounding boxes, it
            # decodes the already encoded coordinates to absolute format.
            bbox_pred = self.bbox_coder.decode(anchor, bbox_pred)

        loss_bbox = smooth_l1_loss(
            bbox_pred,
            bbox_targets,
            bbox_weights,
            beta=self.train_cfg.smoothl1_beta,
            avg_factor=num_total_samples)
        return loss_cls[None], loss_bbox

    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
    def loss(self,
             cls_scores,
             bbox_preds,
             gt_bboxes,
             gt_labels,
             img_metas,
             gt_bboxes_ignore=None):
        """Compute losses of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level.
                Has shape (N, num_anchors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W).
            gt_bboxes (list[Tensor]): Each item is the ground-truth boxes for
                one image in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box.
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.anchor_generator.num_levels

        device = cls_scores[0].device

        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, img_metas, device=device)
        cls_reg_targets = self.get_targets(
            anchor_list,
            valid_flag_list,
            gt_bboxes,
            img_metas,
            gt_bboxes_ignore_list=gt_bboxes_ignore,
            gt_labels_list=gt_labels,
            label_channels=1,
            unmap_outputs=False)
        if cls_reg_targets is None:
            return None
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets

        num_images = len(img_metas)
        all_cls_scores = torch.cat([
            s.permute(0, 2, 3, 1).reshape(
                num_images, -1, self.cls_out_channels) for s in cls_scores
        ], 1)
        all_labels = torch.cat(labels_list, -1).view(num_images, -1)
        all_label_weights = torch.cat(label_weights_list,
                                      -1).view(num_images, -1)
        all_bbox_preds = torch.cat([
            b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
            for b in bbox_preds
        ], -2)
        all_bbox_targets = torch.cat(bbox_targets_list,
                                     -2).view(num_images, -1, 4)
        all_bbox_weights = torch.cat(bbox_weights_list,
                                     -2).view(num_images, -1, 4)

        # concat all level anchors to a single tensor
        all_anchors = []
        for i in range(num_images):
            all_anchors.append(torch.cat(anchor_list[i]))

        # check NaN and Inf
        assert torch.isfinite(all_cls_scores).all().item(), \
            'classification scores become infinite or NaN!'
        assert torch.isfinite(all_bbox_preds).all().item(), \
            'bbox predictions become infinite or NaN!'

        losses_cls, losses_bbox = multi_apply(
            self.loss_single,
            all_cls_scores,
            all_bbox_preds,
            all_anchors,
            all_labels,
            all_label_weights,
            all_bbox_targets,
            all_bbox_weights,
            num_total_samples=num_total_pos)
        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
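For reference, a minimal standalone sketch of the hard negative mining step that SSDHead.loss_single implements above: the classification loss is computed per anchor, and only the hardest negatives are kept, capped at neg_pos_ratio per positive. The tensor shapes and the neg_pos_ratio value below are hypothetical stand-ins for real head outputs and train_cfg settings, not values from this diff.

import torch
import torch.nn.functional as F

num_classes = 80      # foreground classes; the background class uses id 80
neg_pos_ratio = 3     # hypothetical train_cfg.neg_pos_ratio
cls_score = torch.randn(100, num_classes + 1)        # per-anchor logits
labels = torch.randint(0, num_classes + 1, (100,))   # per-anchor targets

loss_all = F.cross_entropy(cls_score, labels, reduction='none')
pos_inds = ((labels >= 0) & (labels < num_classes)).nonzero().reshape(-1)
neg_inds = (labels == num_classes).nonzero().reshape(-1)

# keep only the hardest negatives, at most neg_pos_ratio per positive
num_neg = min(neg_pos_ratio * pos_inds.numel(), neg_inds.numel())
topk_neg_loss, _ = loss_all[neg_inds].topk(num_neg)
loss_cls = (loss_all[pos_inds].sum() + topk_neg_loss.sum()) / max(
    pos_inds.numel(), 1)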
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/transformer_head.py (new file, mode 100644)
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear, build_activation_layer
from mmcv.runner import force_fp32

from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
                        build_assigner, build_sampler, multi_apply,
                        reduce_mean)
from mmdet.models.utils import (FFN, build_positional_encoding,
                                build_transformer)
from ..builder import HEADS, build_loss
from .anchor_free_head import AnchorFreeHead


@HEADS.register_module()
class TransformerHead(AnchorFreeHead):
    """Implements the DETR transformer head.

    See `paper: End-to-End Object Detection with Transformers
    <https://arxiv.org/pdf/2005.12872>`_ for details.

    Args:
        num_classes (int): Number of categories excluding the background.
        in_channels (int): Number of channels in the input feature map.
        num_fcs (int, optional): Number of fully-connected layers used in
            `FFN`, which is then used for the regression head. Default 2.
        transformer (dict, optional): Config for transformer.
        positional_encoding (dict, optional): Config for position encoding.
        loss_cls (dict, optional): Config of the classification loss.
            Default `CrossEntropyLoss`.
        loss_bbox (dict, optional): Config of the regression loss.
            Default `L1Loss`.
        loss_iou (dict, optional): Config of the regression iou loss.
            Default `GIoULoss`.
        train_cfg (dict, optional): Training config of transformer head.
        test_cfg (dict, optional): Testing config of transformer head.

    Example:
        >>> import torch
        >>> self = TransformerHead(80, 2048)
        >>> x = torch.rand(1, 2048, 32, 32)
        >>> mask = torch.ones(1, 32, 32).to(x.dtype)
        >>> mask[:, :16, :15] = 0
        >>> all_cls_scores, all_bbox_preds = self(x, mask)
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 num_fcs=2,
                 transformer=dict(
                     type='Transformer',
                     embed_dims=256,
                     num_heads=8,
                     num_encoder_layers=6,
                     num_decoder_layers=6,
                     feedforward_channels=2048,
                     dropout=0.1,
                     act_cfg=dict(type='ReLU', inplace=True),
                     norm_cfg=dict(type='LN'),
                     num_fcs=2,
                     pre_norm=False,
                     return_intermediate_dec=True),
                 positional_encoding=dict(
                     type='SinePositionalEncoding',
                     num_feats=128,
                     normalize=True),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     bg_cls_weight=0.1,
                     use_sigmoid=False,
                     loss_weight=1.0,
                     class_weight=1.0),
                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
                 train_cfg=dict(
                     assigner=dict(
                         type='HungarianAssigner',
                         cls_cost=dict(type='ClassificationCost', weight=1.),
                         reg_cost=dict(type='BBoxL1Cost', weight=5.0),
                         iou_cost=dict(
                             type='IoUCost', iou_mode='giou', weight=2.0))),
                 test_cfg=dict(max_per_img=100),
                 **kwargs):
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since it brings inconvenience when the initialization of
        # `AnchorFreeHead` is called.
        super(AnchorFreeHead, self).__init__()
        use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \
            'not supported in DETR, since background is needed for the ' \
            'matching process.'
        assert 'embed_dims' in transformer \
            and 'num_feats' in positional_encoding
        num_feats = positional_encoding['num_feats']
        embed_dims = transformer['embed_dims']
        assert num_feats * 2 == embed_dims, 'embed_dims should' \
            f' be exactly 2 times of num_feats. Found {embed_dims}' \
            f' and {num_feats}.'
        assert test_cfg is not None and 'max_per_img' in test_cfg

        class_weight = loss_cls.get('class_weight', None)
        if class_weight is not None:
            assert isinstance(class_weight, float), 'Expected ' \
                'class_weight to have type float. Found ' \
                f'{type(class_weight)}.'
            # NOTE following the official DETR repo, bg_cls_weight means
            # relative classification weight of the no-object class.
            bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
            assert isinstance(bg_cls_weight, float), 'Expected ' \
                'bg_cls_weight to have type float. Found ' \
                f'{type(bg_cls_weight)}.'
            class_weight = torch.ones(num_classes + 1) * class_weight
            # set background class as the last index
            class_weight[num_classes] = bg_cls_weight
            loss_cls.update({'class_weight': class_weight})
            if 'bg_cls_weight' in loss_cls:
                loss_cls.pop('bg_cls_weight')
            self.bg_cls_weight = bg_cls_weight

        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided ' \
                'when train_cfg is set.'
            assigner = train_cfg['assigner']
            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
                'The classification weight for loss and matcher should be ' \
                'exactly the same.'
            assert loss_bbox['loss_weight'] == assigner['reg_cost'][
                'weight'], 'The regression L1 weight for loss and matcher ' \
                'should be exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
                'The regression iou weight for loss and matcher should be ' \
                'exactly the same.'
            self.assigner = build_assigner(assigner)
            # DETR sampling=False, so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.num_classes = num_classes
        self.cls_out_channels = num_classes + 1
        self.in_channels = in_channels
        self.num_fcs = num_fcs
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.use_sigmoid_cls = use_sigmoid_cls
        self.embed_dims = embed_dims
        self.num_query = test_cfg['max_per_img']
        self.fp16_enabled = False
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_iou = build_loss(loss_iou)
        self.act_cfg = transformer.get('act_cfg',
                                       dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.transformer = build_transformer(transformer)
        self._init_layers()

    def _init_layers(self):
        """Initialize layers of the transformer head."""
        self.input_proj = Conv2d(
            self.in_channels, self.embed_dims, kernel_size=1)
        self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        self.reg_ffn = FFN(
            self.embed_dims,
            self.embed_dims,
            self.num_fcs,
            self.act_cfg,
            dropout=0.0,
            add_residual=False)
        self.fc_reg = Linear(self.embed_dims, 4)
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)

    def init_weights(self, distribution='uniform'):
        """Initialize weights of the transformer head."""
        # The initialization for transformer is important
        self.transformer.init_weights()

    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                              strict, missing_keys, unexpected_keys,
                              error_msgs):
        """Load checkpoints."""
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since `AnchorFreeHead._load_from_state_dict` should not be
        # called here. Invoking the default `Module._load_from_state_dict`
        # is enough.
        super(AnchorFreeHead,
              self)._load_from_state_dict(state_dict, prefix, local_metadata,
                                          strict, missing_keys,
                                          unexpected_keys, error_msgs)

    def forward(self, feats, img_metas):
        """Forward function.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
            img_metas (list[dict]): List of image information.

        Returns:
            tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.

                - all_cls_scores_list (list[Tensor]): Classification scores \
                    for each scale level. Each is a 4D-tensor with shape \
                    [nb_dec, bs, num_query, cls_out_channels]. Note \
                    `cls_out_channels` should include background.
                - all_bbox_preds_list (list[Tensor]): Sigmoid regression \
                    outputs for each scale level. Each is a 4D-tensor with \
                    normalized coordinate format (cx, cy, w, h) and shape \
                    [nb_dec, bs, num_query, 4].
        """
        num_levels = len(feats)
        img_metas_list = [img_metas for _ in range(num_levels)]
        return multi_apply(self.forward_single, feats, img_metas_list)

    def forward_single(self, x, img_metas):
        """Forward function for a single feature level.

        Args:
            x (Tensor): Input feature from backbone's single stage, shape
                [bs, c, h, w].
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head,
                shape [nb_dec, bs, num_query, cls_out_channels]. Note
                cls_out_channels should include background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression
                head with normalized coordinate format (cx, cy, w, h).
                Shape [nb_dec, bs, num_query, 4].
        """
        # construct binary masks which are used for the transformer.
        # NOTE following the official DETR repo, non-zero values represent
        # ignored positions, while zero values mean valid positions.
        batch_size = x.size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        masks = x.new_ones((batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            masks[img_id, :img_h, :img_w] = 0

        x = self.input_proj(x)
        # interpolate masks to have the same spatial shape with x
        masks = F.interpolate(
            masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
        # position encoding
        pos_embed = self.positional_encoding(masks)  # [bs, embed_dim, h, w]
        # outs_dec: [nb_dec, bs, num_query, embed_dim]
        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
                                       pos_embed)

        all_cls_scores = self.fc_cls(outs_dec)
        all_bbox_preds = self.fc_reg(self.activate(
            self.reg_ffn(outs_dec))).sigmoid()
        return all_cls_scores, all_bbox_preds

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores_list,
             all_bbox_preds_list,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """Loss function.

        Only outputs from the last feature level are used for computing
        losses by default.

        Args:
            all_cls_scores_list (list[Tensor]): Classification outputs
                for each feature level. Each is a 4D-tensor with shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds_list (list[Tensor]): Sigmoid regression
                outputs for each feature level. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        # NOTE by default only the outputs from the last feature scale
        # are used.
        all_cls_scores = all_cls_scores_list[-1]
        all_bbox_preds = all_bbox_preds_list[-1]
        assert gt_bboxes_ignore is None, \
            'Only supports for gt_bboxes_ignore setting to None.'

        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict

    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images. Shape [bs, num_query, cls_out_channels].
            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (cx, cy, w, h) and
                shape [bs, num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs
                from a single decoder layer.
        """
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescaling bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regresses the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating the IoU loss
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, GIoU loss by default
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def get_targets(self,
                    cls_scores_list,
                    bbox_preds_list,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        """Compute regression and classification targets for a batch image.

        Outputs from a single decoder layer of a single feature level are
        used.

        Args:
            cls_scores_list (list[Tensor]): Box score logits from a single
                decoder layer for each image with shape [num_query,
                cls_out_channels].
            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
                decoder layer for each image, with normalized coordinate
                (cx, cy, w, h) and shape [num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.

        Returns:
            tuple: a tuple containing the following targets.

                - labels_list (list[Tensor]): Labels for all images.
                - label_weights_list (list[Tensor]): Label weights for all \
                    images.
                - bbox_targets_list (list[Tensor]): BBox targets for all \
                    images.
                - bbox_weights_list (list[Tensor]): BBox weights for all \
                    images.
                - num_total_pos (int): Number of positive samples in all \
                    images.
                - num_total_neg (int): Number of negative samples in all \
                    images.
        """
        assert gt_bboxes_ignore_list is None, \
            'Only supports for gt_bboxes_ignore setting to None.'
        num_imgs = len(cls_scores_list)
        gt_bboxes_ignore_list = [
            gt_bboxes_ignore_list for _ in range(num_imgs)
        ]

        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
             self._get_target_single, cls_scores_list, bbox_preds_list,
             gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def _get_target_single(self,
                           cls_score,
                           bbox_pred,
                           gt_bboxes,
                           gt_labels,
                           img_meta,
                           gt_bboxes_ignore=None):
        """Compute regression and classification targets for one image.

        Outputs from a single decoder layer of a single feature level are
        used.

        Args:
            cls_score (Tensor): Box score logits from a single decoder layer
                for one image. Shape [num_query, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
                for one image, with normalized coordinate (cx, cy, w, h) and
                shape [num_query, 4].
            gt_bboxes (Tensor): Ground truth bboxes for one image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (Tensor): Ground truth class indices for one image
                with shape (num_gts, ).
            img_meta (dict): Meta information for one image.
            gt_bboxes_ignore (Tensor, optional): Bounding boxes
                which can be ignored. Default None.

        Returns:
            tuple[Tensor]: a tuple containing the following for one image.

                - labels (Tensor): Labels of each image.
                - label_weights (Tensor): Label weights of each image.
                - bbox_targets (Tensor): BBox targets of each image.
                - bbox_weights (Tensor): BBox weights of each image.
                - pos_inds (Tensor): Sampled positive indices for each image.
                - neg_inds (Tensor): Sampled negative indices for each image.
        """
        num_bboxes = bbox_pred.size(0)
        # assigner and sampler
        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                             gt_labels, img_meta,
                                             gt_bboxes_ignore)
        sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                              gt_bboxes)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds

        # label targets
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        bbox_targets = torch.zeros_like(bbox_pred)
        bbox_weights = torch.zeros_like(bbox_pred)
        bbox_weights[pos_inds] = 1.0
        img_h, img_w, _ = img_meta['img_shape']

        # DETR regresses the relative position of boxes (cxcywh) in the image.
        # Thus the learning target should be normalized by the image size, and
        # the box format should be converted from the default x1y1x2y2 to
        # cxcywh.
        factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                       img_h]).unsqueeze(0)
        pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
        bbox_targets[pos_inds] = pos_gt_bboxes_targets
        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds)

    # over-write because img_metas are needed as inputs for bbox_head.
    def forward_train(self,
                      x,
                      img_metas,
                      gt_bboxes,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      proposal_cfg=None,
                      **kwargs):
        """Forward function for training mode.

        Args:
            x (list[Tensor]): Features from backbone.
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes (Tensor): Ground truth bboxes of the image,
                shape (num_gts, 4).
            gt_labels (Tensor): Ground truth labels of each box,
                shape (num_gts,).
            gt_bboxes_ignore (Tensor): Ground truth bboxes to be
                ignored, shape (num_ignored_gts, 4).
            proposal_cfg (mmcv.Config): Test / postprocessing configuration,
                if None, test_cfg would be used.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert proposal_cfg is None, '"proposal_cfg" must be None'
        outs = self(x, img_metas)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, img_metas)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def get_bboxes(self,
                   all_cls_scores_list,
                   all_bbox_preds_list,
                   img_metas,
                   rescale=False):
        """Transform network outputs for a batch into bbox predictions.

        Args:
            all_cls_scores_list (list[Tensor]): Classification outputs
                for each feature level. Each is a 4D-tensor with shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds_list (list[Tensor]): Sigmoid regression
                outputs for each feature level. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If True, return boxes in original
                image space. Default False.

        Returns:
            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
                The first item is an (n, 5) tensor, where the first 4 columns \
                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
                5-th column is a score between 0 and 1. The second item is a \
                (n,) tensor where each item is the predicted class label of \
                the corresponding box.
        """
        # NOTE by default only outputs from the last feature level are used,
        # and only the outputs from the last decoder layer are used.
        cls_scores = all_cls_scores_list[-1][-1]
        bbox_preds = all_bbox_preds_list[-1][-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list

    def _get_bboxes_single(self,
                           cls_score,
                           bbox_pred,
                           img_shape,
                           scale_factor,
                           rescale=False):
        """Transform outputs from the last decoder layer into bbox predictions
        for each image.

        Args:
            cls_score (Tensor): Box score logits from the last decoder layer
                for each image. Shape [num_query, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
                for each image, with coordinate format (cx, cy, w, h) and
                shape [num_query, 4].
            img_shape (tuple[int]): Shape of input image, (height, width, 3).
            scale_factor (ndarray, optional): Scale factor of the image
                arranged as (w_scale, h_scale, w_scale, h_scale).
            rescale (bool, optional): If True, return boxes in original image
                space. Default False.

        Returns:
            tuple[Tensor]: Results of detected bboxes and labels.

                - det_bboxes: Predicted bboxes with shape [num_query, 5], \
                    where the first 4 columns are bounding box positions \
                    (tl_x, tl_y, br_x, br_y) and the 5-th column contains \
                    scores between 0 and 1.
                - det_labels: Predicted labels of the corresponding box with \
                    shape [num_query].
        """
        assert len(cls_score) == len(bbox_pred)
        # exclude background
        scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if rescale:
            det_bboxes /= det_bboxes.new_tensor(scale_factor)
        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)
        return det_bboxes, det_labels
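As an aside, the box rescaling that TransformerHead.loss_single and _get_bboxes_single rely on can be shown in isolation: DETR predicts normalized (cx, cy, w, h) boxes, which must be converted to corner format and scaled by the image size before computing the IoU loss or emitting detections. The image size and prediction tensor below are hypothetical, and cxcywh_to_xyxy is a plain re-implementation of mmdet.core.bbox_cxcywh_to_xyxy written just for this sketch.

import torch

def cxcywh_to_xyxy(box):
    # (cx, cy, w, h) -> (x1, y1, x2, y2), mirroring bbox_cxcywh_to_xyxy
    cx, cy, w, h = box.unbind(-1)
    return torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], -1)

img_h, img_w = 480, 640              # hypothetical img_shape[:2]
bbox_pred = torch.rand(100, 4)       # sigmoid outputs, all in [0, 1]
factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h])

det_bboxes = cxcywh_to_xyxy(bbox_pred) * factor   # scale to absolute pixels
det_bboxes[:, 0::2].clamp_(min=0, max=img_w)      # clip x coordinates
det_bboxes[:, 1::2].clamp_(min=0, max=img_h)      # clip y coordinates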
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/vfnet_head.py (new file, mode 100644)
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
mmcv.cnn
import
ConvModule
,
Scale
,
bias_init_with_prob
,
normal_init
from
mmcv.ops
import
DeformConv2d
from
mmcv.runner
import
force_fp32
from
mmdet.core
import
(
bbox2distance
,
bbox_overlaps
,
build_anchor_generator
,
build_assigner
,
build_sampler
,
distance2bbox
,
multi_apply
,
multiclass_nms
,
reduce_mean
)
from
..builder
import
HEADS
,
build_loss
from
.atss_head
import
ATSSHead
from
.fcos_head
import
FCOSHead
INF
=
1e8
@
HEADS
.
register_module
()
class
VFNetHead
(
ATSSHead
,
FCOSHead
):
"""Head of `VarifocalNet (VFNet): An IoU-aware Dense Object
Detector.<https://arxiv.org/abs/2008.13367>`_.
The VFNet predicts IoU-aware classification scores which mix the
object presence confidence and object localization accuracy as the
detection score. It is built on the FCOS architecture and uses ATSS
for defining positive/negative training examples. The VFNet is trained
with Varifocal Loss and empolys star-shaped deformable convolution to
extract features for a bbox.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
level points.
center_sampling (bool): If true, use center sampling. Default: False.
center_sample_radius (float): Radius of center sampling. Default: 1.5.
sync_num_pos (bool): If true, synchronize the number of positive
examples across GPUs. Default: True
gradient_mul (float): The multiplier to gradients from bbox refinement
and recognition. Default: 0.1.
bbox_norm_type (str): The bbox normalization type, 'reg_denom' or
'stride'. Default: reg_denom
loss_cls_fl (dict): Config of focal loss.
use_vfl (bool): If true, use varifocal loss for training.
Default: True.
loss_cls (dict): Config of varifocal loss.
loss_bbox (dict): Config of localization loss, GIoU Loss.
loss_bbox (dict): Config of localization refinement loss, GIoU Loss.
norm_cfg (dict): dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32,
requires_grad=True).
use_atss (bool): If true, use ATSS to define positive/negative
examples. Default: True.
anchor_generator (dict): Config of anchor generator for ATSS.
Example:
>>> self = VFNetHead(11, 7)
>>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
>>> cls_score, bbox_pred, bbox_pred_refine= self.forward(feats)
>>> assert len(cls_score) == len(self.scales)
"""
# noqa: E501
def
__init__
(
self
,
num_classes
,
in_channels
,
regress_ranges
=
((
-
1
,
64
),
(
64
,
128
),
(
128
,
256
),
(
256
,
512
),
(
512
,
INF
)),
center_sampling
=
False
,
center_sample_radius
=
1.5
,
sync_num_pos
=
True
,
gradient_mul
=
0.1
,
bbox_norm_type
=
'reg_denom'
,
loss_cls_fl
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
1.0
),
use_vfl
=
True
,
loss_cls
=
dict
(
type
=
'VarifocalLoss'
,
use_sigmoid
=
True
,
alpha
=
0.75
,
gamma
=
2.0
,
iou_weighted
=
True
,
loss_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
1.5
),
loss_bbox_refine
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
2.0
),
norm_cfg
=
dict
(
type
=
'GN'
,
num_groups
=
32
,
requires_grad
=
True
),
use_atss
=
True
,
anchor_generator
=
dict
(
type
=
'AnchorGenerator'
,
ratios
=
[
1.0
],
octave_base_scale
=
8
,
scales_per_octave
=
1
,
center_offset
=
0.0
,
strides
=
[
8
,
16
,
32
,
64
,
128
]),
**
kwargs
):
# dcn base offsets, adapted from reppoints_head.py
self
.
num_dconv_points
=
9
self
.
dcn_kernel
=
int
(
np
.
sqrt
(
self
.
num_dconv_points
))
self
.
dcn_pad
=
int
((
self
.
dcn_kernel
-
1
)
/
2
)
dcn_base
=
np
.
arange
(
-
self
.
dcn_pad
,
self
.
dcn_pad
+
1
).
astype
(
np
.
float64
)
dcn_base_y
=
np
.
repeat
(
dcn_base
,
self
.
dcn_kernel
)
dcn_base_x
=
np
.
tile
(
dcn_base
,
self
.
dcn_kernel
)
dcn_base_offset
=
np
.
stack
([
dcn_base_y
,
dcn_base_x
],
axis
=
1
).
reshape
(
(
-
1
))
self
.
dcn_base_offset
=
torch
.
tensor
(
dcn_base_offset
).
view
(
1
,
-
1
,
1
,
1
)
super
(
FCOSHead
,
self
).
__init__
(
num_classes
,
in_channels
,
norm_cfg
=
norm_cfg
,
**
kwargs
)
self
.
regress_ranges
=
regress_ranges
self
.
reg_denoms
=
[
regress_range
[
-
1
]
for
regress_range
in
regress_ranges
]
self
.
reg_denoms
[
-
1
]
=
self
.
reg_denoms
[
-
2
]
*
2
self
.
center_sampling
=
center_sampling
self
.
center_sample_radius
=
center_sample_radius
self
.
sync_num_pos
=
sync_num_pos
self
.
bbox_norm_type
=
bbox_norm_type
self
.
gradient_mul
=
gradient_mul
self
.
use_vfl
=
use_vfl
if
self
.
use_vfl
:
self
.
loss_cls
=
build_loss
(
loss_cls
)
else
:
self
.
loss_cls
=
build_loss
(
loss_cls_fl
)
self
.
loss_bbox
=
build_loss
(
loss_bbox
)
self
.
loss_bbox_refine
=
build_loss
(
loss_bbox_refine
)
# for getting ATSS targets
self
.
use_atss
=
use_atss
self
.
use_sigmoid_cls
=
loss_cls
.
get
(
'use_sigmoid'
,
False
)
self
.
anchor_generator
=
build_anchor_generator
(
anchor_generator
)
self
.
anchor_center_offset
=
anchor_generator
[
'center_offset'
]
self
.
num_anchors
=
self
.
anchor_generator
.
num_base_anchors
[
0
]
self
.
sampling
=
False
if
self
.
train_cfg
:
self
.
assigner
=
build_assigner
(
self
.
train_cfg
.
assigner
)
sampler_cfg
=
dict
(
type
=
'PseudoSampler'
)
self
.
sampler
=
build_sampler
(
sampler_cfg
,
context
=
self
)
def
_init_layers
(
self
):
"""Initialize layers of the head."""
super
(
FCOSHead
,
self
).
_init_cls_convs
()
super
(
FCOSHead
,
self
).
_init_reg_convs
()
self
.
relu
=
nn
.
ReLU
(
inplace
=
True
)
self
.
vfnet_reg_conv
=
ConvModule
(
self
.
feat_channels
,
self
.
feat_channels
,
3
,
stride
=
1
,
padding
=
1
,
conv_cfg
=
self
.
conv_cfg
,
norm_cfg
=
self
.
norm_cfg
,
bias
=
self
.
conv_bias
)
self
.
vfnet_reg
=
nn
.
Conv2d
(
self
.
feat_channels
,
4
,
3
,
padding
=
1
)
self
.
scales
=
nn
.
ModuleList
([
Scale
(
1.0
)
for
_
in
self
.
strides
])
self
.
vfnet_reg_refine_dconv
=
DeformConv2d
(
self
.
feat_channels
,
self
.
feat_channels
,
self
.
dcn_kernel
,
1
,
padding
=
self
.
dcn_pad
)
self
.
vfnet_reg_refine
=
nn
.
Conv2d
(
self
.
feat_channels
,
4
,
3
,
padding
=
1
)
self
.
scales_refine
=
nn
.
ModuleList
([
Scale
(
1.0
)
for
_
in
self
.
strides
])
self
.
vfnet_cls_dconv
=
DeformConv2d
(
self
.
feat_channels
,
self
.
feat_channels
,
self
.
dcn_kernel
,
1
,
padding
=
self
.
dcn_pad
)
self
.
vfnet_cls
=
nn
.
Conv2d
(
self
.
feat_channels
,
self
.
cls_out_channels
,
3
,
padding
=
1
)
def
init_weights
(
self
):
"""Initialize weights of the head."""
for
m
in
self
.
cls_convs
:
if
isinstance
(
m
.
conv
,
nn
.
Conv2d
):
normal_init
(
m
.
conv
,
std
=
0.01
)
for
m
in
self
.
reg_convs
:
if
isinstance
(
m
.
conv
,
nn
.
Conv2d
):
normal_init
(
m
.
conv
,
std
=
0.01
)
normal_init
(
self
.
vfnet_reg_conv
.
conv
,
std
=
0.01
)
normal_init
(
self
.
vfnet_reg
,
std
=
0.01
)
normal_init
(
self
.
vfnet_reg_refine_dconv
,
std
=
0.01
)
normal_init
(
self
.
vfnet_reg_refine
,
std
=
0.01
)
normal_init
(
self
.
vfnet_cls_dconv
,
std
=
0.01
)
bias_cls
=
bias_init_with_prob
(
0.01
)
normal_init
(
self
.
vfnet_cls
,
std
=
0.01
,
bias
=
bias_cls
)
def
forward
(
self
,
feats
):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Box iou-aware scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box offsets for each
scale level, each is a 4D-tensor, the channel number is
num_points * 4.
bbox_preds_refine (list[Tensor]): Refined Box offsets for
each scale level, each is a 4D-tensor, the channel
number is num_points * 4.
"""
return
multi_apply
(
self
.
forward_single
,
feats
,
self
.
scales
,
self
.
scales_refine
,
self
.
strides
,
self
.
reg_denoms
)
def
forward_single
(
self
,
x
,
scale
,
scale_refine
,
stride
,
reg_denom
):
"""Forward features of a single scale level.
Args:
x (Tensor): FPN feature maps of the specified stride.
scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
the bbox prediction.
scale_refine (:obj: `mmcv.cnn.Scale`): Learnable scale module to
resize the refined bbox prediction.
stride (int): The corresponding stride for feature maps,
used to normalize the bbox prediction when
bbox_norm_type = 'stride'.
reg_denom (int): The corresponding regression range for feature
maps, only used to normalize the bbox prediction when
bbox_norm_type = 'reg_denom'.
Returns:
tuple: iou-aware cls scores for each box, bbox predictions and
refined bbox predictions of input feature maps.
"""
cls_feat
=
x
reg_feat
=
x
for
cls_layer
in
self
.
cls_convs
:
cls_feat
=
cls_layer
(
cls_feat
)
for
reg_layer
in
self
.
reg_convs
:
reg_feat
=
reg_layer
(
reg_feat
)
# predict the bbox_pred of different level
reg_feat_init
=
self
.
vfnet_reg_conv
(
reg_feat
)
if
self
.
bbox_norm_type
==
'reg_denom'
:
bbox_pred
=
scale
(
self
.
vfnet_reg
(
reg_feat_init
)).
float
().
exp
()
*
reg_denom
elif
self
.
bbox_norm_type
==
'stride'
:
bbox_pred
=
scale
(
self
.
vfnet_reg
(
reg_feat_init
)).
float
().
exp
()
*
stride
else
:
raise
NotImplementedError
# compute star deformable convolution offsets
# converting dcn_offset to reg_feat.dtype thus VFNet can be
# trained with FP16
dcn_offset
=
self
.
star_dcn_offset
(
bbox_pred
,
self
.
gradient_mul
,
stride
).
to
(
reg_feat
.
dtype
)
# refine the bbox_pred
reg_feat
=
self
.
relu
(
self
.
vfnet_reg_refine_dconv
(
reg_feat
,
dcn_offset
))
bbox_pred_refine
=
scale_refine
(
self
.
vfnet_reg_refine
(
reg_feat
)).
float
().
exp
()
bbox_pred_refine
=
bbox_pred_refine
*
bbox_pred
.
detach
()
# predict the iou-aware cls score
cls_feat
=
self
.
relu
(
self
.
vfnet_cls_dconv
(
cls_feat
,
dcn_offset
))
cls_score
=
self
.
vfnet_cls
(
cls_feat
)
return
cls_score
,
bbox_pred
,
bbox_pred_refine
def
star_dcn_offset
(
self
,
bbox_pred
,
gradient_mul
,
stride
):
"""Compute the star deformable conv offsets.
Args:
bbox_pred (Tensor): Predicted bbox distance offsets (l, r, t, b).
gradient_mul (float): Gradient multiplier.
stride (int): The corresponding stride for feature maps,
used to project the bbox onto the feature map.
Returns:
dcn_offsets (Tensor): The offsets for deformable convolution.
"""
dcn_base_offset
=
self
.
dcn_base_offset
.
type_as
(
bbox_pred
)
bbox_pred_grad_mul
=
(
1
-
gradient_mul
)
*
bbox_pred
.
detach
()
+
\
gradient_mul
*
bbox_pred
# map to the feature map scale
bbox_pred_grad_mul
=
bbox_pred_grad_mul
/
stride
N
,
C
,
H
,
W
=
bbox_pred
.
size
()
x1
=
bbox_pred_grad_mul
[:,
0
,
:,
:]
y1
=
bbox_pred_grad_mul
[:,
1
,
:,
:]
x2
=
bbox_pred_grad_mul
[:,
2
,
:,
:]
y2
=
bbox_pred_grad_mul
[:,
3
,
:,
:]
bbox_pred_grad_mul_offset
=
bbox_pred
.
new_zeros
(
N
,
2
*
self
.
num_dconv_points
,
H
,
W
)
bbox_pred_grad_mul_offset
[:,
0
,
:,
:]
=
-
1.0
*
y1
# -y1
bbox_pred_grad_mul_offset
[:,
1
,
:,
:]
=
-
1.0
*
x1
# -x1
bbox_pred_grad_mul_offset
[:,
2
,
:,
:]
=
-
1.0
*
y1
# -y1
bbox_pred_grad_mul_offset
[:,
4
,
:,
:]
=
-
1.0
*
y1
# -y1
bbox_pred_grad_mul_offset
[:,
5
,
:,
:]
=
x2
# x2
bbox_pred_grad_mul_offset
[:,
7
,
:,
:]
=
-
1.0
*
x1
# -x1
bbox_pred_grad_mul_offset
[:,
11
,
:,
:]
=
x2
# x2
bbox_pred_grad_mul_offset
[:,
12
,
:,
:]
=
y2
# y2
bbox_pred_grad_mul_offset
[:,
13
,
:,
:]
=
-
1.0
*
x1
# -x1
bbox_pred_grad_mul_offset
[:,
14
,
:,
:]
=
y2
# y2
bbox_pred_grad_mul_offset
[:,
16
,
:,
:]
=
y2
# y2
bbox_pred_grad_mul_offset
[:,
17
,
:,
:]
=
x2
# x2
dcn_offset
=
bbox_pred_grad_mul_offset
-
dcn_base_offset
return
dcn_offset
@
force_fp32
(
apply_to
=
(
'cls_scores'
,
'bbox_preds'
,
'bbox_preds_refine'
))
def
loss
(
self
,
cls_scores
,
bbox_preds
,
bbox_preds_refine
,
gt_bboxes
,
gt_labels
,
img_metas
,
gt_bboxes_ignore
=
None
):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box iou-aware scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box offsets for each
scale level, each is a 4D-tensor, the channel number is
num_points * 4.
bbox_preds_refine (list[Tensor]): Refined Box offsets for
each scale level, each is a 4D-tensor, the channel
number is num_points * 4.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Default: None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert
len
(
cls_scores
)
==
len
(
bbox_preds
)
==
len
(
bbox_preds_refine
)
featmap_sizes
=
[
featmap
.
size
()[
-
2
:]
for
featmap
in
cls_scores
]
all_level_points
=
self
.
get_points
(
featmap_sizes
,
bbox_preds
[
0
].
dtype
,
bbox_preds
[
0
].
device
)
labels
,
label_weights
,
bbox_targets
,
bbox_weights
=
self
.
get_targets
(
cls_scores
,
all_level_points
,
gt_bboxes
,
gt_labels
,
img_metas
,
gt_bboxes_ignore
)
num_imgs
=
cls_scores
[
0
].
size
(
0
)
# flatten cls_scores, bbox_preds and bbox_preds_refine
flatten_cls_scores
=
[
cls_score
.
permute
(
0
,
2
,
3
,
1
).
reshape
(
-
1
,
self
.
cls_out_channels
).
contiguous
()
for
cls_score
in
cls_scores
]
flatten_bbox_preds
=
[
bbox_pred
.
permute
(
0
,
2
,
3
,
1
).
reshape
(
-
1
,
4
).
contiguous
()
for
bbox_pred
in
bbox_preds
]
flatten_bbox_preds_refine
=
[
bbox_pred_refine
.
permute
(
0
,
2
,
3
,
1
).
reshape
(
-
1
,
4
).
contiguous
()
for
bbox_pred_refine
in
bbox_preds_refine
]
flatten_cls_scores
=
torch
.
cat
(
flatten_cls_scores
)
flatten_bbox_preds
=
torch
.
cat
(
flatten_bbox_preds
)
flatten_bbox_preds_refine
=
torch
.
cat
(
flatten_bbox_preds_refine
)
flatten_labels
=
torch
.
cat
(
labels
)
flatten_bbox_targets
=
torch
.
cat
(
bbox_targets
)
# repeat points to align with bbox_preds
flatten_points
=
torch
.
cat
(
[
points
.
repeat
(
num_imgs
,
1
)
for
points
in
all_level_points
])
# FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
bg_class_ind
=
self
.
num_classes
pos_inds
=
torch
.
where
(
((
flatten_labels
>=
0
)
&
(
flatten_labels
<
bg_class_ind
))
>
0
)[
0
]
num_pos
=
len
(
pos_inds
)
pos_bbox_preds
=
flatten_bbox_preds
[
pos_inds
]
pos_bbox_preds_refine
=
flatten_bbox_preds_refine
[
pos_inds
]
pos_labels
=
flatten_labels
[
pos_inds
]
# sync num_pos across all gpus
if
self
.
sync_num_pos
:
num_pos_avg_per_gpu
=
reduce_mean
(
pos_inds
.
new_tensor
(
num_pos
).
float
()).
item
()
num_pos_avg_per_gpu
=
max
(
num_pos_avg_per_gpu
,
1.0
)
else
:
num_pos_avg_per_gpu
=
num_pos
if
num_pos
>
0
:
pos_bbox_targets
=
flatten_bbox_targets
[
pos_inds
]
pos_points
=
flatten_points
[
pos_inds
]
pos_decoded_bbox_preds
=
distance2bbox
(
pos_points
,
pos_bbox_preds
)
pos_decoded_target_preds
=
distance2bbox
(
pos_points
,
pos_bbox_targets
)
iou_targets_ini
=
bbox_overlaps
(
pos_decoded_bbox_preds
,
pos_decoded_target_preds
.
detach
(),
is_aligned
=
True
).
clamp
(
min
=
1e-6
)
bbox_weights_ini
=
iou_targets_ini
.
clone
().
detach
()
iou_targets_ini_avg_per_gpu
=
reduce_mean
(
bbox_weights_ini
.
sum
()).
item
()
bbox_avg_factor_ini
=
max
(
iou_targets_ini_avg_per_gpu
,
1.0
)
loss_bbox
=
self
.
loss_bbox
(
pos_decoded_bbox_preds
,
pos_decoded_target_preds
.
detach
(),
weight
=
bbox_weights_ini
,
avg_factor
=
bbox_avg_factor_ini
)
pos_decoded_bbox_preds_refine
=
\
distance2bbox
(
pos_points
,
pos_bbox_preds_refine
)
iou_targets_rf
=
bbox_overlaps
(
pos_decoded_bbox_preds_refine
,
pos_decoded_target_preds
.
detach
(),
is_aligned
=
True
).
clamp
(
min
=
1e-6
)
bbox_weights_rf
=
iou_targets_rf
.
clone
().
detach
()
iou_targets_rf_avg_per_gpu
=
reduce_mean
(
bbox_weights_rf
.
sum
()).
item
()
bbox_avg_factor_rf
=
max
(
iou_targets_rf_avg_per_gpu
,
1.0
)
loss_bbox_refine
=
self
.
loss_bbox_refine
(
pos_decoded_bbox_preds_refine
,
pos_decoded_target_preds
.
detach
(),
weight
=
bbox_weights_rf
,
avg_factor
=
bbox_avg_factor_rf
)
# build IoU-aware cls_score targets
if
self
.
use_vfl
:
pos_ious
=
iou_targets_rf
.
clone
().
detach
()
cls_iou_targets
=
torch
.
zeros_like
(
flatten_cls_scores
)
cls_iou_targets
[
pos_inds
,
pos_labels
]
=
pos_ious
else
:
loss_bbox
=
pos_bbox_preds
.
sum
()
*
0
loss_bbox_refine
=
pos_bbox_preds_refine.sum() * 0
            if self.use_vfl:
                cls_iou_targets = torch.zeros_like(flatten_cls_scores)

        if self.use_vfl:
            loss_cls = self.loss_cls(
                flatten_cls_scores,
                cls_iou_targets,
                avg_factor=num_pos_avg_per_gpu)
        else:
            loss_cls = self.loss_cls(
                flatten_cls_scores,
                flatten_labels,
                weight=label_weights,
                avg_factor=num_pos_avg_per_gpu)

        return dict(
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_bbox_rf=loss_bbox_refine)

    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'bbox_preds_refine'))
    def get_bboxes(self,
                   cls_scores,
                   bbox_preds,
                   bbox_preds_refine,
                   img_metas,
                   cfg=None,
                   rescale=None,
                   with_nms=True):
        """Transform network outputs for a batch into bbox predictions.

        Args:
            cls_scores (list[Tensor]): Box iou-aware scores for each scale
                level with shape (N, num_points * num_classes, H, W).
            bbox_preds (list[Tensor]): Box offsets for each scale
                level with shape (N, num_points * 4, H, W).
            bbox_preds_refine (list[Tensor]): Refined Box offsets for
                each scale level with shape (N, num_points * 4, H, W).
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            cfg (mmcv.Config): Test / postprocessing configuration,
                if None, test_cfg would be used. Default: None.
            rescale (bool): If True, return boxes in original image space.
                Default: False.
            with_nms (bool): If True, do nms before returning boxes.
                Default: True.

        Returns:
            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
                The first item is an (n, 5) tensor, where the first 4 columns
                are bounding box positions (tl_x, tl_y, br_x, br_y) and the
                5-th column is a score between 0 and 1. The second item is a
                (n,) tensor where each item is the predicted class label of
                the corresponding box.
        """
        assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine)
        num_levels = len(cls_scores)

        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
                                      bbox_preds[0].device)
        result_list = []
        for img_id in range(len(img_metas)):
            cls_score_list = [
                cls_scores[i][img_id].detach() for i in range(num_levels)
            ]
            bbox_pred_list = [
                bbox_preds_refine[i][img_id].detach()
                for i in range(num_levels)
            ]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            det_bboxes = self._get_bboxes_single(cls_score_list,
                                                 bbox_pred_list, mlvl_points,
                                                 img_shape, scale_factor, cfg,
                                                 rescale, with_nms)
            result_list.append(det_bboxes)
        return result_list

    def _get_bboxes_single(self,
                           cls_scores,
                           bbox_preds,
                           mlvl_points,
                           img_shape,
                           scale_factor,
                           cfg,
                           rescale=False,
                           with_nms=True):
        """Transform outputs for a single batch item into bbox predictions.

        Args:
            cls_scores (list[Tensor]): Box iou-aware scores for a single scale
                level with shape (num_points * num_classes, H, W).
            bbox_preds (list[Tensor]): Box offsets for a single scale
                level with shape (num_points * 4, H, W).
            mlvl_points (list[Tensor]): Box reference for a single scale level
                with shape (num_total_points, 4).
            img_shape (tuple[int]): Shape of the input image,
                (height, width, 3).
            scale_factor (ndarray): Scale factor of the image arrange as
                (w_scale, h_scale, w_scale, h_scale).
            cfg (mmcv.Config | None): Test / postprocessing configuration,
                if None, test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.
                Default: False.
            with_nms (bool): If True, do nms before returning boxes.
                Default: True.

        Returns:
            tuple(Tensor):
                det_bboxes (Tensor): BBox predictions in shape (n, 5), where
                    the first 4 columns are bounding box positions
                    (tl_x, tl_y, br_x, br_y) and the 5-th column is a score
                    between 0 and 1.
                det_labels (Tensor): A (n,) tensor where each item is the
                    predicted class label of the corresponding box.
        """
        cfg = self.test_cfg if cfg is None else cfg
        assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
        mlvl_bboxes = []
        mlvl_scores = []
        for cls_score, bbox_pred, points in zip(cls_scores, bbox_preds,
                                                mlvl_points):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
            scores = cls_score.permute(1, 2, 0).reshape(
                -1, self.cls_out_channels).contiguous().sigmoid()
            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4).contiguous()

            nms_pre = cfg.get('nms_pre', -1)
            if 0 < nms_pre < scores.shape[0]:
                max_scores, _ = scores.max(dim=1)
                _, topk_inds = max_scores.topk(nms_pre)
                points = points[topk_inds, :]
                bbox_pred = bbox_pred[topk_inds, :]
                scores = scores[topk_inds, :]
            bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape)
            mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
        mlvl_bboxes = torch.cat(mlvl_bboxes)
        if rescale:
            mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
        mlvl_scores = torch.cat(mlvl_scores)
        padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
        # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
        # BG cat_id: num_class
        mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
        if with_nms:
            det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
                                                    cfg.score_thr, cfg.nms,
                                                    cfg.max_per_img)
            return det_bboxes, det_labels
        else:
            return mlvl_bboxes, mlvl_scores

    def _get_points_single(self,
                           featmap_size,
                           stride,
                           dtype,
                           device,
                           flatten=False):
        """Get points according to feature map sizes."""
        h, w = featmap_size
        x_range = torch.arange(
            0, w * stride, stride, dtype=dtype, device=device)
        y_range = torch.arange(
            0, h * stride, stride, dtype=dtype, device=device)
        y, x = torch.meshgrid(y_range, x_range)
        # to be compatible with anchor points in ATSS
        if self.use_atss:
            points = torch.stack(
                (x.reshape(-1), y.reshape(-1)), dim=-1) + \
                stride * self.anchor_center_offset
        else:
            points = torch.stack(
                (x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2
        return points
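    # Worked example of the grid above (comment only, not in the original
    # file): for featmap_size=(2, 3), stride=8 and use_atss=False, the offset
    # is stride // 2 = 4, so the returned points are
    #   [[4, 4], [12, 4], [20, 4], [4, 12], [12, 12], [20, 12]]
    # i.e. the centres of the 8x8 cells in row-major order (x varies fastest).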
    def get_targets(self, cls_scores, mlvl_points, gt_bboxes, gt_labels,
                    img_metas, gt_bboxes_ignore):
        """A wrapper for computing ATSS and FCOS targets for points in
        multiple images.

        Args:
            cls_scores (list[Tensor]): Box iou-aware scores for each scale
                level with shape (N, num_points * num_classes, H, W).
            mlvl_points (list[Tensor]): Points of each fpn level, each has
                shape (num_points, 2).
            gt_bboxes (list[Tensor]): Ground truth bboxes of each image,
                each has shape (num_gt, 4).
            gt_labels (list[Tensor]): Ground truth labels of each box,
                each has shape (num_gt,).
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be
                ignored, shape (num_ignored_gts, 4).

        Returns:
            tuple:
                labels_list (list[Tensor]): Labels of each level.
                label_weights (Tensor/None): Label weights of all levels.
                bbox_targets_list (list[Tensor]): Regression targets of each
                    level, (l, t, r, b).
                bbox_weights (Tensor/None): Bbox weights of all levels.
        """
        if self.use_atss:
            return self.get_atss_targets(cls_scores, mlvl_points, gt_bboxes,
                                         gt_labels, img_metas,
                                         gt_bboxes_ignore)
        else:
            self.norm_on_bbox = False
            return self.get_fcos_targets(mlvl_points, gt_bboxes, gt_labels)

    def _get_target_single(self, *args, **kwargs):
        """Avoid ambiguity in multiple inheritance."""
        if self.use_atss:
            return ATSSHead._get_target_single(self, *args, **kwargs)
        else:
            return FCOSHead._get_target_single(self, *args, **kwargs)

    def get_fcos_targets(self, points, gt_bboxes_list, gt_labels_list):
        """Compute FCOS regression and classification targets for points in
        multiple images.

        Args:
            points (list[Tensor]): Points of each fpn level, each has shape
                (num_points, 2).
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
                each has shape (num_gt, 4).
            gt_labels_list (list[Tensor]): Ground truth labels of each box,
                each has shape (num_gt,).

        Returns:
            tuple:
                labels (list[Tensor]): Labels of each level.
                label_weights: None, to be compatible with ATSS targets.
                bbox_targets (list[Tensor]): BBox targets of each level.
                bbox_weights: None, to be compatible with ATSS targets.
        """
        labels, bbox_targets = FCOSHead.get_targets(self, points,
                                                    gt_bboxes_list,
                                                    gt_labels_list)
        label_weights = None
        bbox_weights = None
        return labels, label_weights, bbox_targets, bbox_weights

    def get_atss_targets(self,
                         cls_scores,
                         mlvl_points,
                         gt_bboxes,
                         gt_labels,
                         img_metas,
                         gt_bboxes_ignore=None):
        """A wrapper for computing ATSS targets for points in multiple images.

        Args:
            cls_scores (list[Tensor]): Box iou-aware scores for each scale
                level with shape (N, num_points * num_classes, H, W).
            mlvl_points (list[Tensor]): Points of each fpn level, each has
                shape (num_points, 2).
            gt_bboxes (list[Tensor]): Ground truth bboxes of each image,
                each has shape (num_gt, 4).
            gt_labels (list[Tensor]): Ground truth labels of each box,
                each has shape (num_gt,).
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be
                ignored, shape (num_ignored_gts, 4). Default: None.

        Returns:
            tuple:
                labels_list (list[Tensor]): Labels of each level.
                label_weights (Tensor): Label weights of all levels.
                bbox_targets_list (list[Tensor]): Regression targets of each
                    level, (l, t, r, b).
                bbox_weights (Tensor): Bbox weights of all levels.
        """
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.anchor_generator.num_levels

        device = cls_scores[0].device
        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, img_metas, device=device)
        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1

        cls_reg_targets = ATSSHead.get_targets(
            self,
            anchor_list,
            valid_flag_list,
            gt_bboxes,
            img_metas,
            gt_bboxes_ignore_list=gt_bboxes_ignore,
            gt_labels_list=gt_labels,
            label_channels=label_channels,
            unmap_outputs=True)
        if cls_reg_targets is None:
            return None

        (anchor_list, labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets

        bbox_targets_list = [
            bbox_targets.reshape(-1, 4) for bbox_targets in bbox_targets_list
        ]

        num_imgs = len(img_metas)
        # transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format
        bbox_targets_list = self.transform_bbox_targets(
            bbox_targets_list, mlvl_points, num_imgs)

        labels_list = [labels.reshape(-1) for labels in labels_list]
        label_weights_list = [
            label_weights.reshape(-1) for label_weights in label_weights_list
        ]
        bbox_weights_list = [
            bbox_weights.reshape(-1) for bbox_weights in bbox_weights_list
        ]
        label_weights = torch.cat(label_weights_list)
        bbox_weights = torch.cat(bbox_weights_list)
        return labels_list, label_weights, bbox_targets_list, bbox_weights

    def transform_bbox_targets(self, decoded_bboxes, mlvl_points, num_imgs):
        """Transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format.

        Args:
            decoded_bboxes (list[Tensor]): Regression targets of each level,
                in the form of (x1, y1, x2, y2).
            mlvl_points (list[Tensor]): Points of each fpn level, each has
                shape (num_points, 2).
            num_imgs (int): the number of images in a batch.

        Returns:
            bbox_targets (list[Tensor]): Regression targets of each level in
                the form of (l, t, r, b).
        """
        # TODO: Re-implemented in Class PointCoder
        assert len(decoded_bboxes) == len(mlvl_points)
        num_levels = len(decoded_bboxes)
        mlvl_points = [points.repeat(num_imgs, 1) for points in mlvl_points]
        bbox_targets = []
        for i in range(num_levels):
            bbox_target = bbox2distance(mlvl_points[i], decoded_bboxes[i])
            bbox_targets.append(bbox_target)

        return bbox_targets
    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                              strict, missing_keys, unexpected_keys,
                              error_msgs):
        """Override the method in the parent class to avoid changing the
        parameters' names."""
        pass
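The head above converts between corner boxes and per-point (l, t, r, b) distances in both directions: distance2bbox decodes predictions in _get_bboxes_single, and bbox2distance encodes targets in transform_bbox_targets. A minimal sketch of that round trip, assuming only PyTorch; the two helpers below are simplified stand-ins for the mmdet.core versions (boundary clamping via max_shape is omitted):

import torch

def distance2bbox(points, distance):
    # points: (N, 2) centres; distance: (N, 4) as (l, t, r, b)
    x1 = points[:, 0] - distance[:, 0]
    y1 = points[:, 1] - distance[:, 1]
    x2 = points[:, 0] + distance[:, 2]
    y2 = points[:, 1] + distance[:, 3]
    return torch.stack([x1, y1, x2, y2], dim=-1)

def bbox2distance(points, bbox):
    # inverse of the above: corner box -> (l, t, r, b) around each point
    left = points[:, 0] - bbox[:, 0]
    top = points[:, 1] - bbox[:, 1]
    right = bbox[:, 2] - points[:, 0]
    bottom = bbox[:, 3] - points[:, 1]
    return torch.stack([left, top, right, bottom], dim=-1)

points = torch.tensor([[12., 12.]])
ltrb = torch.tensor([[4., 6., 8., 2.]])
box = distance2bbox(points, ltrb)          # -> [[8., 6., 20., 14.]]
assert torch.allclose(bbox2distance(points, box), ltrb)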
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/yolact_head.py
0 → 100644
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, xavier_init
from mmcv.runner import force_fp32

from mmdet.core import build_sampler, fast_nms, images_to_levels, multi_apply
from ..builder import HEADS, build_loss
from .anchor_head import AnchorHead


@HEADS.register_module()
class YOLACTHead(AnchorHead):
    """YOLACT box head used in https://arxiv.org/abs/1904.02689.

    Note that YOLACT head is a light version of RetinaNet head.
    Four differences are described as follows:

    1. YOLACT box head has three-times fewer anchors.
    2. YOLACT box head shares the convs for box and cls branches.
    3. YOLACT box head uses OHEM instead of Focal loss.
    4. YOLACT box head predicts a set of mask coefficients for each box.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        anchor_generator (dict): Config dict for anchor generator
        loss_cls (dict): Config of classification loss.
        loss_bbox (dict): Config of localization loss.
        num_head_convs (int): Number of the conv layers shared by
            box and cls branches.
        num_protos (int): Number of the mask coefficients.
        use_ohem (bool): If true, ``loss_single_OHEM`` will be used for
            cls loss calculation. If false, ``loss_single`` will be used.
        conv_cfg (dict): Dictionary to construct and config conv layer.
        norm_cfg (dict): Dictionary to construct and config norm layer.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 anchor_generator=dict(
                     type='AnchorGenerator',
                     octave_base_scale=3,
                     scales_per_octave=1,
                     ratios=[0.5, 1.0, 2.0],
                     strides=[8, 16, 32, 64, 128]),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=False,
                     reduction='none',
                     loss_weight=1.0),
                 loss_bbox=dict(
                     type='SmoothL1Loss', beta=1.0, loss_weight=1.5),
                 num_head_convs=1,
                 num_protos=32,
                 use_ohem=True,
                 conv_cfg=None,
                 norm_cfg=None,
                 **kwargs):
        self.num_head_convs = num_head_convs
        self.num_protos = num_protos
        self.use_ohem = use_ohem
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        super(YOLACTHead, self).__init__(
            num_classes,
            in_channels,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            anchor_generator=anchor_generator,
            **kwargs)
        if self.use_ohem:
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
            self.sampling = False

    def _init_layers(self):
        """Initialize layers of the head."""
        self.relu = nn.ReLU(inplace=True)
        self.head_convs = nn.ModuleList()
        for i in range(self.num_head_convs):
            chn = self.in_channels if i == 0 else self.feat_channels
            self.head_convs.append(
                ConvModule(
                    chn,
                    self.feat_channels,
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=self.conv_cfg,
                    norm_cfg=self.norm_cfg))
        self.conv_cls = nn.Conv2d(
            self.feat_channels,
            self.num_anchors * self.cls_out_channels,
            3,
            padding=1)
        self.conv_reg = nn.Conv2d(
            self.feat_channels, self.num_anchors * 4, 3, padding=1)
        self.conv_coeff = nn.Conv2d(
            self.feat_channels,
            self.num_anchors * self.num_protos,
            3,
            padding=1)

    def init_weights(self):
        """Initialize weights of the head."""
        for m in self.head_convs:
            xavier_init(m.conv, distribution='uniform', bias=0)
        xavier_init(self.conv_cls, distribution='uniform', bias=0)
        xavier_init(self.conv_reg, distribution='uniform', bias=0)
        xavier_init(self.conv_coeff, distribution='uniform', bias=0)

    def forward_single(self, x):
        """Forward feature of a single scale level.

        Args:
            x (Tensor): Features of a single scale level.

        Returns:
            tuple:
                cls_score (Tensor): Cls scores for a single scale level \
                    the channels number is num_anchors * num_classes.
                bbox_pred (Tensor): Box energies / deltas for a single scale \
                    level, the channels number is num_anchors * 4.
                coeff_pred (Tensor): Mask coefficients for a single scale \
                    level, the channels number is num_anchors * num_protos.
        """
        for head_conv in self.head_convs:
            x = head_conv(x)
        cls_score = self.conv_cls(x)
        bbox_pred = self.conv_reg(x)
        coeff_pred = self.conv_coeff(x).tanh()
        return cls_score, bbox_pred, coeff_pred

    @force_fp32(apply_to=('cls_scores', 'bbox_preds'))
    def loss(self,
             cls_scores,
             bbox_preds,
             gt_bboxes,
             gt_labels,
             img_metas,
             gt_bboxes_ignore=None):
        """A combination of the func:``AnchorHead.loss`` and
        func:``SSDHead.loss``.

        When ``self.use_ohem == True``, it functions like ``SSDHead.loss``,
        otherwise, it follows ``AnchorHead.loss``. Besides, it additionally
        returns ``sampling_results``.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_anchors * num_classes, H, W)
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W)
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss. Default: None

        Returns:
            tuple:
                dict[str, Tensor]: A dictionary of loss components.
                List[:obj:``SamplingResult``]: Sampler results for each image.
        """
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.anchor_generator.num_levels

        device = cls_scores[0].device

        anchor_list, valid_flag_list = self.get_anchors(
            featmap_sizes, img_metas, device=device)
        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
        cls_reg_targets = self.get_targets(
            anchor_list,
            valid_flag_list,
            gt_bboxes,
            img_metas,
            gt_bboxes_ignore_list=gt_bboxes_ignore,
            gt_labels_list=gt_labels,
            label_channels=label_channels,
            unmap_outputs=not self.use_ohem,
            return_sampling_results=True)
        if cls_reg_targets is None:
            return None
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg,
         sampling_results) = cls_reg_targets

        if self.use_ohem:
            num_images = len(img_metas)
            all_cls_scores = torch.cat([
                s.permute(0, 2, 3, 1).reshape(
                    num_images, -1, self.cls_out_channels) for s in cls_scores
            ], 1)
            all_labels = torch.cat(labels_list, -1).view(num_images, -1)
            all_label_weights = torch.cat(label_weights_list,
                                          -1).view(num_images, -1)
            all_bbox_preds = torch.cat([
                b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
                for b in bbox_preds
            ], -2)
            all_bbox_targets = torch.cat(bbox_targets_list,
                                         -2).view(num_images, -1, 4)
            all_bbox_weights = torch.cat(bbox_weights_list,
                                         -2).view(num_images, -1, 4)

            # concat all level anchors to a single tensor
            all_anchors = []
            for i in range(num_images):
                all_anchors.append(torch.cat(anchor_list[i]))

            # check NaN and Inf
            assert torch.isfinite(all_cls_scores).all().item(), \
                'classification scores become infinite or NaN!'
            assert torch.isfinite(all_bbox_preds).all().item(), \
                'bbox predictions become infinite or NaN!'

            losses_cls, losses_bbox = multi_apply(
                self.loss_single_OHEM,
                all_cls_scores,
                all_bbox_preds,
                all_anchors,
                all_labels,
                all_label_weights,
                all_bbox_targets,
                all_bbox_weights,
                num_total_samples=num_total_pos)
        else:
            num_total_samples = (
                num_total_pos +
                num_total_neg if self.sampling else num_total_pos)

            # anchor number of multi levels
            num_level_anchors = [
                anchors.size(0) for anchors in anchor_list[0]
            ]
            # concat all level anchors and flags to a single tensor
            concat_anchor_list = []
            for i in range(len(anchor_list)):
                concat_anchor_list.append(torch.cat(anchor_list[i]))
            all_anchor_list = images_to_levels(concat_anchor_list,
                                               num_level_anchors)
            losses_cls, losses_bbox = multi_apply(
                self.loss_single,
                cls_scores,
                bbox_preds,
                all_anchor_list,
                labels_list,
                label_weights_list,
                bbox_targets_list,
                bbox_weights_list,
                num_total_samples=num_total_samples)

        return dict(
            loss_cls=losses_cls, loss_bbox=losses_bbox), sampling_results

    def loss_single_OHEM(self, cls_score, bbox_pred, anchors, labels,
                         label_weights, bbox_targets, bbox_weights,
                         num_total_samples):
        """See func:``SSDHead.loss``."""
        loss_cls_all = self.loss_cls(cls_score, labels, label_weights)

        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
        pos_inds = ((labels >= 0) &
                    (labels < self.num_classes)).nonzero().reshape(-1)
        neg_inds = (labels == self.num_classes).nonzero().view(-1)

        num_pos_samples = pos_inds.size(0)
        if num_pos_samples == 0:
            num_neg_samples = neg_inds.size(0)
        else:
            num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples
            if num_neg_samples > neg_inds.size(0):
                num_neg_samples = neg_inds.size(0)
        topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
        loss_cls_pos = loss_cls_all[pos_inds].sum()
        loss_cls_neg = topk_loss_cls_neg.sum()
        loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples
        if self.reg_decoded_bbox:
            # When the regression loss (e.g. `IouLoss`, `GIouLoss`)
            # is applied directly on the decoded bounding boxes, it
            # decodes the already encoded coordinates to absolute format.
            bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
        loss_bbox = self.loss_bbox(
            bbox_pred,
            bbox_targets,
            bbox_weights,
            avg_factor=num_total_samples)
        return loss_cls[None], loss_bbox
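    # OHEM sketch (comment only, not in the original file): with
    # neg_pos_ratio=3 and 2 positive anchors, only the 6 negatives with the
    # highest classification loss contribute via topk; the remaining
    # negatives are dropped, which keeps easy background from dominating.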
    @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'coeff_preds'))
    def get_bboxes(self,
                   cls_scores,
                   bbox_preds,
                   coeff_preds,
                   img_metas,
                   cfg=None,
                   rescale=False):
        """Similar to func:``AnchorHead.get_bboxes``, but additionally
        processes coeff_preds.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                with shape (N, num_anchors * num_classes, H, W)
            bbox_preds (list[Tensor]): Box energies / deltas for each scale
                level with shape (N, num_anchors * 4, H, W)
            coeff_preds (list[Tensor]): Mask coefficients for each scale
                level with shape (N, num_anchors * num_protos, H, W)
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            cfg (mmcv.Config | None): Test / postprocessing configuration,
                if None, test_cfg would be used
            rescale (bool): If True, return boxes in original image space.
                Default: False.

        Returns:
            list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is
                a 3-tuple. The first item is an (n, 5) tensor, where the
                first 4 columns are bounding box positions
                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score
                between 0 and 1. The second item is an (n,) tensor where each
                item is the predicted class label of the corresponding box.
                The third item is an (n, num_protos) tensor where each item
                is the predicted mask coefficients of instance inside the
                corresponding box.
        """
        assert len(cls_scores) == len(bbox_preds)
        num_levels = len(cls_scores)

        device = cls_scores[0].device
        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
        mlvl_anchors = self.anchor_generator.grid_anchors(
            featmap_sizes, device=device)

        det_bboxes = []
        det_labels = []
        det_coeffs = []
        for img_id in range(len(img_metas)):
            cls_score_list = [
                cls_scores[i][img_id].detach() for i in range(num_levels)
            ]
            bbox_pred_list = [
                bbox_preds[i][img_id].detach() for i in range(num_levels)
            ]
            coeff_pred_list = [
                coeff_preds[i][img_id].detach() for i in range(num_levels)
            ]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            bbox_res = self._get_bboxes_single(cls_score_list, bbox_pred_list,
                                               coeff_pred_list, mlvl_anchors,
                                               img_shape, scale_factor, cfg,
                                               rescale)
            det_bboxes.append(bbox_res[0])
            det_labels.append(bbox_res[1])
            det_coeffs.append(bbox_res[2])
        return det_bboxes, det_labels, det_coeffs

    def _get_bboxes_single(self,
                           cls_score_list,
                           bbox_pred_list,
                           coeff_preds_list,
                           mlvl_anchors,
                           img_shape,
                           scale_factor,
                           cfg,
                           rescale=False):
        """Similar to func:``AnchorHead._get_bboxes_single``, but
        additionally processes coeff_preds_list and uses fast NMS instead of
        traditional NMS.

        Args:
            cls_score_list (list[Tensor]): Box scores for a single scale level
                Has shape (num_anchors * num_classes, H, W).
            bbox_pred_list (list[Tensor]): Box energies / deltas for a single
                scale level with shape (num_anchors * 4, H, W).
            coeff_preds_list (list[Tensor]): Mask coefficients for a single
                scale level with shape (num_anchors * num_protos, H, W).
            mlvl_anchors (list[Tensor]): Box reference for a single scale level
                with shape (num_total_anchors, 4).
            img_shape (tuple[int]): Shape of the input image,
                (height, width, 3).
            scale_factor (ndarray): Scale factor of the image arange as
                (w_scale, h_scale, w_scale, h_scale).
            cfg (mmcv.Config): Test / postprocessing configuration,
                if None, test_cfg would be used.
            rescale (bool): If True, return boxes in original image space.

        Returns:
            tuple[Tensor, Tensor, Tensor]: The first item is an (n, 5) tensor,
                where the first 4 columns are bounding box positions
                (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between
                0 and 1. The second item is an (n,) tensor where each item is
                the predicted class label of the corresponding box. The third
                item is an (n, num_protos) tensor where each item is the
                predicted mask coefficients of instance inside the
                corresponding box.
        """
        cfg = self.test_cfg if cfg is None else cfg
        assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors)
        mlvl_bboxes = []
        mlvl_scores = []
        mlvl_coeffs = []
        for cls_score, bbox_pred, coeff_pred, anchors in \
                zip(cls_score_list, bbox_pred_list,
                    coeff_preds_list, mlvl_anchors):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
            cls_score = cls_score.permute(
                1, 2, 0).reshape(-1, self.cls_out_channels)
            if self.use_sigmoid_cls:
                scores = cls_score.sigmoid()
            else:
                scores = cls_score.softmax(-1)
            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            coeff_pred = coeff_pred.permute(
                1, 2, 0).reshape(-1, self.num_protos)
            nms_pre = cfg.get('nms_pre', -1)
            if nms_pre > 0 and scores.shape[0] > nms_pre:
                # Get maximum scores for foreground classes.
                if self.use_sigmoid_cls:
                    max_scores, _ = scores.max(dim=1)
                else:
                    # remind that we set FG labels to [0, num_class-1]
                    # since mmdet v2.0
                    # BG cat_id: num_class
                    max_scores, _ = scores[:, :-1].max(dim=1)
                _, topk_inds = max_scores.topk(nms_pre)
                anchors = anchors[topk_inds, :]
                bbox_pred = bbox_pred[topk_inds, :]
                scores = scores[topk_inds, :]
                coeff_pred = coeff_pred[topk_inds, :]
            bboxes = self.bbox_coder.decode(
                anchors, bbox_pred, max_shape=img_shape)
            mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
            mlvl_coeffs.append(coeff_pred)
        mlvl_bboxes = torch.cat(mlvl_bboxes)
        if rescale:
            mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
        mlvl_scores = torch.cat(mlvl_scores)
        mlvl_coeffs = torch.cat(mlvl_coeffs)
        if self.use_sigmoid_cls:
            # Add a dummy background class to the backend when using sigmoid
            # remind that we set FG labels to [0, num_class-1] since mmdet v2.0
            # BG cat_id: num_class
            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
            mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
        det_bboxes, det_labels, det_coeffs = fast_nms(mlvl_bboxes, mlvl_scores,
                                                      mlvl_coeffs,
                                                      cfg.score_thr,
                                                      cfg.iou_thr, cfg.top_k,
                                                      cfg.max_per_img)
        return det_bboxes, det_labels, det_coeffs


@HEADS.register_module()
class YOLACTSegmHead(nn.Module):
    """YOLACT segmentation head used in https://arxiv.org/abs/1904.02689.

    Apply a semantic segmentation loss on feature space using layers that are
    only evaluated during training to increase performance with no speed
    penalty.

    Args:
        in_channels (int): Number of channels in the input feature map.
        num_classes (int): Number of categories excluding the background
            category.
        loss_segm (dict): Config of semantic segmentation loss.
    """

    def __init__(self,
                 num_classes,
                 in_channels=256,
                 loss_segm=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     loss_weight=1.0)):
        super(YOLACTSegmHead, self).__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.loss_segm = build_loss(loss_segm)
        self._init_layers()
        self.fp16_enabled = False

    def _init_layers(self):
        """Initialize layers of the head."""
        self.segm_conv = nn.Conv2d(
            self.in_channels, self.num_classes, kernel_size=1)

    def init_weights(self):
        """Initialize weights of the head."""
        xavier_init(self.segm_conv, distribution='uniform')

    def forward(self, x):
        """Forward feature from the upstream network.

        Args:
            x (Tensor): Feature from the upstream network, which is
                a 4D-tensor.

        Returns:
            Tensor: Predicted semantic segmentation map with shape
                (N, num_classes, H, W).
        """
        return self.segm_conv(x)

    @force_fp32(apply_to=('segm_pred', ))
    def loss(self, segm_pred, gt_masks, gt_labels):
        """Compute loss of the head.

        Args:
            segm_pred (list[Tensor]): Predicted semantic segmentation map
                with shape (N, num_classes, H, W).
            gt_masks (list[Tensor]): Ground truth masks for each image with
                the same shape of the input image.
            gt_labels (list[Tensor]): Class indices corresponding to each box.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        loss_segm = []
        num_imgs, num_classes, mask_h, mask_w = segm_pred.size()
        for idx in range(num_imgs):
            cur_segm_pred = segm_pred[idx]
            cur_gt_masks = gt_masks[idx].float()
            cur_gt_labels = gt_labels[idx]
            segm_targets = self.get_targets(cur_segm_pred, cur_gt_masks,
                                            cur_gt_labels)
            if segm_targets is None:
                loss = self.loss_segm(cur_segm_pred,
                                      torch.zeros_like(cur_segm_pred),
                                      torch.zeros_like(cur_segm_pred))
            else:
                loss = self.loss_segm(
                    cur_segm_pred,
                    segm_targets,
                    avg_factor=num_imgs * mask_h * mask_w)
            loss_segm.append(loss)
        return dict(loss_segm=loss_segm)

    def get_targets(self, segm_pred, gt_masks, gt_labels):
        """Compute semantic segmentation targets for each image.

        Args:
            segm_pred (Tensor): Predicted semantic segmentation map
                with shape (num_classes, H, W).
            gt_masks (Tensor): Ground truth masks for each image with
                the same shape of the input image.
            gt_labels (Tensor): Class indices corresponding to each box.

        Returns:
            Tensor: Semantic segmentation targets with shape
                (num_classes, H, W).
        """
        if gt_masks.size(0) == 0:
            return None
        num_classes, mask_h, mask_w = segm_pred.size()
        with torch.no_grad():
            downsampled_masks = F.interpolate(
                gt_masks.unsqueeze(0), (mask_h, mask_w),
                mode='bilinear',
                align_corners=False).squeeze(0)
            downsampled_masks = downsampled_masks.gt(0.5).float()
            segm_targets = torch.zeros_like(segm_pred, requires_grad=False)
            for obj_idx in range(downsampled_masks.size(0)):
                segm_targets[gt_labels[obj_idx] - 1] = torch.max(
                    segm_targets[gt_labels[obj_idx] - 1],
                    downsampled_masks[obj_idx])
            return segm_targets


@HEADS.register_module()
class YOLACTProtonet(nn.Module):
    """YOLACT mask head used in https://arxiv.org/abs/1904.02689.

    This head outputs the mask prototypes for YOLACT.

    Args:
        in_channels (int): Number of channels in the input feature map.
        proto_channels (tuple[int]): Output channels of protonet convs.
        proto_kernel_sizes (tuple[int]): Kernel sizes of protonet convs.
        include_last_relu (Bool): If keep the last relu of protonet.
        num_protos (int): Number of prototypes.
        num_classes (int): Number of categories excluding the background
            category.
        loss_mask_weight (float): Reweight the mask loss by this factor.
        max_masks_to_train (int): Maximum number of masks to train for
            each image.
    """

    def __init__(self,
                 num_classes,
                 in_channels=256,
                 proto_channels=(256, 256, 256, None, 256, 32),
                 proto_kernel_sizes=(3, 3, 3, -2, 3, 1),
                 include_last_relu=True,
                 num_protos=32,
                 loss_mask_weight=1.0,
                 max_masks_to_train=100):
        super(YOLACTProtonet, self).__init__()
        self.in_channels = in_channels
        self.proto_channels = proto_channels
        self.proto_kernel_sizes = proto_kernel_sizes
        self.include_last_relu = include_last_relu
        self.protonet = self._init_layers()

        self.loss_mask_weight = loss_mask_weight
        self.num_protos = num_protos
        self.num_classes = num_classes
        self.max_masks_to_train = max_masks_to_train
        self.fp16_enabled = False

    def _init_layers(self):
        """A helper function to take a config setting and turn it into a
        network."""
        # Possible patterns:
        # ( 256, 3) -> conv
        # ( 256,-2) -> deconv
        # (None,-2) -> bilinear interpolate
        in_channels = self.in_channels
        protonets = nn.ModuleList()
        for num_channels, kernel_size in zip(self.proto_channels,
                                             self.proto_kernel_sizes):
            if kernel_size > 0:
                layer = nn.Conv2d(
                    in_channels,
                    num_channels,
                    kernel_size,
                    padding=kernel_size // 2)
            else:
                if num_channels is None:
                    layer = InterpolateModule(
                        scale_factor=-kernel_size,
                        mode='bilinear',
                        align_corners=False)
                else:
                    layer = nn.ConvTranspose2d(
                        in_channels,
                        num_channels,
                        -kernel_size,
                        padding=kernel_size // 2)
            protonets.append(layer)
            protonets.append(nn.ReLU(inplace=True))
            in_channels = num_channels if num_channels is not None \
                else in_channels
        if not self.include_last_relu:
            protonets = protonets[:-1]
        return nn.Sequential(*protonets)

    def init_weights(self):
        """Initialize weights of the head."""
        for m in self.protonet:
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution='uniform')

    def forward(self, x, coeff_pred, bboxes, img_meta, sampling_results=None):
        """Forward feature from the upstream network to get prototypes and
        linearly combine the prototypes, using masks coefficients, into
        instance masks. Finally, crop the instance masks with given bboxes.

        Args:
            x (Tensor): Feature from the upstream network, which is
                a 4D-tensor.
            coeff_pred (list[Tensor]): Mask coefficients for each scale
                level with shape (N, num_anchors * num_protos, H, W).
            bboxes (list[Tensor]): Box used for cropping with shape
                (N, num_anchors * 4, H, W). During training, they are
                ground truth boxes. During testing, they are predicted
                boxes.
            img_meta (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            sampling_results (List[:obj:``SamplingResult``]): Sampler results
                for each image.

        Returns:
            list[Tensor]: Predicted instance segmentation masks.
        """
        prototypes = self.protonet(x)
        prototypes = prototypes.permute(0, 2, 3, 1).contiguous()

        num_imgs = x.size(0)
        # Training state
        if self.training:
            coeff_pred_list = []
            for coeff_pred_per_level in coeff_pred:
                coeff_pred_per_level = \
                    coeff_pred_per_level.permute(0, 2, 3, 1) \
                    .reshape(num_imgs, -1, self.num_protos)
                coeff_pred_list.append(coeff_pred_per_level)
            coeff_pred = torch.cat(coeff_pred_list, dim=1)

        mask_pred_list = []
        for idx in range(num_imgs):
            cur_prototypes = prototypes[idx]
            cur_coeff_pred = coeff_pred[idx]
            cur_bboxes = bboxes[idx]
            cur_img_meta = img_meta[idx]

            # Testing state
            if not self.training:
                bboxes_for_cropping = cur_bboxes
            else:
                cur_sampling_results = sampling_results[idx]
                pos_assigned_gt_inds = \
                    cur_sampling_results.pos_assigned_gt_inds
                bboxes_for_cropping = cur_bboxes[pos_assigned_gt_inds].clone()
                pos_inds = cur_sampling_results.pos_inds
                cur_coeff_pred = cur_coeff_pred[pos_inds]

            # Linearly combine the prototypes with the mask coefficients
            mask_pred = cur_prototypes @ cur_coeff_pred.t()
            mask_pred = torch.sigmoid(mask_pred)

            h, w = cur_img_meta['img_shape'][:2]
            bboxes_for_cropping[:, 0] /= w
            bboxes_for_cropping[:, 1] /= h
            bboxes_for_cropping[:, 2] /= w
            bboxes_for_cropping[:, 3] /= h

            mask_pred = self.crop(mask_pred, bboxes_for_cropping)
            mask_pred = mask_pred.permute(2, 0, 1).contiguous()
            mask_pred_list.append(mask_pred)
        return mask_pred_list

    @force_fp32(apply_to=('mask_pred', ))
    def loss(self, mask_pred, gt_masks, gt_bboxes, img_meta,
             sampling_results):
        """Compute loss of the head.

        Args:
            mask_pred (list[Tensor]): Predicted prototypes with shape
                (num_classes, H, W).
            gt_masks (list[Tensor]): Ground truth masks for each image with
                the same shape of the input image.
            gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            img_meta (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            sampling_results (List[:obj:``SamplingResult``]): Sampler results
                for each image.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        loss_mask = []
        num_imgs = len(mask_pred)
        total_pos = 0
        for idx in range(num_imgs):
            cur_mask_pred = mask_pred[idx]
            cur_gt_masks = gt_masks[idx].float()
            cur_gt_bboxes = gt_bboxes[idx]
            cur_img_meta = img_meta[idx]
            cur_sampling_results = sampling_results[idx]

            pos_assigned_gt_inds = cur_sampling_results.pos_assigned_gt_inds
            num_pos = pos_assigned_gt_inds.size(0)
            # Since we're producing (near) full image masks,
            # it'd take too much vram to backprop on every single mask.
            # Thus we select only a subset.
            if num_pos > self.max_masks_to_train:
                perm = torch.randperm(num_pos)
                select = perm[:self.max_masks_to_train]
                cur_mask_pred = cur_mask_pred[select]
                pos_assigned_gt_inds = pos_assigned_gt_inds[select]
                num_pos = self.max_masks_to_train
            total_pos += num_pos

            gt_bboxes_for_reweight = cur_gt_bboxes[pos_assigned_gt_inds]

            mask_targets = self.get_targets(cur_mask_pred, cur_gt_masks,
                                            pos_assigned_gt_inds)
            if num_pos == 0:
                loss = cur_mask_pred.sum() * 0.
            elif mask_targets is None:
                loss = F.binary_cross_entropy(
                    cur_mask_pred, torch.zeros_like(cur_mask_pred),
                    torch.zeros_like(cur_mask_pred))
            else:
                cur_mask_pred = torch.clamp(cur_mask_pred, 0, 1)
                loss = F.binary_cross_entropy(
                    cur_mask_pred, mask_targets,
                    reduction='none') * self.loss_mask_weight

                h, w = cur_img_meta['img_shape'][:2]
                gt_bboxes_width = (gt_bboxes_for_reweight[:, 2] -
                                   gt_bboxes_for_reweight[:, 0]) / w
                gt_bboxes_height = (gt_bboxes_for_reweight[:, 3] -
                                    gt_bboxes_for_reweight[:, 1]) / h
                loss = loss.mean(
                    dim=(1, 2)) / gt_bboxes_width / gt_bboxes_height
                loss = torch.sum(loss)
            loss_mask.append(loss)

        if total_pos == 0:
            total_pos += 1  # avoid nan
        loss_mask = [x / total_pos for x in loss_mask]

        return dict(loss_mask=loss_mask)

    def get_targets(self, mask_pred, gt_masks, pos_assigned_gt_inds):
        """Compute instance segmentation targets for each image.

        Args:
            mask_pred (Tensor): Predicted prototypes with shape
                (num_classes, H, W).
            gt_masks (Tensor): Ground truth masks for each image with
                the same shape of the input image.
            pos_assigned_gt_inds (Tensor): GT indices of the corresponding
                positive samples.

        Returns:
            Tensor: Instance segmentation targets with shape
                (num_instances, H, W).
        """
        if gt_masks.size(0) == 0:
            return None
        mask_h, mask_w = mask_pred.shape[-2:]
        gt_masks = F.interpolate(
            gt_masks.unsqueeze(0), (mask_h, mask_w),
            mode='bilinear',
            align_corners=False).squeeze(0)
        gt_masks = gt_masks.gt(0.5).float()
        mask_targets = gt_masks[pos_assigned_gt_inds]
        return mask_targets

    def get_seg_masks(self, mask_pred, label_pred, img_meta, rescale):
        """Resize, binarize, and format the instance mask predictions.

        Args:
            mask_pred (Tensor): shape (N, H, W).
            label_pred (Tensor): shape (N, ).
            img_meta (dict): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            rescale (bool): If rescale is False, then returned masks will
                fit the scale of imgs[0].

        Returns:
            list[ndarray]: Mask predictions grouped by their predicted
                classes.
        """
        ori_shape = img_meta['ori_shape']
        scale_factor = img_meta['scale_factor']
        if rescale:
            img_h, img_w = ori_shape[:2]
        else:
            img_h = np.round(ori_shape[0] * scale_factor[1]).astype(np.int32)
            img_w = np.round(ori_shape[1] * scale_factor[0]).astype(np.int32)

        cls_segms = [[] for _ in range(self.num_classes)]
        if mask_pred.size(0) == 0:
            return cls_segms

        mask_pred = F.interpolate(
            mask_pred.unsqueeze(0), (img_h, img_w),
            mode='bilinear',
            align_corners=False).squeeze(0) > 0.5
        mask_pred = mask_pred.cpu().numpy().astype(np.uint8)

        for m, l in zip(mask_pred, label_pred):
            cls_segms[l].append(m)
        return cls_segms

    def crop(self, masks, boxes, padding=1):
        """Crop predicted masks by zeroing out everything not in the predicted
        bbox.

        Args:
            masks (Tensor): shape [H, W, N].
            boxes (Tensor): bbox coords in relative point form with
                shape [N, 4].

        Return:
            Tensor: The cropped masks.
        """
        h, w, n = masks.size()
        x1, x2 = self.sanitize_coordinates(
            boxes[:, 0], boxes[:, 2], w, padding, cast=False)
        y1, y2 = self.sanitize_coordinates(
            boxes[:, 1], boxes[:, 3], h, padding, cast=False)

        rows = torch.arange(
            w, device=masks.device, dtype=x1.dtype).view(1, -1,
                                                         1).expand(h, w, n)
        cols = torch.arange(
            h, device=masks.device, dtype=x1.dtype).view(-1, 1,
                                                         1).expand(h, w, n)

        masks_left = rows >= x1.view(1, 1, -1)
        masks_right = rows < x2.view(1, 1, -1)
        masks_up = cols >= y1.view(1, 1, -1)
        masks_down = cols < y2.view(1, 1, -1)

        crop_mask = masks_left * masks_right * masks_up * masks_down

        return masks * crop_mask.float()

    def sanitize_coordinates(self, x1, x2, img_size, padding=0, cast=True):
        """Sanitizes the input coordinates so that x1 < x2, x1 != x2, x1 >= 0,
        and x2 <= image_size. Also converts from relative to absolute
        coordinates and casts the results to long tensors.

        Warning: this does things in-place behind the scenes so
        copy if necessary.

        Args:
            x1 (Tensor): shape (N, ).
            x2 (Tensor): shape (N, ).
            img_size (int): Size of the input image.
            padding (int): x1 >= padding, x2 <= image_size-padding.
            cast (bool): If cast is false, the result won't be cast to longs.

        Returns:
            tuple:
                x1 (Tensor): Sanitized x1.
                x2 (Tensor): Sanitized x2.
        """
        x1 = x1 * img_size
        x2 = x2 * img_size
        if cast:
            x1 = x1.long()
            x2 = x2.long()
        x1 = torch.min(x1, x2)
        x2 = torch.max(x1, x2)
        x1 = torch.clamp(x1 - padding, min=0)
        x2 = torch.clamp(x2 + padding, max=img_size)
        return x1, x2


class InterpolateModule(nn.Module):
    """This is a module version of F.interpolate.

    Any arguments you give it just get passed along for the ride.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.args = args
        self.kwargs = kwargs

    def forward(self, x):
        """Forward features from the upstream network."""
        return F.interpolate(x, *self.args, **self.kwargs)
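YOLACT's mask assembly in YOLACTProtonet.forward reduces to one matrix product: per-pixel prototype activations times per-instance coefficient vectors. A minimal standalone sketch of that step, assuming random tensors in place of real network outputs:

import torch

h, w, num_protos, num_dets = 138, 138, 32, 5
prototypes = torch.rand(h, w, num_protos)   # protonet output, (H, W, P)
coeffs = torch.rand(num_dets, num_protos)   # one coefficient row per box

# linear combination, as in YOLACTProtonet.forward above
masks = torch.sigmoid(prototypes @ coeffs.t())  # (H, W, num_dets)
print(masks.shape)  # torch.Size([138, 138, 5])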
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/dense_heads/yolo_head.py
0 → 100644
# Copyright (c) 2019 Western Digital Corporation or its affiliates.

import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, normal_init
from mmcv.runner import force_fp32

from mmdet.core import (build_anchor_generator, build_assigner,
                        build_bbox_coder, build_sampler, images_to_levels,
                        multi_apply, multiclass_nms)
from ..builder import HEADS, build_loss
from .base_dense_head import BaseDenseHead
from .dense_test_mixins import BBoxTestMixin


@HEADS.register_module()
class YOLOV3Head(BaseDenseHead, BBoxTestMixin):
    """YOLOV3Head Paper link: https://arxiv.org/abs/1804.02767.

    Args:
        num_classes (int): The number of object classes (w/o background)
        in_channels (List[int]): Number of input channels per scale.
        out_channels (List[int]): The number of output channels per scale
            before the final 1x1 layer. Default: (1024, 512, 256).
        anchor_generator (dict): Config dict for anchor generator
        bbox_coder (dict): Config of bounding box coder.
        featmap_strides (List[int]): The stride of each scale.
            Should be in descending order. Default: (32, 16, 8).
        one_hot_smoother (float): Set a non-zero value to enable label-smooth
            Default: 0.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Default: dict(type='BN', requires_grad=True)
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='LeakyReLU', negative_slope=0.1).
        loss_cls (dict): Config of classification loss.
        loss_conf (dict): Config of confidence loss.
        loss_xy (dict): Config of xy coordinate loss.
        loss_wh (dict): Config of wh coordinate loss.
        train_cfg (dict): Training config of YOLOV3 head. Default: None.
        test_cfg (dict): Testing config of YOLOV3 head. Default: None.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 out_channels=(1024, 512, 256),
                 anchor_generator=dict(
                     type='YOLOAnchorGenerator',
                     base_sizes=[[(116, 90), (156, 198), (373, 326)],
                                 [(30, 61), (62, 45), (59, 119)],
                                 [(10, 13), (16, 30), (33, 23)]],
                     strides=[32, 16, 8]),
                 bbox_coder=dict(type='YOLOBBoxCoder'),
                 featmap_strides=[32, 16, 8],
                 one_hot_smoother=0.,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     loss_weight=1.0),
                 loss_conf=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     loss_weight=1.0),
                 loss_xy=dict(
                     type='CrossEntropyLoss',
                     use_sigmoid=True,
                     loss_weight=1.0),
                 loss_wh=dict(type='MSELoss', loss_weight=1.0),
                 train_cfg=None,
                 test_cfg=None):
        super(YOLOV3Head, self).__init__()
        # Check params
        assert (len(in_channels) == len(out_channels) == len(featmap_strides))

        self.num_classes = num_classes
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.featmap_strides = featmap_strides
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        if self.train_cfg:
            self.assigner = build_assigner(self.train_cfg.assigner)
            if hasattr(self.train_cfg, 'sampler'):
                sampler_cfg = self.train_cfg.sampler
            else:
                sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)

        self.one_hot_smoother = one_hot_smoother

        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.anchor_generator = build_anchor_generator(anchor_generator)

        self.loss_cls = build_loss(loss_cls)
        self.loss_conf = build_loss(loss_conf)
        self.loss_xy = build_loss(loss_xy)
        self.loss_wh = build_loss(loss_wh)
        # usually the numbers of anchors for each level are the same
        # except SSD detectors
        self.num_anchors = self.anchor_generator.num_base_anchors[0]
        assert len(
            self.anchor_generator.num_base_anchors) == len(featmap_strides)
        self._init_layers()

    @property
    def num_levels(self):
        return len(self.featmap_strides)

    @property
    def num_attrib(self):
        """int: number of attributes in pred_map, bboxes (4) +
        objectness (1) + num_classes"""
        return 5 + self.num_classes

    def _init_layers(self):
        self.convs_bridge = nn.ModuleList()
        self.convs_pred = nn.ModuleList()
        for i in range(self.num_levels):
            conv_bridge = ConvModule(
                self.in_channels[i],
                self.out_channels[i],
                3,
                padding=1,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
            conv_pred = nn.Conv2d(self.out_channels[i],
                                  self.num_anchors * self.num_attrib, 1)

            self.convs_bridge.append(conv_bridge)
            self.convs_pred.append(conv_pred)

    def init_weights(self):
        """Initialize weights of the head."""
        for m in self.convs_pred:
            normal_init(m, std=0.01)

    def forward(self, feats):
        """Forward features from the upstream network.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.

        Returns:
            tuple[Tensor]: A tuple of multi-level predication map, each is a
                4D-tensor of shape (batch_size, 5+num_classes, height, width).
        """
        assert len(feats) == self.num_levels
        pred_maps = []
        for i in range(self.num_levels):
            x = feats[i]
            x = self.convs_bridge[i](x)
            pred_map = self.convs_pred[i](x)
            pred_maps.append(pred_map)

        return tuple(pred_maps),
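    # Layout sketch (comment only, not in the original file): each prediction
    # map stores num_anchors blocks of num_attrib channels per cell. With 3
    # anchors and 80 classes, num_attrib = 85 and a map has 255 channels:
    # [tx, ty, tw, th, objectness, 80 class logits] repeated 3 times.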
@
force_fp32
(
apply_to
=
(
'pred_maps'
,
))
def
get_bboxes
(
self
,
pred_maps
,
img_metas
,
cfg
=
None
,
rescale
=
False
,
with_nms
=
True
):
"""Transform network output for a batch into bbox predictions.
Args:
pred_maps (list[Tensor]): Raw predictions for a batch of images.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used. Default: None.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before return boxes.
Default: True.
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
The first item is an (n, 5) tensor, where the first 4 columns
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
5-th column is a score between 0 and 1. The second item is a
(n,) tensor where each item is the predicted class label of the
corresponding box.
"""
result_list
=
[]
num_levels
=
len
(
pred_maps
)
for
img_id
in
range
(
len
(
img_metas
)):
pred_maps_list
=
[
pred_maps
[
i
][
img_id
].
detach
()
for
i
in
range
(
num_levels
)
]
scale_factor
=
img_metas
[
img_id
][
'scale_factor'
]
proposals
=
self
.
_get_bboxes_single
(
pred_maps_list
,
scale_factor
,
cfg
,
rescale
,
with_nms
)
result_list
.
append
(
proposals
)
return
result_list
def
_get_bboxes_single
(
self
,
pred_maps_list
,
scale_factor
,
cfg
,
rescale
=
False
,
with_nms
=
True
):
"""Transform outputs for a single batch item into bbox predictions.
Args:
pred_maps_list (list[Tensor]): Prediction maps for different scales
of each single image in the batch.
scale_factor (ndarray): Scale factor of the image arrange as
(w_scale, h_scale, w_scale, h_scale).
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before return boxes.
Default: True.
Returns:
tuple(Tensor):
det_bboxes (Tensor): BBox predictions in shape (n, 5), where
the first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column is a score
between 0 and 1.
det_labels (Tensor): A (n,) tensor where each item is the
predicted class label of the corresponding box.
"""
cfg
=
self
.
test_cfg
if
cfg
is
None
else
cfg
assert
len
(
pred_maps_list
)
==
self
.
num_levels
multi_lvl_bboxes
=
[]
multi_lvl_cls_scores
=
[]
multi_lvl_conf_scores
=
[]
num_levels
=
len
(
pred_maps_list
)
featmap_sizes
=
[
pred_maps_list
[
i
].
shape
[
-
2
:]
for
i
in
range
(
num_levels
)
]
multi_lvl_anchors
=
self
.
anchor_generator
.
grid_anchors
(
featmap_sizes
,
pred_maps_list
[
0
][
0
].
device
)
for
i
in
range
(
self
.
num_levels
):
# get some key info for current scale
pred_map
=
pred_maps_list
[
i
]
stride
=
self
.
featmap_strides
[
i
]
# (h, w, num_anchors*num_attrib) -> (h*w*num_anchors, num_attrib)
pred_map
=
pred_map
.
permute
(
1
,
2
,
0
).
reshape
(
-
1
,
self
.
num_attrib
)
pred_map
[...,
:
2
]
=
torch
.
sigmoid
(
pred_map
[...,
:
2
])
bbox_pred
=
self
.
bbox_coder
.
decode
(
multi_lvl_anchors
[
i
],
pred_map
[...,
:
4
],
stride
)
# conf and cls
conf_pred
=
torch
.
sigmoid
(
pred_map
[...,
4
]).
view
(
-
1
)
cls_pred
=
torch
.
sigmoid
(
pred_map
[...,
5
:]).
view
(
-
1
,
self
.
num_classes
)
# Cls pred one-hot.
# Filtering out all predictions with conf < conf_thr
conf_thr
=
cfg
.
get
(
'conf_thr'
,
-
1
)
if
conf_thr
>
0
:
# add as_tuple=False for compatibility in Pytorch 1.6
# flatten would create a Reshape op with constant values,
# and raise RuntimeError when doing inference in ONNX Runtime
# with a different input image (#4221).
conf_inds
=
conf_pred
.
ge
(
conf_thr
).
nonzero
(
as_tuple
=
False
).
squeeze
(
1
)
bbox_pred
=
bbox_pred
[
conf_inds
,
:]
cls_pred
=
cls_pred
[
conf_inds
,
:]
conf_pred
=
conf_pred
[
conf_inds
]
# Get top-k prediction
nms_pre
=
cfg
.
get
(
'nms_pre'
,
-
1
)
if
0
<
nms_pre
<
conf_pred
.
size
(
0
)
and
(
not
torch
.
onnx
.
is_in_onnx_export
()):
_
,
topk_inds
=
conf_pred
.
topk
(
nms_pre
)
bbox_pred
=
bbox_pred
[
topk_inds
,
:]
cls_pred
=
cls_pred
[
topk_inds
,
:]
conf_pred
=
conf_pred
[
topk_inds
]
# Save the result of current scale
multi_lvl_bboxes
.
append
(
bbox_pred
)
multi_lvl_cls_scores
.
append
(
cls_pred
)
multi_lvl_conf_scores
.
append
(
conf_pred
)
# Merge the results of different scales together
multi_lvl_bboxes
=
torch
.
cat
(
multi_lvl_bboxes
)
multi_lvl_cls_scores
=
torch
.
cat
(
multi_lvl_cls_scores
)
multi_lvl_conf_scores
=
torch
.
cat
(
multi_lvl_conf_scores
)
if
with_nms
and
(
multi_lvl_conf_scores
.
size
(
0
)
==
0
):
return
torch
.
zeros
((
0
,
5
)),
torch
.
zeros
((
0
,
))
if
rescale
:
multi_lvl_bboxes
/=
multi_lvl_bboxes
.
new_tensor
(
scale_factor
)
# In mmdet 2.x, the class_id for background is num_classes.
# i.e., the last column.
padding
=
multi_lvl_cls_scores
.
new_zeros
(
multi_lvl_cls_scores
.
shape
[
0
],
1
)
multi_lvl_cls_scores
=
torch
.
cat
([
multi_lvl_cls_scores
,
padding
],
dim
=
1
)
# Support exporting to onnx without nms
if
with_nms
and
cfg
.
get
(
'nms'
,
None
)
is
not
None
:
det_bboxes
,
det_labels
=
multiclass_nms
(
multi_lvl_bboxes
,
multi_lvl_cls_scores
,
cfg
.
score_thr
,
cfg
.
nms
,
cfg
.
max_per_img
,
score_factors
=
multi_lvl_conf_scores
)
return
det_bboxes
,
det_labels
else
:
return
(
multi_lvl_bboxes
,
multi_lvl_cls_scores
,
multi_lvl_conf_scores
)
@
force_fp32
(
apply_to
=
(
'pred_maps'
,
))
def
loss
(
self
,
pred_maps
,
gt_bboxes
,
gt_labels
,
img_metas
,
gt_bboxes_ignore
=
None
):
"""Compute loss of the head.
Args:
pred_maps (list[Tensor]): Prediction map for each scale level,
shape (N, num_anchors * num_attrib, H, W)
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
num_imgs
=
len
(
img_metas
)
device
=
pred_maps
[
0
][
0
].
device
featmap_sizes
=
[
pred_maps
[
i
].
shape
[
-
2
:]
for
i
in
range
(
self
.
num_levels
)
]
multi_level_anchors
=
self
.
anchor_generator
.
grid_anchors
(
featmap_sizes
,
device
)
anchor_list
=
[
multi_level_anchors
for
_
in
range
(
num_imgs
)]
responsible_flag_list
=
[]
for
img_id
in
range
(
len
(
img_metas
)):
responsible_flag_list
.
append
(
self
.
anchor_generator
.
responsible_flags
(
featmap_sizes
,
gt_bboxes
[
img_id
],
device
))
target_maps_list
,
neg_maps_list
=
self
.
get_targets
(
anchor_list
,
responsible_flag_list
,
gt_bboxes
,
gt_labels
)
losses_cls
,
losses_conf
,
losses_xy
,
losses_wh
=
multi_apply
(
self
.
loss_single
,
pred_maps
,
target_maps_list
,
neg_maps_list
)
return
dict
(
loss_cls
=
losses_cls
,
loss_conf
=
losses_conf
,
loss_xy
=
losses_xy
,
loss_wh
=
losses_wh
)
def
loss_single
(
self
,
pred_map
,
target_map
,
neg_map
):
"""Compute loss of a single image from a batch.
Args:
pred_map (Tensor): Raw predictions for a single level.
target_map (Tensor): The Ground-Truth target for a single level.
neg_map (Tensor): The negative masks for a single level.
Returns:
tuple:
loss_cls (Tensor): Classification loss.
loss_conf (Tensor): Confidence loss.
loss_xy (Tensor): Regression loss of x, y coordinate.
loss_wh (Tensor): Regression loss of w, h coordinate.
"""
num_imgs
=
len
(
pred_map
)
pred_map
=
pred_map
.
permute
(
0
,
2
,
3
,
1
).
reshape
(
num_imgs
,
-
1
,
self
.
num_attrib
)
neg_mask
=
neg_map
.
float
()
pos_mask
=
target_map
[...,
4
]
pos_and_neg_mask
=
neg_mask
+
pos_mask
pos_mask
=
pos_mask
.
unsqueeze
(
dim
=-
1
)
if
torch
.
max
(
pos_and_neg_mask
)
>
1.
:
warnings
.
warn
(
'There is overlap between pos and neg sample.'
)
pos_and_neg_mask
=
pos_and_neg_mask
.
clamp
(
min
=
0.
,
max
=
1.
)
pred_xy
=
pred_map
[...,
:
2
]
pred_wh
=
pred_map
[...,
2
:
4
]
pred_conf
=
pred_map
[...,
4
]
pred_label
=
pred_map
[...,
5
:]
target_xy
=
target_map
[...,
:
2
]
target_wh
=
target_map
[..., 2:4]
        target_conf = target_map[..., 4]
        target_label = target_map[..., 5:]

        loss_cls = self.loss_cls(pred_label, target_label, weight=pos_mask)
        loss_conf = self.loss_conf(
            pred_conf, target_conf, weight=pos_and_neg_mask)
        loss_xy = self.loss_xy(pred_xy, target_xy, weight=pos_mask)
        loss_wh = self.loss_wh(pred_wh, target_wh, weight=pos_mask)

        return loss_cls, loss_conf, loss_xy, loss_wh

    def get_targets(self, anchor_list, responsible_flag_list, gt_bboxes_list,
                    gt_labels_list):
        """Compute target maps for anchors in multiple images.

        Args:
            anchor_list (list[list[Tensor]]): Multi level anchors of each
                image. The outer list indicates images, and the inner list
                corresponds to feature levels of the image. Each element of
                the inner list is a tensor of shape (num_total_anchors, 4).
            responsible_flag_list (list[list[Tensor]]): Multi level responsible
                flags of each image. Each element is a tensor of shape
                (num_total_anchors, ).
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
            gt_labels_list (list[Tensor]): Ground truth labels of each box.

        Returns:
            tuple: Usually returns a tuple containing learning targets.
                - target_map_list (list[Tensor]): Target map of each level.
                - neg_map_list (list[Tensor]): Negative map of each level.
        """
        num_imgs = len(anchor_list)

        # anchor number of multi levels
        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]

        results = multi_apply(self._get_targets_single, anchor_list,
                              responsible_flag_list, gt_bboxes_list,
                              gt_labels_list)

        all_target_maps, all_neg_maps = results
        assert num_imgs == len(all_target_maps) == len(all_neg_maps)

        target_maps_list = images_to_levels(all_target_maps,
                                            num_level_anchors)
        neg_maps_list = images_to_levels(all_neg_maps, num_level_anchors)

        return target_maps_list, neg_maps_list

    def _get_targets_single(self, anchors, responsible_flags, gt_bboxes,
                            gt_labels):
        """Generate matching bounding box prior and converted GT.

        Args:
            anchors (list[Tensor]): Multi-level anchors of the image.
            responsible_flags (list[Tensor]): Multi-level responsible flags of
                anchors.
            gt_bboxes (Tensor): Ground truth bboxes of single image.
            gt_labels (Tensor): Ground truth labels of single image.

        Returns:
            tuple:
                target_map (Tensor): Prediction target map of each
                    scale level, shape (num_total_anchors,
                    5+num_classes).
                neg_map (Tensor): Negative map of each scale level,
                    shape (num_total_anchors,).
        """
        anchor_strides = []
        for i in range(len(anchors)):
            anchor_strides.append(
                torch.tensor(self.featmap_strides[i],
                             device=gt_bboxes.device).repeat(len(anchors[i])))
        concat_anchors = torch.cat(anchors)
        concat_responsible_flags = torch.cat(responsible_flags)

        anchor_strides = torch.cat(anchor_strides)
        assert len(anchor_strides) == len(concat_anchors) == \
            len(concat_responsible_flags)
        assign_result = self.assigner.assign(concat_anchors,
                                             concat_responsible_flags,
                                             gt_bboxes)
        sampling_result = self.sampler.sample(assign_result, concat_anchors,
                                              gt_bboxes)

        target_map = concat_anchors.new_zeros(
            concat_anchors.size(0), self.num_attrib)

        target_map[sampling_result.pos_inds, :4] = self.bbox_coder.encode(
            sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes,
            anchor_strides[sampling_result.pos_inds])

        target_map[sampling_result.pos_inds, 4] = 1

        gt_labels_one_hot = F.one_hot(
            gt_labels, num_classes=self.num_classes).float()
        if self.one_hot_smoother != 0:  # label smooth
            gt_labels_one_hot = gt_labels_one_hot * (
                1 - self.one_hot_smoother
            ) + self.one_hot_smoother / self.num_classes
        target_map[sampling_result.pos_inds,
                   5:] = gt_labels_one_hot[sampling_result.pos_assigned_gt_inds]

        neg_map = concat_anchors.new_zeros(
            concat_anchors.size(0), dtype=torch.uint8)
        neg_map[sampling_result.neg_inds] = 1

        return target_map, neg_map

    def aug_test(self, feats, img_metas, rescale=False):
        """Test function with test time augmentation.

        Args:
            feats (list[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains features for all images in the batch.
            img_metas (list[list[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch. each dict has image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[ndarray]: bbox results of each class
        """
        return self.aug_test_bboxes(feats, img_metas, rescale=rescale)
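A minimal, self-contained sketch of the label-smoothing step used in `_get_targets_single` above (the `one_hot_smoother` arithmetic); the class count and smoothing value here are illustrative only:

import torch
import torch.nn.functional as F

num_classes = 4
one_hot_smoother = 0.1  # illustrative value; configured on the head in practice
gt_labels = torch.tensor([0, 2])

one_hot = F.one_hot(gt_labels, num_classes=num_classes).float()
# Each 1 becomes 1 - smoother + smoother / C; each 0 becomes smoother / C.
smoothed = one_hot * (1 - one_hot_smoother) + one_hot_smoother / num_classes
print(smoothed)
# tensor([[0.9250, 0.0250, 0.0250, 0.0250],
#         [0.0250, 0.0250, 0.9250, 0.0250]])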
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/__init__.py
0 → 100644
View file @ 142dcf29
from .atss import ATSS
from .base import BaseDetector
from .cascade_rcnn import CascadeRCNN
from .cornernet import CornerNet
from .detr import DETR
from .fast_rcnn import FastRCNN
from .faster_rcnn import FasterRCNN
from .fcos import FCOS
from .fovea import FOVEA
from .fsaf import FSAF
from .gfl import GFL
from .grid_rcnn import GridRCNN
from .htc import HybridTaskCascade
from .mask_rcnn import MaskRCNN
from .mask_scoring_rcnn import MaskScoringRCNN
from .nasfcos import NASFCOS
from .paa import PAA
from .point_rend import PointRend
from .reppoints_detector import RepPointsDetector
from .retinanet import RetinaNet
from .rpn import RPN
from .single_stage import SingleStageDetector
from .sparse_rcnn import SparseRCNN
from .trident_faster_rcnn import TridentFasterRCNN
from .two_stage import TwoStageDetector
from .vfnet import VFNet
from .yolact import YOLACT
from .yolo import YOLOV3

__all__ = [
    'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
    'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade',
    'RetinaNet', 'FCOS', 'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector',
    'FOVEA', 'FSAF', 'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA',
    'YOLOV3', 'YOLACT', 'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN'
]
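All of these classes are registered in the `DETECTORS` registry (see the `@DETECTORS.register_module()` decorators in the files below), so a detector is normally built from a config dict rather than imported directly. A minimal sketch, where the config path is a placeholder for any complete mmdetection model config:

from mmcv import Config
from mmdet.models import build_detector

# 'configs/yolo/yolov3_d53_608.py' is a hypothetical path; any full
# mmdetection model config works the same way.
cfg = Config.fromfile('configs/yolo/yolov3_d53_608.py')
model = build_detector(cfg.model, train_cfg=cfg.get('train_cfg'),
                       test_cfg=cfg.get('test_cfg'))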
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/atss.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .single_stage import SingleStageDetector


@DETECTORS.register_module()
class ATSS(SingleStageDetector):
    """Implementation of `ATSS <https://arxiv.org/abs/1912.02424>`_."""

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(ATSS, self).__init__(backbone, neck, bbox_head, train_cfg,
                                   test_cfg, pretrained)
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/base.py
0 → 100644
View file @ 142dcf29
from abc import ABCMeta, abstractmethod
from collections import OrderedDict

import mmcv
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from mmcv.runner import auto_fp16
from mmcv.utils import print_log

from mmdet.core.visualization import imshow_det_bboxes
from mmdet.utils import get_root_logger


class BaseDetector(nn.Module, metaclass=ABCMeta):
    """Base class for detectors."""

    def __init__(self):
        super(BaseDetector, self).__init__()
        self.fp16_enabled = False

    @property
    def with_neck(self):
        """bool: whether the detector has a neck"""
        return hasattr(self, 'neck') and self.neck is not None

    # TODO: these properties need to be carefully handled
    # for both single stage & two stage detectors
    @property
    def with_shared_head(self):
        """bool: whether the detector has a shared head in the RoI Head"""
        return hasattr(self, 'roi_head') and self.roi_head.with_shared_head

    @property
    def with_bbox(self):
        """bool: whether the detector has a bbox head"""
        return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox)
                or (hasattr(self, 'bbox_head') and self.bbox_head is not None))

    @property
    def with_mask(self):
        """bool: whether the detector has a mask head"""
        return ((hasattr(self, 'roi_head') and self.roi_head.with_mask)
                or (hasattr(self, 'mask_head') and self.mask_head is not None))

    @abstractmethod
    def extract_feat(self, imgs):
        """Extract features from images."""
        pass

    def extract_feats(self, imgs):
        """Extract features from multiple images.

        Args:
            imgs (list[torch.Tensor]): A list of images. The images are
                augmented from the same image but in different ways.

        Returns:
            list[torch.Tensor]: Features of different images.
        """
        assert isinstance(imgs, list)
        return [self.extract_feat(img) for img in imgs]

    def forward_train(self, imgs, img_metas, **kwargs):
        """
        Args:
            imgs (list[Tensor]): List of tensors of shape (1, C, H, W).
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys, see
                :class:`mmdet.datasets.pipelines.Collect`.
            kwargs (keyword arguments): Specific to concrete implementation.
        """
        # NOTE the batched image size information may be useful, e.g.
        # in DETR, this is needed for the construction of masks, which is
        # then used for the transformer_head.
        batch_input_shape = tuple(imgs[0].size()[-2:])
        for img_meta in img_metas:
            img_meta['batch_input_shape'] = batch_input_shape

    async def async_simple_test(self, img, img_metas, **kwargs):
        raise NotImplementedError

    @abstractmethod
    def simple_test(self, img, img_metas, **kwargs):
        pass

    @abstractmethod
    def aug_test(self, imgs, img_metas, **kwargs):
        """Test function with test time augmentation."""
        pass

    def init_weights(self, pretrained=None):
        """Initialize the weights in detector.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        if pretrained is not None:
            logger = get_root_logger()
            print_log(f'load model from: {pretrained}', logger=logger)

    async def aforward_test(self, *, img, img_metas, **kwargs):
        for var, name in [(img, 'img'), (img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError(f'{name} must be a list, but got {type(var)}')

        num_augs = len(img)
        if num_augs != len(img_metas):
            raise ValueError(f'num of augmentations ({len(img)}) '
                             f'!= num of image metas ({len(img_metas)})')
        # TODO: remove the restriction of samples_per_gpu == 1 when prepared
        samples_per_gpu = img[0].size(0)
        assert samples_per_gpu == 1

        if num_augs == 1:
            return await self.async_simple_test(img[0], img_metas[0], **kwargs)
        else:
            raise NotImplementedError

    def forward_test(self, imgs, img_metas, **kwargs):
        """
        Args:
            imgs (List[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
            img_metas (List[List[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch.
        """
        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError(f'{name} must be a list, but got {type(var)}')

        num_augs = len(imgs)
        if num_augs != len(img_metas):
            raise ValueError(f'num of augmentations ({len(imgs)}) '
                             f'!= num of image meta ({len(img_metas)})')

        # NOTE the batched image size information may be useful, e.g.
        # in DETR, this is needed for the construction of masks, which is
        # then used for the transformer_head.
        for img, img_meta in zip(imgs, img_metas):
            batch_size = len(img_meta)
            for img_id in range(batch_size):
                img_meta[img_id]['batch_input_shape'] = tuple(img.size()[-2:])

        if num_augs == 1:
            # proposals (List[List[Tensor]]): the outer list indicates
            # test-time augs (multiscale, flip, etc.) and the inner list
            # indicates images in a batch.
            # The Tensor should have a shape Px4, where P is the number of
            # proposals.
            if 'proposals' in kwargs:
                kwargs['proposals'] = kwargs['proposals'][0]
            return self.simple_test(imgs[0], img_metas[0], **kwargs)
        else:
            assert imgs[0].size(0) == 1, 'aug test does not support ' \
                                         'inference with batch size ' \
                                         f'{imgs[0].size(0)}'
            # TODO: support test augmentation for predefined proposals
            assert 'proposals' not in kwargs
            return self.aug_test(imgs, img_metas, **kwargs)

    @auto_fp16(apply_to=('img', ))
    def forward(self, img, img_metas, return_loss=True, **kwargs):
        """Calls either :func:`forward_train` or :func:`forward_test` depending
        on whether ``return_loss`` is ``True``.

        Note this setting will change the expected inputs. When
        ``return_loss=True``, img and img_metas are single-nested (i.e. Tensor
        and List[dict]), and when ``return_loss=False``, img and img_metas
        should be double nested (i.e. List[Tensor], List[List[dict]]), with
        the outer list indicating test time augmentations.
        """
        if return_loss:
            return self.forward_train(img, img_metas, **kwargs)
        else:
            return self.forward_test(img, img_metas, **kwargs)

    def _parse_losses(self, losses):
        """Parse the raw outputs (losses) of the network.

        Args:
            losses (dict): Raw output of the network, which usually contain
                losses and other necessary information.

        Returns:
            tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \
                which may be a weighted sum of all losses, log_vars contains \
                all the variables to be sent to the logger.
        """
        log_vars = OrderedDict()
        for loss_name, loss_value in losses.items():
            if isinstance(loss_value, torch.Tensor):
                log_vars[loss_name] = loss_value.mean()
            elif isinstance(loss_value, list):
                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
            else:
                raise TypeError(
                    f'{loss_name} is not a tensor or list of tensors')

        loss = sum(_value for _key, _value in log_vars.items()
                   if 'loss' in _key)

        log_vars['loss'] = loss
        for loss_name, loss_value in log_vars.items():
            # reduce loss when distributed training
            if dist.is_available() and dist.is_initialized():
                loss_value = loss_value.data.clone()
                dist.all_reduce(loss_value.div_(dist.get_world_size()))
            log_vars[loss_name] = loss_value.item()

        return loss, log_vars

    def train_step(self, data, optimizer):
        """The iteration step during training.

        This method defines an iteration step during training, except for the
        back propagation and optimizer updating, which are done in an optimizer
        hook. Note that in some complicated cases or models, the whole process
        including back propagation and optimizer updating is also defined in
        this method, such as GAN.

        Args:
            data (dict): The output of dataloader.
            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
                runner is passed to ``train_step()``. This argument is unused
                and reserved.

        Returns:
            dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \
                ``num_samples``.

                - ``loss`` is a tensor for back propagation, which can be a \
                  weighted sum of multiple losses.
                - ``log_vars`` contains all the variables to be sent to the
                  logger.
                - ``num_samples`` indicates the batch size (when the model is \
                  DDP, it means the batch size on each GPU), which is used for \
                  averaging the logs.
        """
        losses = self(**data)
        loss, log_vars = self._parse_losses(losses)

        outputs = dict(
            loss=loss, log_vars=log_vars, num_samples=len(data['img_metas']))

        return outputs

    def val_step(self, data, optimizer):
        """The iteration step during validation.

        This method shares the same signature as :func:`train_step`, but used
        during val epochs. Note that the evaluation after training epochs is
        not implemented with this method, but an evaluation hook.
        """
        losses = self(**data)
        loss, log_vars = self._parse_losses(losses)

        outputs = dict(
            loss=loss, log_vars=log_vars, num_samples=len(data['img_metas']))

        return outputs

    def show_result(self,
                    img,
                    result,
                    score_thr=0.3,
                    bbox_color=(72, 101, 241),
                    text_color=(72, 101, 241),
                    mask_color=None,
                    thickness=2,
                    font_scale=0.5,
                    font_size=13,
                    win_name='',
                    fig_size=(15, 10),
                    show=False,
                    wait_time=0,
                    out_file=None):
        """Draw `result` over `img`.

        Args:
            img (str or Tensor): The image to be displayed.
            result (Tensor or tuple): The results to draw over `img`,
                bbox_result or (bbox_result, segm_result).
            score_thr (float, optional): Minimum score of bboxes to be shown.
                Default: 0.3.
            bbox_color (str or tuple(int) or :obj:`Color`): Color of bbox
                lines. The tuple of color should be in BGR order.
                Default: (72, 101, 241).
            text_color (str or tuple(int) or :obj:`Color`): Color of texts.
                The tuple of color should be in BGR order.
                Default: (72, 101, 241).
            mask_color (None or str or tuple(int) or :obj:`Color`):
                Color of masks. The tuple of color should be in BGR order.
                Default: None.
            thickness (int): Thickness of lines. Default: 2.
            font_scale (float): Font scale of texts. Default: 0.5.
            font_size (int): Font size of texts. Default: 13.
            win_name (str): The window name. Default: ''.
            fig_size (tuple): Figure size of the pyplot figure.
                Default: (15, 10).
            wait_time (float): Value of waitKey param.
                Default: 0.
            show (bool): Whether to show the image.
                Default: False.
            out_file (str or None): The filename to write the image.
                Default: None.

        Returns:
            img (Tensor): Only if not `show` or `out_file`.
        """
        img = mmcv.imread(img)
        img = img.copy()
        if isinstance(result, tuple):
            bbox_result, segm_result = result
            if isinstance(segm_result, tuple):
                segm_result = segm_result[0]  # ms rcnn
        else:
            bbox_result, segm_result = result, None
        bboxes = np.vstack(bbox_result)
        labels = [
            np.full(bbox.shape[0], i, dtype=np.int32)
            for i, bbox in enumerate(bbox_result)
        ]
        labels = np.concatenate(labels)
        # draw segmentation masks
        segms = None
        if segm_result is not None and len(labels) > 0:  # non empty
            segms = mmcv.concat_list(segm_result)
            if isinstance(segms[0], torch.Tensor):
                segms = torch.stack(segms, dim=0).detach().cpu().numpy()
            else:
                segms = np.stack(segms, axis=0)
        # if out_file specified, do not show image in window
        if out_file is not None:
            show = False
        # draw bounding boxes
        imshow_det_bboxes(
            img,
            bboxes,
            labels,
            segms,
            class_names=self.CLASSES,
            score_thr=score_thr,
            bbox_color=bbox_color,
            text_color=text_color,
            mask_color=mask_color,
            thickness=thickness,
            font_scale=font_scale,
            font_size=font_size,
            win_name=win_name,
            fig_size=fig_size,
            show=show,
            wait_time=wait_time,
            out_file=out_file)

        if not (show or out_file):
            return img
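A small illustration of what `_parse_losses` does with a typical loss dict. The loss values are made up; in single-process runs the `dist` reduction branch above is skipped:

import torch
from collections import OrderedDict

losses = dict(
    loss_cls=torch.tensor(0.8),
    loss_bbox=[torch.tensor(0.3), torch.tensor(0.1)],  # multi-level losses
    acc=torch.tensor(92.0),  # logged but not summed: key lacks 'loss'
)

log_vars = OrderedDict()
for name, value in losses.items():
    log_vars[name] = value.mean() if isinstance(value, torch.Tensor) \
        else sum(v.mean() for v in value)

# Only keys containing 'loss' contribute to the total used for backprop.
total = sum(v for k, v in log_vars.items() if 'loss' in k)
print(total)  # tensor(1.2000)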
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/cascade_rcnn.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .two_stage import TwoStageDetector


@DETECTORS.register_module()
class CascadeRCNN(TwoStageDetector):
    r"""Implementation of `Cascade R-CNN: Delving into High Quality Object
    Detection <https://arxiv.org/abs/1906.09756>`_"""

    def __init__(self,
                 backbone,
                 neck=None,
                 rpn_head=None,
                 roi_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(CascadeRCNN, self).__init__(
            backbone=backbone,
            neck=neck,
            rpn_head=rpn_head,
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)

    def show_result(self, data, result, **kwargs):
        """Show prediction results of the detector."""
        if self.with_mask:
            ms_bbox_result, ms_segm_result = result
            if isinstance(ms_bbox_result, dict):
                result = (ms_bbox_result['ensemble'],
                          ms_segm_result['ensemble'])
        else:
            if isinstance(result, dict):
                result = result['ensemble']
        return super(CascadeRCNN, self).show_result(data, result, **kwargs)
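The override above only unwraps multi-stage outputs: when the cascade head returns results keyed by stage, the 'ensemble' entry is selected before delegating to `BaseDetector.show_result`. A minimal usage sketch, where the image paths are placeholders and `model` is assumed to be a built, loaded CascadeRCNN instance:

from mmdet.apis import inference_detector

# 'demo.jpg' and 'demo_out.jpg' are hypothetical file names.
result = inference_detector(model, 'demo.jpg')
model.show_result('demo.jpg', result, score_thr=0.3, out_file='demo_out.jpg')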
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/cornernet.py
0 → 100644
View file @ 142dcf29
import torch

from mmdet.core import bbox2result, bbox_mapping_back
from ..builder import DETECTORS
from .single_stage import SingleStageDetector


@DETECTORS.register_module()
class CornerNet(SingleStageDetector):
    """CornerNet.

    This detector is the implementation of the paper `CornerNet: Detecting
    Objects as Paired Keypoints <https://arxiv.org/abs/1808.01244>`_ .
    """

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(CornerNet, self).__init__(backbone, neck, bbox_head, train_cfg,
                                        test_cfg, pretrained)

    def merge_aug_results(self, aug_results, img_metas):
        """Merge augmented detection bboxes and score.

        Args:
            aug_results (list[list[Tensor]]): Det_bboxes and det_labels of
                each image.
            img_metas (list[list[dict]]): Meta information of each image,
                e.g., image size, scaling factor, etc.

        Returns:
            tuple: (bboxes, labels)
        """
        recovered_bboxes, aug_labels = [], []
        for bboxes_labels, img_info in zip(aug_results, img_metas):
            img_shape = img_info[0]['img_shape']  # using shape before padding
            scale_factor = img_info[0]['scale_factor']
            flip = img_info[0]['flip']
            bboxes, labels = bboxes_labels
            bboxes, scores = bboxes[:, :4], bboxes[:, -1:]
            bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip)
            recovered_bboxes.append(torch.cat([bboxes, scores], dim=-1))
            aug_labels.append(labels)

        bboxes = torch.cat(recovered_bboxes, dim=0)
        labels = torch.cat(aug_labels)

        if bboxes.shape[0] > 0:
            out_bboxes, out_labels = self.bbox_head._bboxes_nms(
                bboxes, labels, self.bbox_head.test_cfg)
        else:
            out_bboxes, out_labels = bboxes, labels

        return out_bboxes, out_labels

    def aug_test(self, imgs, img_metas, rescale=False):
        """Augment testing of CornerNet.

        Args:
            imgs (list[Tensor]): Augmented images.
            img_metas (list[list[dict]]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            rescale (bool): If True, return boxes in original image space.
                Default: False.

        Note:
            ``imgs`` must include flipped image pairs.

        Returns:
            list[list[np.ndarray]]: BBox results of each image and classes.
                The outer list corresponds to each image. The inner list
                corresponds to each class.
        """
        img_inds = list(range(len(imgs)))

        assert img_metas[0][0]['flip'] + img_metas[1][0]['flip'], (
            'aug test must have flipped image pair')
        aug_results = []
        for ind, flip_ind in zip(img_inds[0::2], img_inds[1::2]):
            img_pair = torch.cat([imgs[ind], imgs[flip_ind]])
            x = self.extract_feat(img_pair)
            outs = self.bbox_head(x)
            bbox_list = self.bbox_head.get_bboxes(
                *outs, [img_metas[ind], img_metas[flip_ind]], False, False)

            aug_results.append(bbox_list[0])
            aug_results.append(bbox_list[1])

        bboxes, labels = self.merge_aug_results(aug_results, img_metas)
        bbox_results = bbox2result(bboxes, labels, self.bbox_head.num_classes)

        return [bbox_results]
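`bbox_mapping_back` undoes the test-time transforms so that boxes from all augmentations live in the same coordinate frame before NMS. A minimal standalone sketch of the horizontal-flip part of that mapping (scaling omitted), under the usual [x1, y1, x2, y2] box convention:

import torch

def flip_boxes_back(bboxes, img_width):
    """Map [x1, y1, x2, y2] boxes detected on a horizontally flipped
    image back to the original image frame."""
    unflipped = bboxes.clone()
    unflipped[:, 0] = img_width - bboxes[:, 2]  # new x1 from old x2
    unflipped[:, 2] = img_width - bboxes[:, 0]  # new x2 from old x1
    return unflipped

boxes = torch.tensor([[10.0, 20.0, 50.0, 80.0]])
print(flip_boxes_back(boxes, img_width=100))
# tensor([[50., 20., 90., 80.]])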
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/detr.py
0 → 100644
View file @ 142dcf29
from mmdet.core import bbox2result
from ..builder import DETECTORS
from .single_stage import SingleStageDetector


@DETECTORS.register_module()
class DETR(SingleStageDetector):
    r"""Implementation of `DETR: End-to-End Object Detection with
    Transformers <https://arxiv.org/pdf/2005.12872>`_"""

    def __init__(self,
                 backbone,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(DETR, self).__init__(backbone, None, bbox_head, train_cfg,
                                   test_cfg, pretrained)

    def simple_test(self, img, img_metas, rescale=False):
        """Test function without test time augmentation.

        Args:
            img (Tensor): Input images of shape (N, C, H, W).
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[list[np.ndarray]]: BBox results of each image and classes.
                The outer list corresponds to each image. The inner list
                corresponds to each class.
        """
        batch_size = len(img_metas)
        assert batch_size == 1, 'Currently only batch_size 1 for inference ' \
            f'mode is supported. Found batch_size {batch_size}.'
        x = self.extract_feat(img)
        outs = self.bbox_head(x, img_metas)
        bbox_list = self.bbox_head.get_bboxes(
            *outs, img_metas, rescale=rescale)

        bbox_results = [
            bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
            for det_bboxes, det_labels in bbox_list
        ]
        return bbox_results
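`bbox2result` converts detections from a flat (N, 5) tensor plus per-box labels into the per-class list format the evaluation code expects. A self-contained sketch of the same grouping in plain numpy (the helper name here is ours, not mmdet's):

import numpy as np

def to_per_class_lists(bboxes, labels, num_classes):
    """Split (N, 5) [x1, y1, x2, y2, score] detections into one
    (n_i, 5) array per class, mirroring what bbox2result returns."""
    return [bboxes[labels == i, :] for i in range(num_classes)]

bboxes = np.array([[0, 0, 10, 10, 0.9],
                   [5, 5, 20, 20, 0.7]], dtype=np.float32)
labels = np.array([0, 2])
result = to_per_class_lists(bboxes, labels, num_classes=3)
print([r.shape for r in result])  # [(1, 5), (0, 5), (1, 5)]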
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fast_rcnn.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .two_stage import TwoStageDetector


@DETECTORS.register_module()
class FastRCNN(TwoStageDetector):
    """Implementation of `Fast R-CNN <https://arxiv.org/abs/1504.08083>`_"""

    def __init__(self,
                 backbone,
                 roi_head,
                 train_cfg,
                 test_cfg,
                 neck=None,
                 pretrained=None):
        super(FastRCNN, self).__init__(
            backbone=backbone,
            neck=neck,
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)

    def forward_test(self, imgs, img_metas, proposals, **kwargs):
        """
        Args:
            imgs (List[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
            img_metas (List[List[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch.
            proposals (List[List[Tensor]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch. The Tensor should have a shape Px4, where
                P is the number of proposals.
        """
        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError(f'{name} must be a list, but got {type(var)}')

        num_augs = len(imgs)
        if num_augs != len(img_metas):
            raise ValueError(f'num of augmentations ({len(imgs)}) '
                             f'!= num of image meta ({len(img_metas)})')

        if num_augs == 1:
            return self.simple_test(imgs[0], img_metas[0], proposals[0],
                                    **kwargs)
        else:
            # TODO: support test-time augmentation
            # NOTE: this was `assert NotImplementedError`, which always
            # passes silently; raising is the intended behavior.
            raise NotImplementedError
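Unlike FasterRCNN, FastRCNN consumes precomputed proposals, so `forward_test` takes a doubly nested `proposals` argument matching the `imgs`/`img_metas` nesting. A sketch of how such an input could be assembled for a single, un-augmented image (shapes and values are illustrative only):

import torch

# One test-time augmentation (outer list), one image in the batch
# (inner list); each proposal tensor is (P, 4) in [x1, y1, x2, y2].
proposals = [[torch.tensor([[0., 0., 100., 100.],
                            [50., 50., 200., 200.]])]]
imgs = [torch.randn(1, 3, 224, 224)]  # NxCxHxW, illustrative size
img_metas = [[dict(img_shape=(224, 224, 3),
                   scale_factor=1.0, flip=False)]]
# results = model.forward_test(imgs, img_metas, proposals)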
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/faster_rcnn.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .two_stage import TwoStageDetector


@DETECTORS.register_module()
class FasterRCNN(TwoStageDetector):
    """Implementation of `Faster R-CNN <https://arxiv.org/abs/1506.01497>`_"""

    def __init__(self,
                 backbone,
                 rpn_head,
                 roi_head,
                 train_cfg,
                 test_cfg,
                 neck=None,
                 pretrained=None):
        super(FasterRCNN, self).__init__(
            backbone=backbone,
            neck=neck,
            rpn_head=rpn_head,
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fcos.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .single_stage import SingleStageDetector


@DETECTORS.register_module()
class FCOS(SingleStageDetector):
    """Implementation of `FCOS <https://arxiv.org/abs/1904.01355>`_"""

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(FCOS, self).__init__(backbone, neck, bbox_head, train_cfg,
                                   test_cfg, pretrained)
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fovea.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .single_stage import SingleStageDetector


@DETECTORS.register_module()
class FOVEA(SingleStageDetector):
    """Implementation of `FoveaBox <https://arxiv.org/abs/1904.03797>`_"""

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(FOVEA, self).__init__(backbone, neck, bbox_head, train_cfg,
                                    test_cfg, pretrained)
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/fsaf.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .single_stage import SingleStageDetector


@DETECTORS.register_module()
class FSAF(SingleStageDetector):
    """Implementation of `FSAF <https://arxiv.org/abs/1903.00621>`_"""

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(FSAF, self).__init__(backbone, neck, bbox_head, train_cfg,
                                   test_cfg, pretrained)
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/gfl.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .single_stage import SingleStageDetector


@DETECTORS.register_module()
class GFL(SingleStageDetector):

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(GFL, self).__init__(backbone, neck, bbox_head, train_cfg,
                                  test_cfg, pretrained)
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/grid_rcnn.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .two_stage import TwoStageDetector


@DETECTORS.register_module()
class GridRCNN(TwoStageDetector):
    """Grid R-CNN.

    This detector is the implementation of:
    - Grid R-CNN (https://arxiv.org/abs/1811.12030)
    - Grid R-CNN Plus: Faster and Better (https://arxiv.org/abs/1906.05688)
    """

    def __init__(self,
                 backbone,
                 rpn_head,
                 roi_head,
                 train_cfg,
                 test_cfg,
                 neck=None,
                 pretrained=None):
        super(GridRCNN, self).__init__(
            backbone=backbone,
            neck=neck,
            rpn_head=rpn_head,
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/htc.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .cascade_rcnn import CascadeRCNN


@DETECTORS.register_module()
class HybridTaskCascade(CascadeRCNN):
    """Implementation of `HTC <https://arxiv.org/abs/1901.07518>`_"""

    def __init__(self, **kwargs):
        super(HybridTaskCascade, self).__init__(**kwargs)

    @property
    def with_semantic(self):
        """bool: whether the detector has a semantic head"""
        return self.roi_head.with_semantic
PyTorch/NLP/Conformer-main/mmdetection/mmdet/models/detectors/mask_rcnn.py
0 → 100644
View file @ 142dcf29
from ..builder import DETECTORS
from .two_stage import TwoStageDetector


@DETECTORS.register_module()
class MaskRCNN(TwoStageDetector):
    """Implementation of `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_"""

    def __init__(self,
                 backbone,
                 rpn_head,
                 roi_head,
                 train_cfg,
                 test_cfg,
                 neck=None,
                 pretrained=None):
        super(MaskRCNN, self).__init__(
            backbone=backbone,
            neck=neck,
            rpn_head=rpn_head,
            roi_head=roi_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            pretrained=pretrained)
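Most of the detector files above are thin wrappers: they only pin down the constructor signature and register the class so that configs can refer to it by name via `type='...'`. A hedged sketch of the same pattern for a hypothetical custom detector:

from mmdet.models.builder import DETECTORS
from mmdet.models.detectors.single_stage import SingleStageDetector


@DETECTORS.register_module()
class MyDetector(SingleStageDetector):  # hypothetical example class
    """A custom single-stage detector, usable from a config as
    dict(type='MyDetector', backbone=..., neck=..., bbox_head=...)."""

    def __init__(self,
                 backbone,
                 neck,
                 bbox_head,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None):
        super(MyDetector, self).__init__(backbone, neck, bbox_head,
                                         train_cfg, test_cfg, pretrained)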