Commit a4372e17 authored by huchen's avatar huchen
Browse files

Merge branch 'hepj_work' into 'main'

增加conformer代码

See merge request dcutoolkit/deeplearing/dlexamples_new!7
parents 7f99c1c3 142dcf29
from inspect import signature
import torch
from mmdet.core import bbox2result, bbox_mapping_back, multiclass_nms
class BBoxTestMixin(object):
"""Mixin class for test time augmentation of bboxes."""
def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas):
"""Merge augmented detection bboxes and scores.
Args:
aug_bboxes (list[Tensor]): shape (n, 4*#class)
aug_scores (list[Tensor] or None): shape (n, #class)
img_shapes (list[Tensor]): shape (3, ).
Returns:
tuple: (bboxes, scores)
"""
recovered_bboxes = []
for bboxes, img_info in zip(aug_bboxes, img_metas):
img_shape = img_info[0]['img_shape']
scale_factor = img_info[0]['scale_factor']
flip = img_info[0]['flip']
flip_direction = img_info[0]['flip_direction']
bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip,
flip_direction)
recovered_bboxes.append(bboxes)
bboxes = torch.cat(recovered_bboxes, dim=0)
if aug_scores is None:
return bboxes
else:
scores = torch.cat(aug_scores, dim=0)
return bboxes, scores
def aug_test_bboxes(self, feats, img_metas, rescale=False):
"""Test det bboxes with test time augmentation.
Args:
feats (list[Tensor]): the outer list indicates test-time
augmentations and inner Tensor should have a shape NxCxHxW,
which contains features for all images in the batch.
img_metas (list[list[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch. each dict has image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[ndarray]: bbox results of each class
"""
# check with_nms argument
gb_sig = signature(self.get_bboxes)
gb_args = [p.name for p in gb_sig.parameters.values()]
gbs_sig = signature(self._get_bboxes_single)
gbs_args = [p.name for p in gbs_sig.parameters.values()]
assert ('with_nms' in gb_args) and ('with_nms' in gbs_args), \
f'{self.__class__.__name__}' \
' does not support test-time augmentation'
aug_bboxes = []
aug_scores = []
aug_factors = [] # score_factors for NMS
for x, img_meta in zip(feats, img_metas):
# only one image in the batch
outs = self.forward(x)
bbox_inputs = outs + (img_meta, self.test_cfg, False, False)
bbox_outputs = self.get_bboxes(*bbox_inputs)[0]
aug_bboxes.append(bbox_outputs[0])
aug_scores.append(bbox_outputs[1])
# bbox_outputs of some detectors (e.g., ATSS, FCOS, YOLOv3)
# contains additional element to adjust scores before NMS
if len(bbox_outputs) >= 3:
aug_factors.append(bbox_outputs[2])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes, merged_scores = self.merge_aug_bboxes(
aug_bboxes, aug_scores, img_metas)
merged_factors = torch.cat(aug_factors, dim=0) if aug_factors else None
det_bboxes, det_labels = multiclass_nms(
merged_bboxes,
merged_scores,
self.test_cfg.score_thr,
self.test_cfg.nms,
self.test_cfg.max_per_img,
score_factors=merged_factors)
if rescale:
_det_bboxes = det_bboxes
else:
_det_bboxes = det_bboxes.clone()
_det_bboxes[:, :4] *= det_bboxes.new_tensor(
img_metas[0][0]['scale_factor'])
bbox_results = bbox2result(_det_bboxes, det_labels, self.num_classes)
return bbox_results
import mmcv
import torch
import torch.nn as nn
from mmcv import tensor2imgs
from mmdet.models.builder import HEADS
from ...core import bbox_cxcywh_to_xyxy
@HEADS.register_module()
class EmbeddingRPNHead(nn.Module):
"""RPNHead in the `Sparse R-CNN <https://arxiv.org/abs/2011.12450>`_ .
Unlike traditional RPNHead, this module does not need FPN input, but just
decode `init_proposal_bboxes` and expand the first dimension of
`init_proposal_bboxes` and `init_proposal_features` to the batch_size.
Args:
num_proposals (int): Number of init_proposals. Default 100.
proposal_feature_channel (int): Channel number of
init_proposal_feature. Defaults to 256.
"""
def __init__(self,
num_proposals=100,
proposal_feature_channel=256,
**kwargs):
super(EmbeddingRPNHead, self).__init__()
self.num_proposals = num_proposals
self.proposal_feature_channel = proposal_feature_channel
self._init_layers()
def _init_layers(self):
"""Initialize a sparse set of proposal boxes and proposal features."""
self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4)
self.init_proposal_features = nn.Embedding(
self.num_proposals, self.proposal_feature_channel)
def init_weights(self):
"""Initialize the init_proposal_bboxes as normalized.
[c_x, c_y, w, h], and we initialize it to the size of the entire
image.
"""
nn.init.constant_(self.init_proposal_bboxes.weight[:, :2], 0.5)
nn.init.constant_(self.init_proposal_bboxes.weight[:, 2:], 1)
def _decode_init_proposals(self, imgs, img_metas):
"""Decode init_proposal_bboxes according to the size of images and
expand dimension of init_proposal_features to batch_size.
Args:
imgs (list[Tensor]): List of FPN features.
img_metas (list[dict]): List of meta-information of
images. Need the img_shape to decode the init_proposals.
Returns:
Tuple(Tensor):
- proposals (Tensor): Decoded proposal bboxes,
has shape (batch_size, num_proposals, 4).
- init_proposal_features (Tensor): Expanded proposal
features, has shape
(batch_size, num_proposals, proposal_feature_channel).
- imgs_whwh (Tensor): Tensor with shape
(batch_size, 4), the dimension means
[img_width, img_height, img_width, img_height].
"""
proposals = self.init_proposal_bboxes.weight.clone()
proposals = bbox_cxcywh_to_xyxy(proposals)
num_imgs = len(imgs[0])
imgs_whwh = []
for meta in img_metas:
h, w, _ = meta['img_shape']
imgs_whwh.append(imgs[0].new_tensor([[w, h, w, h]]))
imgs_whwh = torch.cat(imgs_whwh, dim=0)
imgs_whwh = imgs_whwh[:, None, :]
# imgs_whwh has shape (batch_size, 1, 4)
# The shape of proposals change from (num_proposals, 4)
# to (batch_size ,num_proposals, 4)
proposals = proposals * imgs_whwh
init_proposal_features = self.init_proposal_features.weight.clone()
init_proposal_features = init_proposal_features[None].expand(
num_imgs, *init_proposal_features.size())
return proposals, init_proposal_features, imgs_whwh
def forward_dummy(self, img, img_metas):
"""Dummy forward function.
Used in flops calculation.
"""
return self._decode_init_proposals(img, img_metas)
def forward_train(self, img, img_metas):
"""Forward function in training stage."""
return self._decode_init_proposals(img, img_metas)
def simple_test_rpn(self, img, img_metas):
"""Forward function in testing stage."""
return self._decode_init_proposals(img, img_metas)
def show_result(self, data):
"""Show the init proposals in EmbeddingRPN.
Args:
data (dict): Dict contains image and
corresponding meta information.
"""
img_tensor = data['img'][0]
img_metas = data['img_metas'][0].data[0]
imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg'])
proposals, _ = self._decode_init_proposals(data['img'],
data['img_metas'])
assert len(imgs) == len(img_metas)
for img, img_meta in zip(imgs, img_metas):
h, w, _ = img_meta['img_shape']
img_show = img[:h, :w, :]
mmcv.imshow_bboxes(img_show, proposals)
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Scale, normal_init
from mmcv.runner import force_fp32
from mmdet.core import distance2bbox, multi_apply, multiclass_nms, reduce_mean
from ..builder import HEADS, build_loss
from .anchor_free_head import AnchorFreeHead
INF = 1e8
@HEADS.register_module()
class FCOSHead(AnchorFreeHead):
"""Anchor-free head used in `FCOS <https://arxiv.org/abs/1904.01355>`_.
The FCOS head does not use anchor boxes. Instead bounding boxes are
predicted at each pixel and a centerness measure is used to supress
low-quality predictions.
Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training
tricks used in official repo, which will bring remarkable mAP gains
of up to 4.9. Please see https://github.com/tianzhi0549/FCOS for
more detail.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
strides (list[int] | list[tuple[int, int]]): Strides of points
in multiple feature levels. Default: (4, 8, 16, 32, 64).
regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
level points.
center_sampling (bool): If true, use center sampling. Default: False.
center_sample_radius (float): Radius of center sampling. Default: 1.5.
norm_on_bbox (bool): If true, normalize the regression targets
with FPN strides. Default: False.
centerness_on_reg (bool): If true, position centerness on the
regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
Default: False.
conv_bias (bool | str): If specified as `auto`, it will be decided by the
norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise
False. Default: "auto".
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_centerness (dict): Config of centerness loss.
norm_cfg (dict): dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
Example:
>>> self = FCOSHead(11, 7)
>>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
>>> cls_score, bbox_pred, centerness = self.forward(feats)
>>> assert len(cls_score) == len(self.scales)
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
(512, INF)),
center_sampling=False,
center_sample_radius=1.5,
norm_on_bbox=False,
centerness_on_reg=False,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='IoULoss', loss_weight=1.0),
loss_centerness=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
**kwargs):
self.regress_ranges = regress_ranges
self.center_sampling = center_sampling
self.center_sample_radius = center_sample_radius
self.norm_on_bbox = norm_on_bbox
self.centerness_on_reg = centerness_on_reg
super().__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
norm_cfg=norm_cfg,
**kwargs)
self.loss_centerness = build_loss(loss_centerness)
def _init_layers(self):
"""Initialize layers of the head."""
super()._init_layers()
self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
def init_weights(self):
"""Initialize weights of the head."""
super().init_weights()
normal_init(self.conv_centerness, std=0.01)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Box scores for each scale level, \
each is a 4D-tensor, the channel number is \
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each \
scale level, each is a 4D-tensor, the channel number is \
num_points * 4.
centernesses (list[Tensor]): Centerss for each scale level, \
each is a 4D-tensor, the channel number is num_points * 1.
"""
return multi_apply(self.forward_single, feats, self.scales,
self.strides)
def forward_single(self, x, scale, stride):
"""Forward features of a single scale levle.
Args:
x (Tensor): FPN feature maps of the specified stride.
scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
the bbox prediction.
stride (int): The corresponding stride for feature maps, only
used to normalize the bbox prediction when self.norm_on_bbox
is True.
Returns:
tuple: scores for each class, bbox predictions and centerness \
predictions of input feature maps.
"""
cls_score, bbox_pred, cls_feat, reg_feat = super().forward_single(x)
if self.centerness_on_reg:
centerness = self.conv_centerness(reg_feat)
else:
centerness = self.conv_centerness(cls_feat)
# scale the bbox_pred of different level
# float to avoid overflow when enabling FP16
bbox_pred = scale(bbox_pred).float()
if self.norm_on_bbox:
bbox_pred = F.relu(bbox_pred)
if not self.training:
bbox_pred *= stride
else:
bbox_pred = bbox_pred.exp()
return cls_score, bbox_pred, centerness
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses'))
def loss(self,
cls_scores,
bbox_preds,
centernesses,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level,
each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level, each is a 4D-tensor, the channel number is
num_points * 4.
centernesses (list[Tensor]): Centerss for each scale level, each
is a 4D-tensor, the channel number is num_points * 1.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == len(centernesses)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
labels, bbox_targets = self.get_targets(all_level_points, gt_bboxes,
gt_labels)
num_imgs = cls_scores[0].size(0)
# flatten cls_scores, bbox_preds and centerness
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
for cls_score in cls_scores
]
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
for bbox_pred in bbox_preds
]
flatten_centerness = [
centerness.permute(0, 2, 3, 1).reshape(-1)
for centerness in centernesses
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_centerness = torch.cat(flatten_centerness)
flatten_labels = torch.cat(labels)
flatten_bbox_targets = torch.cat(bbox_targets)
# repeat points to align with bbox_preds
flatten_points = torch.cat(
[points.repeat(num_imgs, 1) for points in all_level_points])
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
bg_class_ind = self.num_classes
pos_inds = ((flatten_labels >= 0)
& (flatten_labels < bg_class_ind)).nonzero().reshape(-1)
num_pos = torch.tensor(
len(pos_inds), dtype=torch.float, device=bbox_preds[0].device)
num_pos = max(reduce_mean(num_pos), 1.0)
loss_cls = self.loss_cls(
flatten_cls_scores, flatten_labels, avg_factor=num_pos)
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_centerness = flatten_centerness[pos_inds]
if len(pos_inds) > 0:
pos_bbox_targets = flatten_bbox_targets[pos_inds]
pos_centerness_targets = self.centerness_target(pos_bbox_targets)
pos_points = flatten_points[pos_inds]
pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
pos_decoded_target_preds = distance2bbox(pos_points,
pos_bbox_targets)
# centerness weighted iou loss
centerness_denorm = max(
reduce_mean(pos_centerness_targets.sum().detach()), 1e-6)
loss_bbox = self.loss_bbox(
pos_decoded_bbox_preds,
pos_decoded_target_preds,
weight=pos_centerness_targets,
avg_factor=centerness_denorm)
loss_centerness = self.loss_centerness(
pos_centerness, pos_centerness_targets, avg_factor=num_pos)
else:
loss_bbox = pos_bbox_preds.sum()
loss_centerness = pos_centerness.sum()
return dict(
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_centerness=loss_centerness)
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'centernesses'))
def get_bboxes(self,
cls_scores,
bbox_preds,
centernesses,
img_metas,
cfg=None,
rescale=False,
with_nms=True):
"""Transform network output for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
with shape (N, num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * 4, H, W).
centernesses (list[Tensor]): Centerness for each scale level with
shape (N, num_points * 1, H, W).
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used. Default: None.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before return boxes.
Default: True.
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
The first item is an (n, 5) tensor, where the first 4 columns
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
5-th column is a score between 0 and 1. The second item is a
(n,) tensor where each item is the predicted class label of the
corresponding box.
"""
assert len(cls_scores) == len(bbox_preds)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
centerness_pred_list = [
centernesses[i][img_id].detach() for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
det_bboxes = self._get_bboxes_single(
cls_score_list, bbox_pred_list, centerness_pred_list,
mlvl_points, img_shape, scale_factor, cfg, rescale, with_nms)
result_list.append(det_bboxes)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
centernesses,
mlvl_points,
img_shape,
scale_factor,
cfg,
rescale=False,
with_nms=True):
"""Transform outputs for a single batch item into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for a single scale level
with shape (num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for a single scale
level with shape (num_points * 4, H, W).
centernesses (list[Tensor]): Centerness for a single scale level
with shape (num_points * 4, H, W).
mlvl_points (list[Tensor]): Box reference for a single scale level
with shape (num_total_points, 4).
img_shape (tuple[int]): Shape of the input image,
(height, width, 3).
scale_factor (ndarray): Scale factor of the image arrange as
(w_scale, h_scale, w_scale, h_scale).
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before return boxes.
Default: True.
Returns:
tuple(Tensor):
det_bboxes (Tensor): BBox predictions in shape (n, 5), where
the first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column is a score
between 0 and 1.
det_labels (Tensor): A (n,) tensor where each item is the
predicted class label of the corresponding box.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
mlvl_bboxes = []
mlvl_scores = []
mlvl_centerness = []
for cls_score, bbox_pred, centerness, points in zip(
cls_scores, bbox_preds, centernesses, mlvl_points):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
max_scores, _ = (scores * centerness[:, None]).max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
points = points[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
centerness = centerness[topk_inds]
bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_centerness.append(centerness)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
mlvl_centerness = torch.cat(mlvl_centerness)
if with_nms:
det_bboxes, det_labels = multiclass_nms(
mlvl_bboxes,
mlvl_scores,
cfg.score_thr,
cfg.nms,
cfg.max_per_img,
score_factors=mlvl_centerness)
return det_bboxes, det_labels
else:
return mlvl_bboxes, mlvl_scores, mlvl_centerness
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points according to feature map sizes."""
y, x = super()._get_points_single(featmap_size, stride, dtype, device)
points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride),
dim=-1) + stride // 2
return points
def get_targets(self, points, gt_bboxes_list, gt_labels_list):
"""Compute regression, classification and centerss targets for points
in multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
Returns:
tuple:
concat_lvl_labels (list[Tensor]): Labels of each level. \
concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
level.
"""
assert len(points) == len(self.regress_ranges)
num_levels = len(points)
# expand regress ranges to align with points
expanded_regress_ranges = [
points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
points[i]) for i in range(num_levels)
]
# concat all levels points and regress ranges
concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
concat_points = torch.cat(points, dim=0)
# the number of points per img, per lvl
num_points = [center.size(0) for center in points]
# get labels and bbox_targets of each image
labels_list, bbox_targets_list = multi_apply(
self._get_target_single,
gt_bboxes_list,
gt_labels_list,
points=concat_points,
regress_ranges=concat_regress_ranges,
num_points_per_lvl=num_points)
# split to per img, per level
labels_list = [labels.split(num_points, 0) for labels in labels_list]
bbox_targets_list = [
bbox_targets.split(num_points, 0)
for bbox_targets in bbox_targets_list
]
# concat per level image
concat_lvl_labels = []
concat_lvl_bbox_targets = []
for i in range(num_levels):
concat_lvl_labels.append(
torch.cat([labels[i] for labels in labels_list]))
bbox_targets = torch.cat(
[bbox_targets[i] for bbox_targets in bbox_targets_list])
if self.norm_on_bbox:
bbox_targets = bbox_targets / self.strides[i]
concat_lvl_bbox_targets.append(bbox_targets)
return concat_lvl_labels, concat_lvl_bbox_targets
def _get_target_single(self, gt_bboxes, gt_labels, points, regress_ranges,
num_points_per_lvl):
"""Compute regression and classification targets for a single image."""
num_points = points.size(0)
num_gts = gt_labels.size(0)
if num_gts == 0:
return gt_labels.new_full((num_points,), self.num_classes), \
gt_bboxes.new_zeros((num_points, 4))
areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
gt_bboxes[:, 3] - gt_bboxes[:, 1])
# TODO: figure out why these two are different
# areas = areas[None].expand(num_points, num_gts)
areas = areas[None].repeat(num_points, 1)
regress_ranges = regress_ranges[:, None, :].expand(
num_points, num_gts, 2)
gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
xs, ys = points[:, 0], points[:, 1]
xs = xs[:, None].expand(num_points, num_gts)
ys = ys[:, None].expand(num_points, num_gts)
left = xs - gt_bboxes[..., 0]
right = gt_bboxes[..., 2] - xs
top = ys - gt_bboxes[..., 1]
bottom = gt_bboxes[..., 3] - ys
bbox_targets = torch.stack((left, top, right, bottom), -1)
if self.center_sampling:
# condition1: inside a `center bbox`
radius = self.center_sample_radius
center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2
center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2
center_gts = torch.zeros_like(gt_bboxes)
stride = center_xs.new_zeros(center_xs.shape)
# project the points on current lvl back to the `original` sizes
lvl_begin = 0
for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl):
lvl_end = lvl_begin + num_points_lvl
stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius
lvl_begin = lvl_end
x_mins = center_xs - stride
y_mins = center_ys - stride
x_maxs = center_xs + stride
y_maxs = center_ys + stride
center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0],
x_mins, gt_bboxes[..., 0])
center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1],
y_mins, gt_bboxes[..., 1])
center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2],
gt_bboxes[..., 2], x_maxs)
center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3],
gt_bboxes[..., 3], y_maxs)
cb_dist_left = xs - center_gts[..., 0]
cb_dist_right = center_gts[..., 2] - xs
cb_dist_top = ys - center_gts[..., 1]
cb_dist_bottom = center_gts[..., 3] - ys
center_bbox = torch.stack(
(cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1)
inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
else:
# condition1: inside a gt bbox
inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
# condition2: limit the regression range for each location
max_regress_distance = bbox_targets.max(-1)[0]
inside_regress_range = (
(max_regress_distance >= regress_ranges[..., 0])
& (max_regress_distance <= regress_ranges[..., 1]))
# if there are still more than one objects for a location,
# we choose the one with minimal area
areas[inside_gt_bbox_mask == 0] = INF
areas[inside_regress_range == 0] = INF
min_area, min_area_inds = areas.min(dim=1)
labels = gt_labels[min_area_inds]
labels[min_area == INF] = self.num_classes # set as BG
bbox_targets = bbox_targets[range(num_points), min_area_inds]
return labels, bbox_targets
def centerness_target(self, pos_bbox_targets):
"""Compute centerness targets.
Args:
pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape
(num_pos, 4)
Returns:
Tensor: Centerness target.
"""
# only calculate pos centerness targets, otherwise there may be nan
left_right = pos_bbox_targets[:, [0, 2]]
top_bottom = pos_bbox_targets[:, [1, 3]]
centerness_targets = (
left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
return torch.sqrt(centerness_targets)
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, normal_init
from mmcv.ops import DeformConv2d
from mmdet.core import multi_apply, multiclass_nms
from ..builder import HEADS
from .anchor_free_head import AnchorFreeHead
INF = 1e8
class FeatureAlign(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
deform_groups=4):
super(FeatureAlign, self).__init__()
offset_channels = kernel_size * kernel_size * 2
self.conv_offset = nn.Conv2d(
4, deform_groups * offset_channels, 1, bias=False)
self.conv_adaption = DeformConv2d(
in_channels,
out_channels,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
deform_groups=deform_groups)
self.relu = nn.ReLU(inplace=True)
def init_weights(self):
normal_init(self.conv_offset, std=0.1)
normal_init(self.conv_adaption, std=0.01)
def forward(self, x, shape):
offset = self.conv_offset(shape)
x = self.relu(self.conv_adaption(x, offset))
return x
@HEADS.register_module()
class FoveaHead(AnchorFreeHead):
"""FoveaBox: Beyond Anchor-based Object Detector
https://arxiv.org/abs/1904.03797
"""
def __init__(self,
num_classes,
in_channels,
base_edge_list=(16, 32, 64, 128, 256),
scale_ranges=((8, 32), (16, 64), (32, 128), (64, 256), (128,
512)),
sigma=0.4,
with_deform=False,
deform_groups=4,
**kwargs):
self.base_edge_list = base_edge_list
self.scale_ranges = scale_ranges
self.sigma = sigma
self.with_deform = with_deform
self.deform_groups = deform_groups
super().__init__(num_classes, in_channels, **kwargs)
def _init_layers(self):
# box branch
super()._init_reg_convs()
self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
# cls branch
if not self.with_deform:
super()._init_cls_convs()
self.conv_cls = nn.Conv2d(
self.feat_channels, self.cls_out_channels, 3, padding=1)
else:
self.cls_convs = nn.ModuleList()
self.cls_convs.append(
ConvModule(
self.feat_channels, (self.feat_channels * 4),
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None))
self.cls_convs.append(
ConvModule((self.feat_channels * 4), (self.feat_channels * 4),
1,
stride=1,
padding=0,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None))
self.feature_adaption = FeatureAlign(
self.feat_channels,
self.feat_channels,
kernel_size=3,
deform_groups=self.deform_groups)
self.conv_cls = nn.Conv2d(
int(self.feat_channels * 4),
self.cls_out_channels,
3,
padding=1)
def init_weights(self):
super().init_weights()
if self.with_deform:
self.feature_adaption.init_weights()
def forward_single(self, x):
cls_feat = x
reg_feat = x
for reg_layer in self.reg_convs:
reg_feat = reg_layer(reg_feat)
bbox_pred = self.conv_reg(reg_feat)
if self.with_deform:
cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp())
for cls_layer in self.cls_convs:
cls_feat = cls_layer(cls_feat)
cls_score = self.conv_cls(cls_feat)
return cls_score, bbox_pred
def _get_points_single(self, *args, **kwargs):
y, x = super()._get_points_single(*args, **kwargs)
return y + 0.5, x + 0.5
def loss(self,
cls_scores,
bbox_preds,
gt_bbox_list,
gt_label_list,
img_metas,
gt_bboxes_ignore=None):
assert len(cls_scores) == len(bbox_preds)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
num_imgs = cls_scores[0].size(0)
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
for cls_score in cls_scores
]
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
for bbox_pred in bbox_preds
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_labels, flatten_bbox_targets = self.get_targets(
gt_bbox_list, gt_label_list, featmap_sizes, points)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
pos_inds = ((flatten_labels >= 0)
& (flatten_labels < self.num_classes)).nonzero().view(-1)
num_pos = len(pos_inds)
loss_cls = self.loss_cls(
flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs)
if num_pos > 0:
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_bbox_targets = flatten_bbox_targets[pos_inds]
pos_weights = pos_bbox_targets.new_zeros(
pos_bbox_targets.size()) + 1.0
loss_bbox = self.loss_bbox(
pos_bbox_preds,
pos_bbox_targets,
pos_weights,
avg_factor=num_pos)
else:
loss_bbox = torch.tensor(
0,
dtype=flatten_bbox_preds.dtype,
device=flatten_bbox_preds.device)
return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
def get_targets(self, gt_bbox_list, gt_label_list, featmap_sizes, points):
label_list, bbox_target_list = multi_apply(
self._get_target_single,
gt_bbox_list,
gt_label_list,
featmap_size_list=featmap_sizes,
point_list=points)
flatten_labels = [
torch.cat([
labels_level_img.flatten() for labels_level_img in labels_level
]) for labels_level in zip(*label_list)
]
flatten_bbox_targets = [
torch.cat([
bbox_targets_level_img.reshape(-1, 4)
for bbox_targets_level_img in bbox_targets_level
]) for bbox_targets_level in zip(*bbox_target_list)
]
flatten_labels = torch.cat(flatten_labels)
flatten_bbox_targets = torch.cat(flatten_bbox_targets)
return flatten_labels, flatten_bbox_targets
def _get_target_single(self,
gt_bboxes_raw,
gt_labels_raw,
featmap_size_list=None,
point_list=None):
gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) *
(gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1]))
label_list = []
bbox_target_list = []
# for each pyramid, find the cls and box target
for base_len, (lower_bound, upper_bound), stride, featmap_size, \
(y, x) in zip(self.base_edge_list, self.scale_ranges,
self.strides, featmap_size_list, point_list):
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
labels = gt_labels_raw.new_zeros(featmap_size) + self.num_classes
bbox_targets = gt_bboxes_raw.new(featmap_size[0], featmap_size[1],
4) + 1
# scale assignment
hit_indices = ((gt_areas >= lower_bound) &
(gt_areas <= upper_bound)).nonzero().flatten()
if len(hit_indices) == 0:
label_list.append(labels)
bbox_target_list.append(torch.log(bbox_targets))
continue
_, hit_index_order = torch.sort(-gt_areas[hit_indices])
hit_indices = hit_indices[hit_index_order]
gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride
gt_labels = gt_labels_raw[hit_indices]
half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0])
half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1])
# valid fovea area: left, right, top, down
pos_left = torch.ceil(
gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long().\
clamp(0, featmap_size[1] - 1)
pos_right = torch.floor(
gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long().\
clamp(0, featmap_size[1] - 1)
pos_top = torch.ceil(
gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long().\
clamp(0, featmap_size[0] - 1)
pos_down = torch.floor(
gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long().\
clamp(0, featmap_size[0] - 1)
for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \
zip(pos_left, pos_top, pos_right, pos_down, gt_labels,
gt_bboxes_raw[hit_indices, :]):
labels[py1:py2 + 1, px1:px2 + 1] = label
bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \
(stride * x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len
bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \
(stride * y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len
bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \
(gt_x2 - stride * x[py1:py2 + 1, px1:px2 + 1]) / base_len
bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \
(gt_y2 - stride * y[py1:py2 + 1, px1:px2 + 1]) / base_len
bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.)
label_list.append(labels)
bbox_target_list.append(torch.log(bbox_targets))
return label_list, bbox_target_list
def get_bboxes(self,
cls_scores,
bbox_preds,
img_metas,
cfg=None,
rescale=None):
assert len(cls_scores) == len(bbox_preds)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
points = self.get_points(
featmap_sizes,
bbox_preds[0].dtype,
bbox_preds[0].device,
flatten=True)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
det_bboxes = self._get_bboxes_single(cls_score_list,
bbox_pred_list, featmap_sizes,
points, img_shape,
scale_factor, cfg, rescale)
result_list.append(det_bboxes)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
featmap_sizes,
point_list,
img_shape,
scale_factor,
cfg,
rescale=False):
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(point_list)
det_bboxes = []
det_scores = []
for cls_score, bbox_pred, featmap_size, stride, base_len, (y, x) \
in zip(cls_scores, bbox_preds, featmap_sizes, self.strides,
self.base_edge_list, point_list):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4).exp()
nms_pre = cfg.get('nms_pre', -1)
if (nms_pre > 0) and (scores.shape[0] > nms_pre):
max_scores, _ = scores.max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
y = y[topk_inds]
x = x[topk_inds]
x1 = (stride * x - base_len * bbox_pred[:, 0]).\
clamp(min=0, max=img_shape[1] - 1)
y1 = (stride * y - base_len * bbox_pred[:, 1]).\
clamp(min=0, max=img_shape[0] - 1)
x2 = (stride * x + base_len * bbox_pred[:, 2]).\
clamp(min=0, max=img_shape[1] - 1)
y2 = (stride * y + base_len * bbox_pred[:, 3]).\
clamp(min=0, max=img_shape[0] - 1)
bboxes = torch.stack([x1, y1, x2, y2], -1)
det_bboxes.append(bboxes)
det_scores.append(scores)
det_bboxes = torch.cat(det_bboxes)
if rescale:
det_bboxes /= det_bboxes.new_tensor(scale_factor)
det_scores = torch.cat(det_scores)
padding = det_scores.new_zeros(det_scores.shape[0], 1)
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
det_scores = torch.cat([det_scores, padding], dim=1)
det_bboxes, det_labels = multiclass_nms(det_bboxes, det_scores,
cfg.score_thr, cfg.nms,
cfg.max_per_img)
return det_bboxes, det_labels
import torch
import torch.nn.functional as F
from mmdet.core import bbox_overlaps
from ..builder import HEADS
from .retina_head import RetinaHead
EPS = 1e-12
@HEADS.register_module()
class FreeAnchorRetinaHead(RetinaHead):
"""FreeAnchor RetinaHead used in https://arxiv.org/abs/1909.02466.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
stacked_convs (int): Number of conv layers in cls and reg tower.
Default: 4.
conv_cfg (dict): dictionary to construct and config conv layer.
Default: None.
norm_cfg (dict): dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32,
requires_grad=True).
pre_anchor_topk (int): Number of boxes that be token in each bag.
bbox_thr (float): The threshold of the saturated linear function. It is
usually the same with the IoU threshold used in NMS.
gamma (float): Gamma parameter in focal loss.
alpha (float): Alpha parameter in focal loss.
""" # noqa: W605
def __init__(self,
num_classes,
in_channels,
stacked_convs=4,
conv_cfg=None,
norm_cfg=None,
pre_anchor_topk=50,
bbox_thr=0.6,
gamma=2.0,
alpha=0.5,
**kwargs):
super(FreeAnchorRetinaHead,
self).__init__(num_classes, in_channels, stacked_convs, conv_cfg,
norm_cfg, **kwargs)
self.pre_anchor_topk = pre_anchor_topk
self.bbox_thr = bbox_thr
self.gamma = gamma
self.alpha = alpha
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute losses of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
gt_bboxes (list[Tensor]): each item are the truth boxes for each
image in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == len(self.anchor_generator.base_anchors)
anchor_list, _ = self.get_anchors(featmap_sizes, img_metas)
anchors = [torch.cat(anchor) for anchor in anchor_list]
# concatenate each level
cls_scores = [
cls.permute(0, 2, 3,
1).reshape(cls.size(0), -1, self.cls_out_channels)
for cls in cls_scores
]
bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(bbox_pred.size(0), -1, 4)
for bbox_pred in bbox_preds
]
cls_scores = torch.cat(cls_scores, dim=1)
bbox_preds = torch.cat(bbox_preds, dim=1)
cls_prob = torch.sigmoid(cls_scores)
box_prob = []
num_pos = 0
positive_losses = []
for _, (anchors_, gt_labels_, gt_bboxes_, cls_prob_,
bbox_preds_) in enumerate(
zip(anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds)):
with torch.no_grad():
if len(gt_bboxes_) == 0:
image_box_prob = torch.zeros(
anchors_.size(0),
self.cls_out_channels).type_as(bbox_preds_)
else:
# box_localization: a_{j}^{loc}, shape: [j, 4]
pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_)
# object_box_iou: IoU_{ij}^{loc}, shape: [i, j]
object_box_iou = bbox_overlaps(gt_bboxes_, pred_boxes)
# object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j]
t1 = self.bbox_thr
t2 = object_box_iou.max(
dim=1, keepdim=True).values.clamp(min=t1 + 1e-12)
object_box_prob = ((object_box_iou - t1) /
(t2 - t1)).clamp(
min=0, max=1)
# object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j]
num_obj = gt_labels_.size(0)
indices = torch.stack([
torch.arange(num_obj).type_as(gt_labels_), gt_labels_
],
dim=0)
object_cls_box_prob = torch.sparse_coo_tensor(
indices, object_box_prob)
# image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j]
"""
from "start" to "end" implement:
image_box_iou = torch.sparse.max(object_cls_box_prob,
dim=0).t()
"""
# start
box_cls_prob = torch.sparse.sum(
object_cls_box_prob, dim=0).to_dense()
indices = torch.nonzero(box_cls_prob, as_tuple=False).t_()
if indices.numel() == 0:
image_box_prob = torch.zeros(
anchors_.size(0),
self.cls_out_channels).type_as(object_box_prob)
else:
nonzero_box_prob = torch.where(
(gt_labels_.unsqueeze(dim=-1) == indices[0]),
object_box_prob[:, indices[1]],
torch.tensor([
0
]).type_as(object_box_prob)).max(dim=0).values
# upmap to shape [j, c]
image_box_prob = torch.sparse_coo_tensor(
indices.flip([0]),
nonzero_box_prob,
size=(anchors_.size(0),
self.cls_out_channels)).to_dense()
# end
box_prob.append(image_box_prob)
# construct bags for objects
match_quality_matrix = bbox_overlaps(gt_bboxes_, anchors_)
_, matched = torch.topk(
match_quality_matrix,
self.pre_anchor_topk,
dim=1,
sorted=False)
del match_quality_matrix
# matched_cls_prob: P_{ij}^{cls}
matched_cls_prob = torch.gather(
cls_prob_[matched], 2,
gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk,
1)).squeeze(2)
# matched_box_prob: P_{ij}^{loc}
matched_anchors = anchors_[matched]
matched_object_targets = self.bbox_coder.encode(
matched_anchors,
gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors))
loss_bbox = self.loss_bbox(
bbox_preds_[matched],
matched_object_targets,
reduction_override='none').sum(-1)
matched_box_prob = torch.exp(-loss_bbox)
# positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )}
num_pos += len(gt_bboxes_)
positive_losses.append(
self.positive_bag_loss(matched_cls_prob, matched_box_prob))
positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos)
# box_prob: P{a_{j} \in A_{+}}
box_prob = torch.stack(box_prob, dim=0)
# negative_loss:
# \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B||
negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max(
1, num_pos * self.pre_anchor_topk)
# avoid the absence of gradients in regression subnet
# when no ground-truth in a batch
if num_pos == 0:
positive_loss = bbox_preds.sum() * 0
losses = {
'positive_bag_loss': positive_loss,
'negative_bag_loss': negative_loss
}
return losses
def positive_bag_loss(self, matched_cls_prob, matched_box_prob):
"""Compute positive bag loss.
:math:`-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )`.
:math:`P_{ij}^{cls}`: matched_cls_prob, classification probability of matched samples.
:math:`P_{ij}^{loc}`: matched_box_prob, box probability of matched samples.
Args:
matched_cls_prob (Tensor): Classification probabilty of matched
samples in shape (num_gt, pre_anchor_topk).
matched_box_prob (Tensor): BBox probability of matched samples,
in shape (num_gt, pre_anchor_topk).
Returns:
Tensor: Positive bag loss in shape (num_gt,).
""" # noqa: E501, W605
# bag_prob = Mean-max(matched_prob)
matched_prob = matched_cls_prob * matched_box_prob
weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
weight /= weight.sum(dim=1).unsqueeze(dim=-1)
bag_prob = (weight * matched_prob).sum(dim=1)
# positive_bag_loss = -self.alpha * log(bag_prob)
return self.alpha * F.binary_cross_entropy(
bag_prob, torch.ones_like(bag_prob), reduction='none')
def negative_bag_loss(self, cls_prob, box_prob):
"""Compute negative bag loss.
:math:`FL((1 - P_{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}))`.
:math:`P_{a_{j} \in A_{+}}`: Box_probability of matched samples.
:math:`P_{j}^{bg}`: Classification probability of negative samples.
Args:
cls_prob (Tensor): Classification probability, in shape
(num_img, num_anchors, num_classes).
box_prob (Tensor): Box probability, in shape
(num_img, num_anchors, num_classes).
Returns:
Tensor: Negative bag loss in shape (num_img, num_anchors, num_classes).
""" # noqa: E501, W605
prob = cls_prob * (1 - box_prob)
# There are some cases when neg_prob = 0.
# This will cause the neg_prob.log() to be inf without clamp.
prob = prob.clamp(min=EPS, max=1 - EPS)
negative_bag_loss = prob**self.gamma * F.binary_cross_entropy(
prob, torch.zeros_like(prob), reduction='none')
return (1 - self.alpha) * negative_bag_loss
import numpy as np
import torch
from mmcv.cnn import normal_init
from mmcv.runner import force_fp32
from mmdet.core import (anchor_inside_flags, images_to_levels, multi_apply,
unmap)
from ..builder import HEADS
from ..losses.accuracy import accuracy
from ..losses.utils import weight_reduce_loss
from .retina_head import RetinaHead
@HEADS.register_module()
class FSAFHead(RetinaHead):
"""Anchor-free head used in `FSAF <https://arxiv.org/abs/1903.00621>`_.
The head contains two subnetworks. The first classifies anchor boxes and
the second regresses deltas for the anchors (num_anchors is 1 for anchor-
free methods)
Args:
*args: Same as its base class in :class:`RetinaHead`
score_threshold (float, optional): The score_threshold to calculate
positive recall. If given, prediction scores lower than this value
is counted as incorrect prediction. Default to None.
**kwargs: Same as its base class in :class:`RetinaHead`
Example:
>>> import torch
>>> self = FSAFHead(11, 7)
>>> x = torch.rand(1, 7, 32, 32)
>>> cls_score, bbox_pred = self.forward_single(x)
>>> # Each anchor predicts a score for each class except background
>>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
>>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
>>> assert cls_per_anchor == self.num_classes
>>> assert box_per_anchor == 4
"""
def __init__(self, *args, score_threshold=None, **kwargs):
super().__init__(*args, **kwargs)
self.score_threshold = score_threshold
def forward_single(self, x):
"""Forward feature map of a single scale level.
Args:
x (Tensor): Feature map of a single scale level.
Returns:
tuple (Tensor):
cls_score (Tensor): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W).
bbox_pred (Tensor): Box energies / deltas for each scale
level with shape (N, num_points * 4, H, W).
"""
cls_score, bbox_pred = super().forward_single(x)
# relu: TBLR encoder only accepts positive bbox_pred
return cls_score, self.relu(bbox_pred)
def init_weights(self):
"""Initialize weights of the head."""
super(FSAFHead, self).init_weights()
# The positive bias in self.retina_reg conv is to prevent predicted \
# bbox with 0 area
normal_init(self.retina_reg, std=0.01, bias=0.25)
def _get_targets_single(self,
flat_anchors,
valid_flags,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
img_meta,
label_channels=1,
unmap_outputs=True):
"""Compute regression and classification targets for anchors in a
single image.
Most of the codes are the same with the base class
:obj: `AnchorHead`, except that it also collects and returns
the matched gt index in the image (from 0 to num_gt-1). If the
anchor bbox is not matched to any gt, the corresponding value in
pos_gt_inds is -1.
"""
inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
img_meta['img_shape'][:2],
self.train_cfg.allowed_border)
if not inside_flags.any():
return (None, ) * 7
# Assign gt and sample anchors
anchors = flat_anchors[inside_flags.type(torch.bool), :]
assign_result = self.assigner.assign(
anchors, gt_bboxes, gt_bboxes_ignore,
None if self.sampling else gt_labels)
sampling_result = self.sampler.sample(assign_result, anchors,
gt_bboxes)
num_valid_anchors = anchors.shape[0]
bbox_targets = torch.zeros_like(anchors)
bbox_weights = torch.zeros_like(anchors)
labels = anchors.new_full((num_valid_anchors, ),
self.num_classes,
dtype=torch.long)
label_weights = anchors.new_zeros((num_valid_anchors, label_channels),
dtype=torch.float)
pos_gt_inds = anchors.new_full((num_valid_anchors, ),
-1,
dtype=torch.long)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
if len(pos_inds) > 0:
if not self.reg_decoded_bbox:
pos_bbox_targets = self.bbox_coder.encode(
sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
else:
# When the regression loss (e.g. `IouLoss`, `GIouLoss`)
# is applied directly on the decoded bounding boxes, both
# the predicted boxes and regression targets should be with
# absolute coordinate format.
pos_bbox_targets = sampling_result.pos_gt_bboxes
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
# The assigned gt_index for each anchor. (0-based)
pos_gt_inds[pos_inds] = sampling_result.pos_assigned_gt_inds
if gt_labels is None:
# Only rpn gives gt_labels as None
# Foreground is the first class
labels[pos_inds] = 0
else:
labels[pos_inds] = gt_labels[
sampling_result.pos_assigned_gt_inds]
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
# shadowed_labels is a tensor composed of tuples
# (anchor_inds, class_label) that indicate those anchors lying in the
# outer region of a gt or overlapped by another gt with a smaller
# area.
#
# Therefore, only the shadowed labels are ignored for loss calculation.
# the key `shadowed_labels` is defined in :obj:`CenterRegionAssigner`
shadowed_labels = assign_result.get_extra_property('shadowed_labels')
if shadowed_labels is not None and shadowed_labels.numel():
if len(shadowed_labels.shape) == 2:
idx_, label_ = shadowed_labels[:, 0], shadowed_labels[:, 1]
assert (labels[idx_] != label_).all(), \
'One label cannot be both positive and ignored'
label_weights[idx_, label_] = 0
else:
label_weights[shadowed_labels] = 0
# map up to original set of anchors
if unmap_outputs:
num_total_anchors = flat_anchors.size(0)
labels = unmap(labels, num_total_anchors, inside_flags)
label_weights = unmap(label_weights, num_total_anchors,
inside_flags)
bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
pos_gt_inds = unmap(
pos_gt_inds, num_total_anchors, inside_flags, fill=-1)
return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
neg_inds, sampling_result, pos_gt_inds)
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_points * 4, H, W).
gt_bboxes (list[Tensor]): each item are the truth boxes for each
image in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
for i in range(len(bbox_preds)): # loop over fpn level
# avoid 0 area of the predicted bbox
bbox_preds[i] = bbox_preds[i].clamp(min=1e-4)
# TODO: It may directly use the base-class loss function.
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
batch_size = len(gt_bboxes)
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.get_targets(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg,
pos_assigned_gt_inds_list) = cls_reg_targets
num_gts = np.array(list(map(len, gt_labels)))
num_total_samples = (
num_total_pos + num_total_neg if self.sampling else num_total_pos)
# anchor number of multi levels
num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
# concat all level anchors and flags to a single tensor
concat_anchor_list = []
for i in range(len(anchor_list)):
concat_anchor_list.append(torch.cat(anchor_list[i]))
all_anchor_list = images_to_levels(concat_anchor_list,
num_level_anchors)
losses_cls, losses_bbox = multi_apply(
self.loss_single,
cls_scores,
bbox_preds,
all_anchor_list,
labels_list,
label_weights_list,
bbox_targets_list,
bbox_weights_list,
num_total_samples=num_total_samples)
# `pos_assigned_gt_inds_list` (length: fpn_levels) stores the assigned
# gt index of each anchor bbox in each fpn level.
cum_num_gts = list(np.cumsum(num_gts)) # length of batch_size
for i, assign in enumerate(pos_assigned_gt_inds_list):
# loop over fpn levels
for j in range(1, batch_size):
# loop over batch size
# Convert gt indices in each img to those in the batch
assign[j][assign[j] >= 0] += int(cum_num_gts[j - 1])
pos_assigned_gt_inds_list[i] = assign.flatten()
labels_list[i] = labels_list[i].flatten()
num_gts = sum(map(len, gt_labels)) # total number of gt in the batch
# The unique label index of each gt in the batch
label_sequence = torch.arange(num_gts, device=device)
# Collect the average loss of each gt in each level
with torch.no_grad():
loss_levels, = multi_apply(
self.collect_loss_level_single,
losses_cls,
losses_bbox,
pos_assigned_gt_inds_list,
labels_seq=label_sequence)
# Shape: (fpn_levels, num_gts). Loss of each gt at each fpn level
loss_levels = torch.stack(loss_levels, dim=0)
# Locate the best fpn level for loss back-propagation
if loss_levels.numel() == 0: # zero gt
argmin = loss_levels.new_empty((num_gts, ), dtype=torch.long)
else:
_, argmin = loss_levels.min(dim=0)
# Reweight the loss of each (anchor, label) pair, so that only those
# at the best gt level are back-propagated.
losses_cls, losses_bbox, pos_inds = multi_apply(
self.reweight_loss_single,
losses_cls,
losses_bbox,
pos_assigned_gt_inds_list,
labels_list,
list(range(len(losses_cls))),
min_levels=argmin)
num_pos = torch.cat(pos_inds, 0).sum().float()
pos_recall = self.calculate_pos_recall(cls_scores, labels_list,
pos_inds)
if num_pos == 0: # No gt
avg_factor = num_pos + float(num_total_neg)
else:
avg_factor = num_pos
for i in range(len(losses_cls)):
losses_cls[i] /= avg_factor
losses_bbox[i] /= avg_factor
return dict(
loss_cls=losses_cls,
loss_bbox=losses_bbox,
num_pos=num_pos / batch_size,
pos_recall=pos_recall)
def calculate_pos_recall(self, cls_scores, labels_list, pos_inds):
"""Calculate positive recall with score threshold.
Args:
cls_scores (list[Tensor]): Classification scores at all fpn levels.
Each tensor is in shape (N, num_classes * num_anchors, H, W)
labels_list (list[Tensor]): The label that each anchor is assigned
to. Shape (N * H * W * num_anchors, )
pos_inds (list[Tensor]): List of bool tensors indicating whether
the anchor is assigned to a positive label.
Shape (N * H * W * num_anchors, )
Returns:
Tensor: A single float number indicating the positive recall.
"""
with torch.no_grad():
num_class = self.num_classes
scores = [
cls.permute(0, 2, 3, 1).reshape(-1, num_class)[pos]
for cls, pos in zip(cls_scores, pos_inds)
]
labels = [
label.reshape(-1)[pos]
for label, pos in zip(labels_list, pos_inds)
]
scores = torch.cat(scores, dim=0)
labels = torch.cat(labels, dim=0)
if self.use_sigmoid_cls:
scores = scores.sigmoid()
else:
scores = scores.softmax(dim=1)
return accuracy(scores, labels, thresh=self.score_threshold)
def collect_loss_level_single(self, cls_loss, reg_loss, assigned_gt_inds,
labels_seq):
"""Get the average loss in each FPN level w.r.t. each gt label.
Args:
cls_loss (Tensor): Classification loss of each feature map pixel,
shape (num_anchor, num_class)
reg_loss (Tensor): Regression loss of each feature map pixel,
shape (num_anchor, 4)
assigned_gt_inds (Tensor): It indicates which gt the prior is
assigned to (0-based, -1: no assignment). shape (num_anchor),
labels_seq: The rank of labels. shape (num_gt)
Returns:
shape: (num_gt), average loss of each gt in this level
"""
if len(reg_loss.shape) == 2: # iou loss has shape (num_prior, 4)
reg_loss = reg_loss.sum(dim=-1) # sum loss in tblr dims
if len(cls_loss.shape) == 2:
cls_loss = cls_loss.sum(dim=-1) # sum loss in class dims
loss = cls_loss + reg_loss
assert loss.size(0) == assigned_gt_inds.size(0)
# Default loss value is 1e6 for a layer where no anchor is positive
# to ensure it will not be chosen to back-propagate gradient
losses_ = loss.new_full(labels_seq.shape, 1e6)
for i, l in enumerate(labels_seq):
match = assigned_gt_inds == l
if match.any():
losses_[i] = loss[match].mean()
return losses_,
def reweight_loss_single(self, cls_loss, reg_loss, assigned_gt_inds,
labels, level, min_levels):
"""Reweight loss values at each level.
Reassign loss values at each level by masking those where the
pre-calculated loss is too large. Then return the reduced losses.
Args:
cls_loss (Tensor): Element-wise classification loss.
Shape: (num_anchors, num_classes)
reg_loss (Tensor): Element-wise regression loss.
Shape: (num_anchors, 4)
assigned_gt_inds (Tensor): The gt indices that each anchor bbox
is assigned to. -1 denotes a negative anchor, otherwise it is the
gt index (0-based). Shape: (num_anchors, ),
labels (Tensor): Label assigned to anchors. Shape: (num_anchors, ).
level (int): The current level index in the pyramid
(0-4 for RetinaNet)
min_levels (Tensor): The best-matching level for each gt.
Shape: (num_gts, ),
Returns:
tuple:
- cls_loss: Reduced corrected classification loss. Scalar.
- reg_loss: Reduced corrected regression loss. Scalar.
- pos_flags (Tensor): Corrected bool tensor indicating the
final postive anchors. Shape: (num_anchors, ).
"""
loc_weight = torch.ones_like(reg_loss)
cls_weight = torch.ones_like(cls_loss)
pos_flags = assigned_gt_inds >= 0 # positive pixel flag
pos_indices = torch.nonzero(pos_flags, as_tuple=False).flatten()
if pos_flags.any(): # pos pixels exist
pos_assigned_gt_inds = assigned_gt_inds[pos_flags]
zeroing_indices = (min_levels[pos_assigned_gt_inds] != level)
neg_indices = pos_indices[zeroing_indices]
if neg_indices.numel():
pos_flags[neg_indices] = 0
loc_weight[neg_indices] = 0
# Only the weight corresponding to the label is
# zeroed out if not selected
zeroing_labels = labels[neg_indices]
assert (zeroing_labels >= 0).all()
cls_weight[neg_indices, zeroing_labels] = 0
# Weighted loss for both cls and reg loss
cls_loss = weight_reduce_loss(cls_loss, cls_weight, reduction='sum')
reg_loss = weight_reduce_loss(reg_loss, loc_weight, reduction='sum')
return cls_loss, reg_loss, pos_flags
import torch.nn as nn
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.ops import MaskedConv2d
from ..builder import HEADS
from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead
@HEADS.register_module()
class GARetinaHead(GuidedAnchorHead):
"""Guided-Anchor-based RetinaNet head."""
def __init__(self,
num_classes,
in_channels,
stacked_convs=4,
conv_cfg=None,
norm_cfg=None,
**kwargs):
self.stacked_convs = stacked_convs
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
super(GARetinaHead, self).__init__(num_classes, in_channels, **kwargs)
def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1)
self.conv_shape = nn.Conv2d(self.feat_channels, self.num_anchors * 2,
1)
self.feature_adaption_cls = FeatureAdaption(
self.feat_channels,
self.feat_channels,
kernel_size=3,
deform_groups=self.deform_groups)
self.feature_adaption_reg = FeatureAdaption(
self.feat_channels,
self.feat_channels,
kernel_size=3,
deform_groups=self.deform_groups)
self.retina_cls = MaskedConv2d(
self.feat_channels,
self.num_anchors * self.cls_out_channels,
3,
padding=1)
self.retina_reg = MaskedConv2d(
self.feat_channels, self.num_anchors * 4, 3, padding=1)
def init_weights(self):
"""Initialize weights of the layer."""
for m in self.cls_convs:
normal_init(m.conv, std=0.01)
for m in self.reg_convs:
normal_init(m.conv, std=0.01)
self.feature_adaption_cls.init_weights()
self.feature_adaption_reg.init_weights()
bias_cls = bias_init_with_prob(0.01)
normal_init(self.conv_loc, std=0.01, bias=bias_cls)
normal_init(self.conv_shape, std=0.01)
normal_init(self.retina_cls, std=0.01, bias=bias_cls)
normal_init(self.retina_reg, std=0.01)
def forward_single(self, x):
"""Forward feature map of a single scale level."""
cls_feat = x
reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in self.reg_convs:
reg_feat = reg_conv(reg_feat)
loc_pred = self.conv_loc(cls_feat)
shape_pred = self.conv_shape(reg_feat)
cls_feat = self.feature_adaption_cls(cls_feat, shape_pred)
reg_feat = self.feature_adaption_reg(reg_feat, shape_pred)
if not self.training:
mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
else:
mask = None
cls_score = self.retina_cls(cls_feat, mask)
bbox_pred = self.retina_reg(reg_feat, mask)
return cls_score, bbox_pred, shape_pred, loc_pred
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import normal_init
from mmcv.ops import nms
from ..builder import HEADS
from .guided_anchor_head import GuidedAnchorHead
from .rpn_test_mixin import RPNTestMixin
@HEADS.register_module()
class GARPNHead(RPNTestMixin, GuidedAnchorHead):
"""Guided-Anchor-based RPN head."""
def __init__(self, in_channels, **kwargs):
super(GARPNHead, self).__init__(1, in_channels, **kwargs)
def _init_layers(self):
"""Initialize layers of the head."""
self.rpn_conv = nn.Conv2d(
self.in_channels, self.feat_channels, 3, padding=1)
super(GARPNHead, self)._init_layers()
def init_weights(self):
"""Initialize weights of the head."""
normal_init(self.rpn_conv, std=0.01)
super(GARPNHead, self).init_weights()
def forward_single(self, x):
"""Forward feature of a single scale level."""
x = self.rpn_conv(x)
x = F.relu(x, inplace=True)
(cls_score, bbox_pred, shape_pred,
loc_pred) = super(GARPNHead, self).forward_single(x)
return cls_score, bbox_pred, shape_pred, loc_pred
def loss(self,
cls_scores,
bbox_preds,
shape_preds,
loc_preds,
gt_bboxes,
img_metas,
gt_bboxes_ignore=None):
losses = super(GARPNHead, self).loss(
cls_scores,
bbox_preds,
shape_preds,
loc_preds,
gt_bboxes,
None,
img_metas,
gt_bboxes_ignore=gt_bboxes_ignore)
return dict(
loss_rpn_cls=losses['loss_cls'],
loss_rpn_bbox=losses['loss_bbox'],
loss_anchor_shape=losses['loss_shape'],
loss_anchor_loc=losses['loss_loc'])
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_anchors,
mlvl_masks,
img_shape,
scale_factor,
cfg,
rescale=False):
cfg = self.test_cfg if cfg is None else cfg
mlvl_proposals = []
for idx in range(len(cls_scores)):
rpn_cls_score = cls_scores[idx]
rpn_bbox_pred = bbox_preds[idx]
anchors = mlvl_anchors[idx]
mask = mlvl_masks[idx]
assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
# if no location is kept, end.
if mask.sum() == 0:
continue
rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
if self.use_sigmoid_cls:
rpn_cls_score = rpn_cls_score.reshape(-1)
scores = rpn_cls_score.sigmoid()
else:
rpn_cls_score = rpn_cls_score.reshape(-1, 2)
# remind that we set FG labels to [0, num_class-1]
# since mmdet v2.0
# BG cat_id: num_class
scores = rpn_cls_score.softmax(dim=1)[:, :-1]
# filter scores, bbox_pred w.r.t. mask.
# anchors are filtered in get_anchors() beforehand.
scores = scores[mask]
rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1,
4)[mask, :]
if scores.dim() == 0:
rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0)
anchors = anchors.unsqueeze(0)
scores = scores.unsqueeze(0)
# filter anchors, bbox_pred, scores w.r.t. scores
if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
_, topk_inds = scores.topk(cfg.nms_pre)
rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
anchors = anchors[topk_inds, :]
scores = scores[topk_inds]
# get proposals w.r.t. anchors and rpn_bbox_pred
proposals = self.bbox_coder.decode(
anchors, rpn_bbox_pred, max_shape=img_shape)
# filter out too small bboxes
if cfg.min_bbox_size > 0:
w = proposals[:, 2] - proposals[:, 0]
h = proposals[:, 3] - proposals[:, 1]
valid_inds = torch.nonzero(
(w >= cfg.min_bbox_size) & (h >= cfg.min_bbox_size),
as_tuple=False).squeeze()
proposals = proposals[valid_inds, :]
scores = scores[valid_inds]
# NMS in current level
proposals, _ = nms(proposals, scores, cfg.nms_thr)
proposals = proposals[:cfg.nms_post, :]
mlvl_proposals.append(proposals)
proposals = torch.cat(mlvl_proposals, 0)
if cfg.nms_across_levels:
# NMS across multi levels
proposals, _ = nms(proposals[:, :4], proposals[:, -1], cfg.nms_thr)
proposals = proposals[:cfg.max_num, :]
else:
scores = proposals[:, 4]
num = min(cfg.max_num, proposals.shape[0])
_, topk_inds = scores.topk(num)
proposals = proposals[topk_inds, :]
return proposals
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from mmdet.core import (anchor_inside_flags, bbox2distance, bbox_overlaps,
build_assigner, build_sampler, distance2bbox,
images_to_levels, multi_apply, multiclass_nms,
reduce_mean, unmap)
from ..builder import HEADS, build_loss
from .anchor_head import AnchorHead
class Integral(nn.Module):
"""A fixed layer for calculating integral result from distribution.
This layer calculates the target location by :math: `sum{P(y_i) * y_i}`,
P(y_i) denotes the softmax vector that represents the discrete distribution
y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}
Args:
reg_max (int): The maximal value of the discrete set. Default: 16. You
may want to reset it according to your new dataset or related
settings.
"""
def __init__(self, reg_max=16):
super(Integral, self).__init__()
self.reg_max = reg_max
self.register_buffer('project',
torch.linspace(0, self.reg_max, self.reg_max + 1))
def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
Args:
x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
n is self.reg_max.
Returns:
x (Tensor): Integral result of box locations, i.e., distance
offsets from the box center in four directions, shape (N, 4).
"""
x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1)
x = F.linear(x, self.project.type_as(x)).reshape(-1, 4)
return x
@HEADS.register_module()
class GFLHead(AnchorHead):
"""Generalized Focal Loss: Learning Qualified and Distributed Bounding
Boxes for Dense Object Detection.
GFL head structure is similar with ATSS, however GFL uses
1) joint representation for classification and localization quality, and
2) flexible General distribution for bounding box locations,
which are supervised by
Quality Focal Loss (QFL) and Distribution Focal Loss (DFL), respectively
https://arxiv.org/abs/2006.04388
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
stacked_convs (int): Number of conv layers in cls and reg tower.
Default: 4.
conv_cfg (dict): dictionary to construct and config conv layer.
Default: None.
norm_cfg (dict): dictionary to construct and config norm layer.
Default: dict(type='GN', num_groups=32, requires_grad=True).
loss_qfl (dict): Config of Quality Focal Loss (QFL).
reg_max (int): Max value of integral set :math: `{0, ..., reg_max}`
in QFL setting. Default: 16.
Example:
>>> self = GFLHead(11, 7)
>>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
>>> cls_quality_score, bbox_pred = self.forward(feats)
>>> assert len(cls_quality_score) == len(self.scales)
"""
def __init__(self,
num_classes,
in_channels,
stacked_convs=4,
conv_cfg=None,
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25),
reg_max=16,
**kwargs):
self.stacked_convs = stacked_convs
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.reg_max = reg_max
super(GFLHead, self).__init__(num_classes, in_channels, **kwargs)
self.sampling = False
if self.train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
# SSD sampling=False so use PseudoSampler
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.integral = Integral(self.reg_max)
self.loss_dfl = build_loss(loss_dfl)
def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
assert self.num_anchors == 1, 'anchor free version'
self.gfl_cls = nn.Conv2d(
self.feat_channels, self.cls_out_channels, 3, padding=1)
self.gfl_reg = nn.Conv2d(
self.feat_channels, 4 * (self.reg_max + 1), 3, padding=1)
self.scales = nn.ModuleList(
[Scale(1.0) for _ in self.anchor_generator.strides])
def init_weights(self):
"""Initialize weights of the head."""
for m in self.cls_convs:
normal_init(m.conv, std=0.01)
for m in self.reg_convs:
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.gfl_cls, std=0.01, bias=bias_cls)
normal_init(self.gfl_reg, std=0.01)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple: Usually a tuple of classification scores and bbox prediction
cls_scores (list[Tensor]): Classification and quality (IoU)
joint scores for all scale levels, each is a 4D-tensor,
the channel number is num_classes.
bbox_preds (list[Tensor]): Box distribution logits for all
scale levels, each is a 4D-tensor, the channel number is
4*(n+1), n is max value of integral set.
"""
return multi_apply(self.forward_single, feats, self.scales)
def forward_single(self, x, scale):
"""Forward feature of a single scale level.
Args:
x (Tensor): Features of a single scale level.
scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
the bbox prediction.
Returns:
tuple:
cls_score (Tensor): Cls and quality joint scores for a single
scale level the channel number is num_classes.
bbox_pred (Tensor): Box distribution logits for a single scale
level, the channel number is 4*(n+1), n is max value of
integral set.
"""
cls_feat = x
reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in self.reg_convs:
reg_feat = reg_conv(reg_feat)
cls_score = self.gfl_cls(cls_feat)
bbox_pred = scale(self.gfl_reg(reg_feat)).float()
return cls_score, bbox_pred
def anchor_center(self, anchors):
"""Get anchor centers from anchors.
Args:
anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format.
Returns:
Tensor: Anchor centers with shape (N, 2), "xy" format.
"""
anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2
anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2
return torch.stack([anchors_cx, anchors_cy], dim=-1)
def loss_single(self, anchors, cls_score, bbox_pred, labels, label_weights,
bbox_targets, stride, num_total_samples):
"""Compute loss of a single scale level.
Args:
anchors (Tensor): Box reference for each scale level with shape
(N, num_total_anchors, 4).
cls_score (Tensor): Cls and quality joint scores for each scale
level has shape (N, num_classes, H, W).
bbox_pred (Tensor): Box distribution logits for each scale
level with shape (N, 4*(n+1), H, W), n is max value of integral
set.
labels (Tensor): Labels of each anchors with shape
(N, num_total_anchors).
label_weights (Tensor): Label weights of each anchor with shape
(N, num_total_anchors)
bbox_targets (Tensor): BBox regression targets of each anchor wight
shape (N, num_total_anchors, 4).
stride (tuple): Stride in this scale level.
num_total_samples (int): Number of positive samples that is
reduced over all GPUs.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert stride[0] == stride[1], 'h stride is not equal to w stride!'
anchors = anchors.reshape(-1, 4)
cls_score = cls_score.permute(0, 2, 3,
1).reshape(-1, self.cls_out_channels)
bbox_pred = bbox_pred.permute(0, 2, 3,
1).reshape(-1, 4 * (self.reg_max + 1))
bbox_targets = bbox_targets.reshape(-1, 4)
labels = labels.reshape(-1)
label_weights = label_weights.reshape(-1)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
bg_class_ind = self.num_classes
pos_inds = ((labels >= 0)
& (labels < bg_class_ind)).nonzero().squeeze(1)
score = label_weights.new_zeros(labels.shape)
if len(pos_inds) > 0:
pos_bbox_targets = bbox_targets[pos_inds]
pos_bbox_pred = bbox_pred[pos_inds]
pos_anchors = anchors[pos_inds]
pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0]
weight_targets = cls_score.detach().sigmoid()
weight_targets = weight_targets.max(dim=1)[0][pos_inds]
pos_bbox_pred_corners = self.integral(pos_bbox_pred)
pos_decode_bbox_pred = distance2bbox(pos_anchor_centers,
pos_bbox_pred_corners)
pos_decode_bbox_targets = pos_bbox_targets / stride[0]
score[pos_inds] = bbox_overlaps(
pos_decode_bbox_pred.detach(),
pos_decode_bbox_targets,
is_aligned=True)
pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1)
target_corners = bbox2distance(pos_anchor_centers,
pos_decode_bbox_targets,
self.reg_max).reshape(-1)
# regression loss
loss_bbox = self.loss_bbox(
pos_decode_bbox_pred,
pos_decode_bbox_targets,
weight=weight_targets,
avg_factor=1.0)
# dfl loss
loss_dfl = self.loss_dfl(
pred_corners,
target_corners,
weight=weight_targets[:, None].expand(-1, 4).reshape(-1),
avg_factor=4.0)
else:
loss_bbox = bbox_pred.sum() * 0
loss_dfl = bbox_pred.sum() * 0
weight_targets = torch.tensor(0).cuda()
# cls (qfl) loss
loss_cls = self.loss_cls(
cls_score, (labels, score),
weight=label_weights,
avg_factor=num_total_samples)
return loss_cls, loss_bbox, loss_dfl, weight_targets.sum()
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute losses of the head.
Args:
cls_scores (list[Tensor]): Cls and quality scores for each scale
level has shape (N, num_classes, H, W).
bbox_preds (list[Tensor]): Box distribution logits for each scale
level with shape (N, 4*(n+1), H, W), n is max value of integral
set.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (list[Tensor] | None): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.get_targets(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels)
if cls_reg_targets is None:
return None
(anchor_list, labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
num_total_samples = reduce_mean(
torch.tensor(num_total_pos, dtype=torch.float,
device=device)).item()
num_total_samples = max(num_total_samples, 1.0)
losses_cls, losses_bbox, losses_dfl,\
avg_factor = multi_apply(
self.loss_single,
anchor_list,
cls_scores,
bbox_preds,
labels_list,
label_weights_list,
bbox_targets_list,
self.anchor_generator.strides,
num_total_samples=num_total_samples)
avg_factor = sum(avg_factor)
avg_factor = reduce_mean(avg_factor).item()
losses_bbox = list(map(lambda x: x / avg_factor, losses_bbox))
losses_dfl = list(map(lambda x: x / avg_factor, losses_dfl))
return dict(
loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dfl=losses_dfl)
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False,
with_nms=True):
"""Transform outputs for a single batch item into labeled boxes.
Args:
cls_scores (list[Tensor]): Box scores for a single scale level
has shape (num_classes, H, W).
bbox_preds (list[Tensor]): Box distribution logits for a single
scale level with shape (4*(n+1), H, W), n is max value of
integral set.
mlvl_anchors (list[Tensor]): Box reference for a single scale level
with shape (num_total_anchors, 4).
img_shape (tuple[int]): Shape of the input image,
(height, width, 3).
scale_factor (ndarray): Scale factor of the image arange as
(w_scale, h_scale, w_scale, h_scale).
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before return boxes.
Default: True.
Returns:
tuple(Tensor):
det_bboxes (Tensor): Bbox predictions in shape (N, 5), where
the first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column is a score
between 0 and 1.
det_labels (Tensor): A (N,) tensor where each item is the
predicted class label of the corresponding box.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
for cls_score, bbox_pred, stride, anchors in zip(
cls_scores, bbox_preds, self.anchor_generator.strides,
mlvl_anchors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
assert stride[0] == stride[1]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
bbox_pred = bbox_pred.permute(1, 2, 0)
bbox_pred = self.integral(bbox_pred) * stride[0]
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
max_scores, _ = scores.max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
bboxes = distance2bbox(
self.anchor_center(anchors), bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
# Add a dummy background class to the backend when using sigmoid
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
if with_nms:
det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
cfg.score_thr, cfg.nms,
cfg.max_per_img)
return det_bboxes, det_labels
else:
return mlvl_bboxes, mlvl_scores
def get_targets(self,
anchor_list,
valid_flag_list,
gt_bboxes_list,
img_metas,
gt_bboxes_ignore_list=None,
gt_labels_list=None,
label_channels=1,
unmap_outputs=True):
"""Get targets for GFL head.
This method is almost the same as `AnchorHead.get_targets()`. Besides
returning the targets as the parent method does, it also returns the
anchors as the first element of the returned tuple.
"""
num_imgs = len(img_metas)
assert len(anchor_list) == len(valid_flag_list) == num_imgs
# anchor number of multi levels
num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
num_level_anchors_list = [num_level_anchors] * num_imgs
# concat all level anchors and flags to a single tensor
for i in range(num_imgs):
assert len(anchor_list[i]) == len(valid_flag_list[i])
anchor_list[i] = torch.cat(anchor_list[i])
valid_flag_list[i] = torch.cat(valid_flag_list[i])
# compute targets for each image
if gt_bboxes_ignore_list is None:
gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
if gt_labels_list is None:
gt_labels_list = [None for _ in range(num_imgs)]
(all_anchors, all_labels, all_label_weights, all_bbox_targets,
all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply(
self._get_target_single,
anchor_list,
valid_flag_list,
num_level_anchors_list,
gt_bboxes_list,
gt_bboxes_ignore_list,
gt_labels_list,
img_metas,
label_channels=label_channels,
unmap_outputs=unmap_outputs)
# no valid anchors
if any([labels is None for labels in all_labels]):
return None
# sampled anchors of all images
num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
# split targets to a list w.r.t. multiple levels
anchors_list = images_to_levels(all_anchors, num_level_anchors)
labels_list = images_to_levels(all_labels, num_level_anchors)
label_weights_list = images_to_levels(all_label_weights,
num_level_anchors)
bbox_targets_list = images_to_levels(all_bbox_targets,
num_level_anchors)
bbox_weights_list = images_to_levels(all_bbox_weights,
num_level_anchors)
return (anchors_list, labels_list, label_weights_list,
bbox_targets_list, bbox_weights_list, num_total_pos,
num_total_neg)
def _get_target_single(self,
flat_anchors,
valid_flags,
num_level_anchors,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
img_meta,
label_channels=1,
unmap_outputs=True):
"""Compute regression, classification targets for anchors in a single
image.
Args:
flat_anchors (Tensor): Multi-level anchors of the image, which are
concatenated into a single tensor of shape (num_anchors, 4)
valid_flags (Tensor): Multi level valid flags of the image,
which are concatenated into a single tensor of
shape (num_anchors,).
num_level_anchors Tensor): Number of anchors of each scale level.
gt_bboxes (Tensor): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_bboxes_ignore (Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
gt_labels (Tensor): Ground truth labels of each box,
shape (num_gts,).
img_meta (dict): Meta info of the image.
label_channels (int): Channel of label.
unmap_outputs (bool): Whether to map outputs back to the original
set of anchors.
Returns:
tuple: N is the number of total anchors in the image.
anchors (Tensor): All anchors in the image with shape (N, 4).
labels (Tensor): Labels of all anchors in the image with shape
(N,).
label_weights (Tensor): Label weights of all anchor in the
image with shape (N,).
bbox_targets (Tensor): BBox targets of all anchors in the
image with shape (N, 4).
bbox_weights (Tensor): BBox weights of all anchors in the
image with shape (N, 4).
pos_inds (Tensor): Indices of postive anchor with shape
(num_pos,).
neg_inds (Tensor): Indices of negative anchor with shape
(num_neg,).
"""
inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
img_meta['img_shape'][:2],
self.train_cfg.allowed_border)
if not inside_flags.any():
return (None, ) * 7
# assign gt and sample anchors
anchors = flat_anchors[inside_flags, :]
num_level_anchors_inside = self.get_num_level_anchors_inside(
num_level_anchors, inside_flags)
assign_result = self.assigner.assign(anchors, num_level_anchors_inside,
gt_bboxes, gt_bboxes_ignore,
gt_labels)
sampling_result = self.sampler.sample(assign_result, anchors,
gt_bboxes)
num_valid_anchors = anchors.shape[0]
bbox_targets = torch.zeros_like(anchors)
bbox_weights = torch.zeros_like(anchors)
labels = anchors.new_full((num_valid_anchors, ),
self.num_classes,
dtype=torch.long)
label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
if len(pos_inds) > 0:
pos_bbox_targets = sampling_result.pos_gt_bboxes
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
if gt_labels is None:
# Only rpn gives gt_labels as None
# Foreground is the first class
labels[pos_inds] = 0
else:
labels[pos_inds] = gt_labels[
sampling_result.pos_assigned_gt_inds]
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
# map up to original set of anchors
if unmap_outputs:
num_total_anchors = flat_anchors.size(0)
anchors = unmap(anchors, num_total_anchors, inside_flags)
labels = unmap(
labels, num_total_anchors, inside_flags, fill=self.num_classes)
label_weights = unmap(label_weights, num_total_anchors,
inside_flags)
bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
return (anchors, labels, label_weights, bbox_targets, bbox_weights,
pos_inds, neg_inds)
def get_num_level_anchors_inside(self, num_level_anchors, inside_flags):
split_inside_flags = torch.split(inside_flags, num_level_anchors)
num_level_anchors_inside = [
int(flags.sum()) for flags in split_inside_flags
]
return num_level_anchors_inside
import torch
import torch.nn as nn
from mmcv.cnn import bias_init_with_prob, normal_init
from mmcv.ops import DeformConv2d, MaskedConv2d
from mmcv.runner import force_fp32
from mmdet.core import (anchor_inside_flags, build_anchor_generator,
build_assigner, build_bbox_coder, build_sampler,
calc_region, images_to_levels, multi_apply,
multiclass_nms, unmap)
from ..builder import HEADS, build_loss
from .anchor_head import AnchorHead
class FeatureAdaption(nn.Module):
"""Feature Adaption Module.
Feature Adaption Module is implemented based on DCN v1.
It uses anchor shape prediction rather than feature map to
predict offsets of deform conv layer.
Args:
in_channels (int): Number of channels in the input feature map.
out_channels (int): Number of channels in the output feature map.
kernel_size (int): Deformable conv kernel size.
deform_groups (int): Deformable conv group size.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
deform_groups=4):
super(FeatureAdaption, self).__init__()
offset_channels = kernel_size * kernel_size * 2
self.conv_offset = nn.Conv2d(
2, deform_groups * offset_channels, 1, bias=False)
self.conv_adaption = DeformConv2d(
in_channels,
out_channels,
kernel_size=kernel_size,
padding=(kernel_size - 1) // 2,
deform_groups=deform_groups)
self.relu = nn.ReLU(inplace=True)
def init_weights(self):
normal_init(self.conv_offset, std=0.1)
normal_init(self.conv_adaption, std=0.01)
def forward(self, x, shape):
offset = self.conv_offset(shape.detach())
x = self.relu(self.conv_adaption(x, offset))
return x
@HEADS.register_module()
class GuidedAnchorHead(AnchorHead):
"""Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).
This GuidedAnchorHead will predict high-quality feature guided
anchors and locations where anchors will be kept in inference.
There are mainly 3 categories of bounding-boxes.
- Sampled 9 pairs for target assignment. (approxes)
- The square boxes where the predicted anchors are based on. (squares)
- Guided anchors.
Please refer to https://arxiv.org/abs/1901.03278 for more details.
Args:
num_classes (int): Number of classes.
in_channels (int): Number of channels in the input feature map.
feat_channels (int): Number of hidden channels.
approx_anchor_generator (dict): Config dict for approx generator
square_anchor_generator (dict): Config dict for square generator
anchor_coder (dict): Config dict for anchor coder
bbox_coder (dict): Config dict for bbox coder
reg_decoded_bbox (bool): If true, the regression loss would be
applied directly on decoded bounding boxes, converting both
the predicted boxes and regression targets to absolute
coordinates format. Default False. It should be `True` when
using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
deform_groups: (int): Group number of DCN in
FeatureAdaption module.
loc_filter_thr (float): Threshold to filter out unconcerned regions.
loss_loc (dict): Config of location loss.
loss_shape (dict): Config of anchor shape loss.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of bbox regression loss.
"""
def __init__(
self,
num_classes,
in_channels,
feat_channels=256,
approx_anchor_generator=dict(
type='AnchorGenerator',
octave_base_scale=8,
scales_per_octave=3,
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
square_anchor_generator=dict(
type='AnchorGenerator',
ratios=[1.0],
scales=[8],
strides=[4, 8, 16, 32, 64]),
anchor_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]
),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]
),
reg_decoded_bbox=False,
deform_groups=4,
loc_filter_thr=0.01,
train_cfg=None,
test_cfg=None,
loss_loc=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)): # yapf: disable
super(AnchorHead, self).__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.feat_channels = feat_channels
self.deform_groups = deform_groups
self.loc_filter_thr = loc_filter_thr
# build approx_anchor_generator and square_anchor_generator
assert (approx_anchor_generator['octave_base_scale'] ==
square_anchor_generator['scales'][0])
assert (approx_anchor_generator['strides'] ==
square_anchor_generator['strides'])
self.approx_anchor_generator = build_anchor_generator(
approx_anchor_generator)
self.square_anchor_generator = build_anchor_generator(
square_anchor_generator)
self.approxs_per_octave = self.approx_anchor_generator \
.num_base_anchors[0]
self.reg_decoded_bbox = reg_decoded_bbox
# one anchor per location
self.num_anchors = 1
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
self.sampling = loss_cls['type'] not in ['FocalLoss']
self.ga_sampling = train_cfg is not None and hasattr(
train_cfg, 'ga_sampler')
if self.use_sigmoid_cls:
self.cls_out_channels = self.num_classes
else:
self.cls_out_channels = self.num_classes + 1
# build bbox_coder
self.anchor_coder = build_bbox_coder(anchor_coder)
self.bbox_coder = build_bbox_coder(bbox_coder)
# build losses
self.loss_loc = build_loss(loss_loc)
self.loss_shape = build_loss(loss_shape)
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if self.train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
# use PseudoSampler when sampling is False
if self.sampling and hasattr(self.train_cfg, 'sampler'):
sampler_cfg = self.train_cfg.sampler
else:
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.ga_assigner = build_assigner(self.train_cfg.ga_assigner)
if self.ga_sampling:
ga_sampler_cfg = self.train_cfg.ga_sampler
else:
ga_sampler_cfg = dict(type='PseudoSampler')
self.ga_sampler = build_sampler(ga_sampler_cfg, context=self)
self.fp16_enabled = False
self._init_layers()
def _init_layers(self):
self.relu = nn.ReLU(inplace=True)
self.conv_loc = nn.Conv2d(self.in_channels, 1, 1)
self.conv_shape = nn.Conv2d(self.in_channels, self.num_anchors * 2, 1)
self.feature_adaption = FeatureAdaption(
self.in_channels,
self.feat_channels,
kernel_size=3,
deform_groups=self.deform_groups)
self.conv_cls = MaskedConv2d(self.feat_channels,
self.num_anchors * self.cls_out_channels,
1)
self.conv_reg = MaskedConv2d(self.feat_channels, self.num_anchors * 4,
1)
def init_weights(self):
normal_init(self.conv_cls, std=0.01)
normal_init(self.conv_reg, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.conv_loc, std=0.01, bias=bias_cls)
normal_init(self.conv_shape, std=0.01)
self.feature_adaption.init_weights()
def forward_single(self, x):
loc_pred = self.conv_loc(x)
shape_pred = self.conv_shape(x)
x = self.feature_adaption(x, shape_pred)
# masked conv is only used during inference for speed-up
if not self.training:
mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
else:
mask = None
cls_score = self.conv_cls(x, mask)
bbox_pred = self.conv_reg(x, mask)
return cls_score, bbox_pred, shape_pred, loc_pred
def forward(self, feats):
return multi_apply(self.forward_single, feats)
def get_sampled_approxs(self, featmap_sizes, img_metas, device='cuda'):
"""Get sampled approxs and inside flags according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
img_metas (list[dict]): Image meta info.
device (torch.device | str): device for returned tensors
Returns:
tuple: approxes of each image, inside flags of each image
"""
num_imgs = len(img_metas)
# since feature map sizes of all images are the same, we only compute
# approxes for one time
multi_level_approxs = self.approx_anchor_generator.grid_anchors(
featmap_sizes, device=device)
approxs_list = [multi_level_approxs for _ in range(num_imgs)]
# for each image, we compute inside flags of multi level approxes
inside_flag_list = []
for img_id, img_meta in enumerate(img_metas):
multi_level_flags = []
multi_level_approxs = approxs_list[img_id]
# obtain valid flags for each approx first
multi_level_approx_flags = self.approx_anchor_generator \
.valid_flags(featmap_sizes,
img_meta['pad_shape'],
device=device)
for i, flags in enumerate(multi_level_approx_flags):
approxs = multi_level_approxs[i]
inside_flags_list = []
for i in range(self.approxs_per_octave):
split_valid_flags = flags[i::self.approxs_per_octave]
split_approxs = approxs[i::self.approxs_per_octave, :]
inside_flags = anchor_inside_flags(
split_approxs, split_valid_flags,
img_meta['img_shape'][:2],
self.train_cfg.allowed_border)
inside_flags_list.append(inside_flags)
# inside_flag for a position is true if any anchor in this
# position is true
inside_flags = (
torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
multi_level_flags.append(inside_flags)
inside_flag_list.append(multi_level_flags)
return approxs_list, inside_flag_list
def get_anchors(self,
featmap_sizes,
shape_preds,
loc_preds,
img_metas,
use_loc_filter=False,
device='cuda'):
"""Get squares according to feature map sizes and guided anchors.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
shape_preds (list[tensor]): Multi-level shape predictions.
loc_preds (list[tensor]): Multi-level location predictions.
img_metas (list[dict]): Image meta info.
use_loc_filter (bool): Use loc filter or not.
device (torch.device | str): device for returned tensors
Returns:
tuple: square approxs of each image, guided anchors of each image,
loc masks of each image
"""
num_imgs = len(img_metas)
num_levels = len(featmap_sizes)
# since feature map sizes of all images are the same, we only compute
# squares for one time
multi_level_squares = self.square_anchor_generator.grid_anchors(
featmap_sizes, device=device)
squares_list = [multi_level_squares for _ in range(num_imgs)]
# for each image, we compute multi level guided anchors
guided_anchors_list = []
loc_mask_list = []
for img_id, img_meta in enumerate(img_metas):
multi_level_guided_anchors = []
multi_level_loc_mask = []
for i in range(num_levels):
squares = squares_list[img_id][i]
shape_pred = shape_preds[i][img_id]
loc_pred = loc_preds[i][img_id]
guided_anchors, loc_mask = self._get_guided_anchors_single(
squares,
shape_pred,
loc_pred,
use_loc_filter=use_loc_filter)
multi_level_guided_anchors.append(guided_anchors)
multi_level_loc_mask.append(loc_mask)
guided_anchors_list.append(multi_level_guided_anchors)
loc_mask_list.append(multi_level_loc_mask)
return squares_list, guided_anchors_list, loc_mask_list
def _get_guided_anchors_single(self,
squares,
shape_pred,
loc_pred,
use_loc_filter=False):
"""Get guided anchors and loc masks for a single level.
Args:
square (tensor): Squares of a single level.
shape_pred (tensor): Shape predections of a single level.
loc_pred (tensor): Loc predections of a single level.
use_loc_filter (list[tensor]): Use loc filter or not.
Returns:
tuple: guided anchors, location masks
"""
# calculate location filtering mask
loc_pred = loc_pred.sigmoid().detach()
if use_loc_filter:
loc_mask = loc_pred >= self.loc_filter_thr
else:
loc_mask = loc_pred >= 0.0
mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_anchors)
mask = mask.contiguous().view(-1)
# calculate guided anchors
squares = squares[mask]
anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
-1, 2).detach()[mask]
bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
bbox_deltas[:, 2:] = anchor_deltas
guided_anchors = self.anchor_coder.decode(
squares, bbox_deltas, wh_ratio_clip=1e-6)
return guided_anchors, mask
def ga_loc_targets(self, gt_bboxes_list, featmap_sizes):
"""Compute location targets for guided anchoring.
Each feature map is divided into positive, negative and ignore regions.
- positive regions: target 1, weight 1
- ignore regions: target 0, weight 0
- negative regions: target 0, weight 0.1
Args:
gt_bboxes_list (list[Tensor]): Gt bboxes of each image.
featmap_sizes (list[tuple]): Multi level sizes of each feature
maps.
Returns:
tuple
"""
anchor_scale = self.approx_anchor_generator.octave_base_scale
anchor_strides = self.approx_anchor_generator.strides
# Currently only supports same stride in x and y direction.
for stride in anchor_strides:
assert (stride[0] == stride[1])
anchor_strides = [stride[0] for stride in anchor_strides]
center_ratio = self.train_cfg.center_ratio
ignore_ratio = self.train_cfg.ignore_ratio
img_per_gpu = len(gt_bboxes_list)
num_lvls = len(featmap_sizes)
r1 = (1 - center_ratio) / 2
r2 = (1 - ignore_ratio) / 2
all_loc_targets = []
all_loc_weights = []
all_ignore_map = []
for lvl_id in range(num_lvls):
h, w = featmap_sizes[lvl_id]
loc_targets = torch.zeros(
img_per_gpu,
1,
h,
w,
device=gt_bboxes_list[0].device,
dtype=torch.float32)
loc_weights = torch.full_like(loc_targets, -1)
ignore_map = torch.zeros_like(loc_targets)
all_loc_targets.append(loc_targets)
all_loc_weights.append(loc_weights)
all_ignore_map.append(ignore_map)
for img_id in range(img_per_gpu):
gt_bboxes = gt_bboxes_list[img_id]
scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
(gt_bboxes[:, 3] - gt_bboxes[:, 1]))
min_anchor_size = scale.new_full(
(1, ), float(anchor_scale * anchor_strides[0]))
# assign gt bboxes to different feature levels w.r.t. their scales
target_lvls = torch.floor(
torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
for gt_id in range(gt_bboxes.size(0)):
lvl = target_lvls[gt_id].item()
# rescaled to corresponding feature map
gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
# calculate ignore regions
ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
gt_, r2, featmap_sizes[lvl])
# calculate positive (center) regions
ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
gt_, r1, featmap_sizes[lvl])
all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
ctr_x1:ctr_x2 + 1] = 1
all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
ignore_x1:ignore_x2 + 1] = 0
all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
ctr_x1:ctr_x2 + 1] = 1
# calculate ignore map on nearby low level feature
if lvl > 0:
d_lvl = lvl - 1
# rescaled to corresponding feature map
gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
gt_, r2, featmap_sizes[d_lvl])
all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
ignore_x1:ignore_x2 + 1] = 1
# calculate ignore map on nearby high level feature
if lvl < num_lvls - 1:
u_lvl = lvl + 1
# rescaled to corresponding feature map
gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
gt_, r2, featmap_sizes[u_lvl])
all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
ignore_x1:ignore_x2 + 1] = 1
for lvl_id in range(num_lvls):
# ignore negative regions w.r.t. ignore map
all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
& (all_ignore_map[lvl_id] > 0)] = 0
# set negative regions with weight 0.1
all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
# loc average factor to balance loss
loc_avg_factor = sum(
[t.size(0) * t.size(-1) * t.size(-2)
for t in all_loc_targets]) / 200
return all_loc_targets, all_loc_weights, loc_avg_factor
def _ga_shape_target_single(self,
flat_approxs,
inside_flags,
flat_squares,
gt_bboxes,
gt_bboxes_ignore,
img_meta,
unmap_outputs=True):
"""Compute guided anchoring targets.
This function returns sampled anchors and gt bboxes directly
rather than calculates regression targets.
Args:
flat_approxs (Tensor): flat approxs of a single image,
shape (n, 4)
inside_flags (Tensor): inside flags of a single image,
shape (n, ).
flat_squares (Tensor): flat squares of a single image,
shape (approxs_per_octave * n, 4)
gt_bboxes (Tensor): Ground truth bboxes of a single image.
img_meta (dict): Meta info of a single image.
approxs_per_octave (int): number of approxs per octave
cfg (dict): RPN train configs.
unmap_outputs (bool): unmap outputs or not.
Returns:
tuple
"""
if not inside_flags.any():
return (None, ) * 5
# assign gt and sample anchors
expand_inside_flags = inside_flags[:, None].expand(
-1, self.approxs_per_octave).reshape(-1)
approxs = flat_approxs[expand_inside_flags, :]
squares = flat_squares[inside_flags, :]
assign_result = self.ga_assigner.assign(approxs, squares,
self.approxs_per_octave,
gt_bboxes, gt_bboxes_ignore)
sampling_result = self.ga_sampler.sample(assign_result, squares,
gt_bboxes)
bbox_anchors = torch.zeros_like(squares)
bbox_gts = torch.zeros_like(squares)
bbox_weights = torch.zeros_like(squares)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
if len(pos_inds) > 0:
bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
bbox_weights[pos_inds, :] = 1.0
# map up to original set of anchors
if unmap_outputs:
num_total_anchors = flat_squares.size(0)
bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags)
bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds)
def ga_shape_targets(self,
approx_list,
inside_flag_list,
square_list,
gt_bboxes_list,
img_metas,
gt_bboxes_ignore_list=None,
unmap_outputs=True):
"""Compute guided anchoring targets.
Args:
approx_list (list[list]): Multi level approxs of each image.
inside_flag_list (list[list]): Multi level inside flags of each
image.
square_list (list[list]): Multi level squares of each image.
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
img_metas (list[dict]): Meta info of each image.
gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes.
unmap_outputs (bool): unmap outputs or not.
Returns:
tuple
"""
num_imgs = len(img_metas)
assert len(approx_list) == len(inside_flag_list) == len(
square_list) == num_imgs
# anchor number of multi levels
num_level_squares = [squares.size(0) for squares in square_list[0]]
# concat all level anchors and flags to a single tensor
inside_flag_flat_list = []
approx_flat_list = []
square_flat_list = []
for i in range(num_imgs):
assert len(square_list[i]) == len(inside_flag_list[i])
inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
approx_flat_list.append(torch.cat(approx_list[i]))
square_flat_list.append(torch.cat(square_list[i]))
# compute targets for each image
if gt_bboxes_ignore_list is None:
gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
(all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list,
neg_inds_list) = multi_apply(
self._ga_shape_target_single,
approx_flat_list,
inside_flag_flat_list,
square_flat_list,
gt_bboxes_list,
gt_bboxes_ignore_list,
img_metas,
unmap_outputs=unmap_outputs)
# no valid anchors
if any([bbox_anchors is None for bbox_anchors in all_bbox_anchors]):
return None
# sampled anchors of all images
num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
# split targets to a list w.r.t. multiple levels
bbox_anchors_list = images_to_levels(all_bbox_anchors,
num_level_squares)
bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares)
bbox_weights_list = images_to_levels(all_bbox_weights,
num_level_squares)
return (bbox_anchors_list, bbox_gts_list, bbox_weights_list,
num_total_pos, num_total_neg)
def loss_shape_single(self, shape_pred, bbox_anchors, bbox_gts,
anchor_weights, anchor_total_num):
shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
bbox_gts = bbox_gts.contiguous().view(-1, 4)
anchor_weights = anchor_weights.contiguous().view(-1, 4)
bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
bbox_deltas[:, 2:] += shape_pred
# filter out negative samples to speed-up weighted_bounded_iou_loss
inds = torch.nonzero(
anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1)
bbox_deltas_ = bbox_deltas[inds]
bbox_anchors_ = bbox_anchors[inds]
bbox_gts_ = bbox_gts[inds]
anchor_weights_ = anchor_weights[inds]
pred_anchors_ = self.anchor_coder.decode(
bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6)
loss_shape = self.loss_shape(
pred_anchors_,
bbox_gts_,
anchor_weights_,
avg_factor=anchor_total_num)
return loss_shape
def loss_loc_single(self, loc_pred, loc_target, loc_weight,
loc_avg_factor):
loss_loc = self.loss_loc(
loc_pred.reshape(-1, 1),
loc_target.reshape(-1).long(),
loc_weight.reshape(-1),
avg_factor=loc_avg_factor)
return loss_loc
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds'))
def loss(self,
cls_scores,
bbox_preds,
shape_preds,
loc_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.approx_anchor_generator.num_levels
device = cls_scores[0].device
# get loc targets
loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets(
gt_bboxes, featmap_sizes)
# get sampled approxes
approxs_list, inside_flag_list = self.get_sampled_approxs(
featmap_sizes, img_metas, device=device)
# get squares and guided anchors
squares_list, guided_anchors_list, _ = self.get_anchors(
featmap_sizes, shape_preds, loc_preds, img_metas, device=device)
# get shape targets
shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list,
squares_list, gt_bboxes,
img_metas)
if shape_targets is None:
return None
(bbox_anchors_list, bbox_gts_list, anchor_weights_list, anchor_fg_num,
anchor_bg_num) = shape_targets
anchor_total_num = (
anchor_fg_num if not self.ga_sampling else anchor_fg_num +
anchor_bg_num)
# get anchor targets
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.get_targets(
guided_anchors_list,
inside_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
num_total_samples = (
num_total_pos + num_total_neg if self.sampling else num_total_pos)
# anchor number of multi levels
num_level_anchors = [
anchors.size(0) for anchors in guided_anchors_list[0]
]
# concat all level anchors to a single tensor
concat_anchor_list = []
for i in range(len(guided_anchors_list)):
concat_anchor_list.append(torch.cat(guided_anchors_list[i]))
all_anchor_list = images_to_levels(concat_anchor_list,
num_level_anchors)
# get classification and bbox regression losses
losses_cls, losses_bbox = multi_apply(
self.loss_single,
cls_scores,
bbox_preds,
all_anchor_list,
labels_list,
label_weights_list,
bbox_targets_list,
bbox_weights_list,
num_total_samples=num_total_samples)
# get anchor location loss
losses_loc = []
for i in range(len(loc_preds)):
loss_loc = self.loss_loc_single(
loc_preds[i],
loc_targets[i],
loc_weights[i],
loc_avg_factor=loc_avg_factor)
losses_loc.append(loss_loc)
# get anchor shape loss
losses_shape = []
for i in range(len(shape_preds)):
loss_shape = self.loss_shape_single(
shape_preds[i],
bbox_anchors_list[i],
bbox_gts_list[i],
anchor_weights_list[i],
anchor_total_num=anchor_total_num)
losses_shape.append(loss_shape)
return dict(
loss_cls=losses_cls,
loss_bbox=losses_bbox,
loss_shape=losses_shape,
loss_loc=losses_loc)
@force_fp32(
apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds'))
def get_bboxes(self,
cls_scores,
bbox_preds,
shape_preds,
loc_preds,
img_metas,
cfg=None,
rescale=False):
assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
loc_preds)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
device = cls_scores[0].device
# get guided anchors
_, guided_anchors, loc_masks = self.get_anchors(
featmap_sizes,
shape_preds,
loc_preds,
img_metas,
use_loc_filter=not self.training,
device=device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
guided_anchor_list = [
guided_anchors[img_id][i].detach() for i in range(num_levels)
]
loc_mask_list = [
loc_masks[img_id][i].detach() for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list,
guided_anchor_list,
loc_mask_list, img_shape,
scale_factor, cfg, rescale)
result_list.append(proposals)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_anchors,
mlvl_masks,
img_shape,
scale_factor,
cfg,
rescale=False):
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds,
mlvl_anchors,
mlvl_masks):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
# if no location is kept, end.
if mask.sum() == 0:
continue
# reshape scores and bbox_pred
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.cls_out_channels)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
# filter scores, bbox_pred w.r.t. mask.
# anchors are filtered in get_anchors() beforehand.
scores = scores[mask, :]
bbox_pred = bbox_pred[mask, :]
if scores.dim() == 0:
anchors = anchors.unsqueeze(0)
scores = scores.unsqueeze(0)
bbox_pred = bbox_pred.unsqueeze(0)
# filter anchors, bbox_pred, scores w.r.t. scores
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
# remind that we set FG labels to [0, num_class-1]
# since mmdet v2.0
# BG cat_id: num_class
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
bboxes = self.bbox_coder.decode(
anchors, bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
if self.use_sigmoid_cls:
# Add a dummy background class to the backend when using sigmoid
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
# multi class NMS
det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
cfg.score_thr, cfg.nms,
cfg.max_per_img)
return det_bboxes, det_labels
import copy
import torch.nn as nn
from mmcv.cnn import (ConvModule, Scale, bias_init_with_prob,
caffe2_xavier_init, normal_init)
from mmdet.models.dense_heads.fcos_head import FCOSHead
from ..builder import HEADS
@HEADS.register_module()
class NASFCOSHead(FCOSHead):
"""Anchor-free head used in `NASFCOS <https://arxiv.org/abs/1906.04423>`_.
It is quite similar with FCOS head, except for the searched structure of
classification branch and bbox regression branch, where a structure of
"dconv3x3, conv3x3, dconv3x3, conv1x1" is utilized instead.
"""
def _init_layers(self):
"""Initialize layers of the head."""
dconv3x3_config = dict(
type='DCNv2',
kernel_size=3,
use_bias=True,
deform_groups=2,
padding=1)
conv3x3_config = dict(type='Conv', kernel_size=3, padding=1)
conv1x1_config = dict(type='Conv', kernel_size=1)
self.arch_config = [
dconv3x3_config, conv3x3_config, dconv3x3_config, conv1x1_config
]
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i, op_ in enumerate(self.arch_config):
op = copy.deepcopy(op_)
chn = self.in_channels if i == 0 else self.feat_channels
assert isinstance(op, dict)
use_bias = op.pop('use_bias', False)
padding = op.pop('padding', 0)
kernel_size = op.pop('kernel_size')
module = ConvModule(
chn,
self.feat_channels,
kernel_size,
stride=1,
padding=padding,
norm_cfg=self.norm_cfg,
bias=use_bias,
conv_cfg=op)
self.cls_convs.append(copy.deepcopy(module))
self.reg_convs.append(copy.deepcopy(module))
self.conv_cls = nn.Conv2d(
self.feat_channels, self.cls_out_channels, 3, padding=1)
self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
def init_weights(self):
"""Initialize weights of the head."""
# retinanet_bias_init
bias_cls = bias_init_with_prob(0.01)
normal_init(self.conv_reg, std=0.01)
normal_init(self.conv_centerness, std=0.01)
normal_init(self.conv_cls, std=0.01, bias=bias_cls)
for branch in [self.cls_convs, self.reg_convs]:
for module in branch.modules():
if isinstance(module, ConvModule) \
and isinstance(module.conv, nn.Conv2d):
caffe2_xavier_init(module.conv)
import numpy as np
import torch
from mmcv.runner import force_fp32
from mmdet.core import multi_apply, multiclass_nms
from mmdet.core.bbox.iou_calculators import bbox_overlaps
from mmdet.models import HEADS
from mmdet.models.dense_heads import ATSSHead
EPS = 1e-12
try:
import sklearn.mixture as skm
except ImportError:
skm = None
def levels_to_images(mlvl_tensor):
"""Concat multi-level feature maps by image.
[feature_level0, feature_level1...] -> [feature_image0, feature_image1...]
Convert the shape of each element in mlvl_tensor from (N, C, H, W) to
(N, H*W , C), then split the element to N elements with shape (H*W, C), and
concat elements in same image of all level along first dimension.
Args:
mlvl_tensor (list[torch.Tensor]): list of Tensor which collect from
corresponding level. Each element is of shape (N, C, H, W)
Returns:
list[torch.Tensor]: A list that contains N tensors and each tensor is
of shape (num_elements, C)
"""
batch_size = mlvl_tensor[0].size(0)
batch_list = [[] for _ in range(batch_size)]
channels = mlvl_tensor[0].size(1)
for t in mlvl_tensor:
t = t.permute(0, 2, 3, 1)
t = t.view(batch_size, -1, channels).contiguous()
for img in range(batch_size):
batch_list[img].append(t[img])
return [torch.cat(item, 0) for item in batch_list]
@HEADS.register_module()
class PAAHead(ATSSHead):
"""Head of PAAAssignment: Probabilistic Anchor Assignment with IoU
Prediction for Object Detection.
Code is modified from the `official github repo
<https://github.com/kkhoot/PAA/blob/master/paa_core
/modeling/rpn/paa/loss.py>`_.
More details can be found in the `paper
<https://arxiv.org/abs/2007.08103>`_ .
Args:
topk (int): Select topk samples with smallest loss in
each level.
score_voting (bool): Whether to use score voting in post-process.
covariance_type : String describing the type of covariance parameters
to be used in :class:`sklearn.mixture.GaussianMixture`.
It must be one of:
- 'full': each component has its own general covariance matrix
- 'tied': all components share the same general covariance matrix
- 'diag': each component has its own diagonal covariance matrix
- 'spherical': each component has its own single variance
Default: 'diag'. From 'full' to 'spherical', the gmm fitting
process is faster yet the performance could be influenced. For most
cases, 'diag' should be a good choice.
"""
def __init__(self,
*args,
topk=9,
score_voting=True,
covariance_type='diag',
**kwargs):
# topk used in paa reassign process
self.topk = topk
self.with_score_voting = score_voting
self.covariance_type = covariance_type
super(PAAHead, self).__init__(*args, **kwargs)
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'iou_preds'))
def loss(self,
cls_scores,
bbox_preds,
iou_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute losses of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
iou_preds (list[Tensor]): iou_preds for each scale
level with shape (N, num_anchors * 1, H, W)
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (list[Tensor] | None): Specify which bounding
boxes can be ignored when are computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss gmm_assignment.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.get_targets(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels,
)
(labels, labels_weight, bboxes_target, bboxes_weight, pos_inds,
pos_gt_index) = cls_reg_targets
cls_scores = levels_to_images(cls_scores)
cls_scores = [
item.reshape(-1, self.cls_out_channels) for item in cls_scores
]
bbox_preds = levels_to_images(bbox_preds)
bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
iou_preds = levels_to_images(iou_preds)
iou_preds = [item.reshape(-1, 1) for item in iou_preds]
pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list,
cls_scores, bbox_preds, labels,
labels_weight, bboxes_target,
bboxes_weight, pos_inds)
with torch.no_grad():
reassign_labels, reassign_label_weight, \
reassign_bbox_weights, num_pos = multi_apply(
self.paa_reassign,
pos_losses_list,
labels,
labels_weight,
bboxes_weight,
pos_inds,
pos_gt_index,
anchor_list)
num_pos = sum(num_pos)
# convert all tensor list to a flatten tensor
cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1))
bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1))
iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1))
labels = torch.cat(reassign_labels, 0).view(-1)
flatten_anchors = torch.cat(
[torch.cat(item, 0) for item in anchor_list])
labels_weight = torch.cat(reassign_label_weight, 0).view(-1)
bboxes_target = torch.cat(bboxes_target,
0).view(-1, bboxes_target[0].size(-1))
pos_inds_flatten = ((labels >= 0)
&
(labels < self.num_classes)).nonzero().reshape(-1)
losses_cls = self.loss_cls(
cls_scores,
labels,
labels_weight,
avg_factor=max(num_pos, len(img_metas))) # avoid num_pos=0
if num_pos:
pos_bbox_pred = self.bbox_coder.decode(
flatten_anchors[pos_inds_flatten],
bbox_preds[pos_inds_flatten])
pos_bbox_target = bboxes_target[pos_inds_flatten]
iou_target = bbox_overlaps(
pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True)
losses_iou = self.loss_centerness(
iou_preds[pos_inds_flatten],
iou_target.unsqueeze(-1),
avg_factor=num_pos)
losses_bbox = self.loss_bbox(
pos_bbox_pred,
pos_bbox_target,
iou_target.clamp(min=EPS),
avg_factor=iou_target.sum())
else:
losses_iou = iou_preds.sum() * 0
losses_bbox = bbox_preds.sum() * 0
return dict(
loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou)
def get_pos_loss(self, anchors, cls_score, bbox_pred, label, label_weight,
bbox_target, bbox_weight, pos_inds):
"""Calculate loss of all potential positive samples obtained from first
match process.
Args:
anchors (list[Tensor]): Anchors of each scale.
cls_score (Tensor): Box scores of single image with shape
(num_anchors, num_classes)
bbox_pred (Tensor): Box energies / deltas of single image
with shape (num_anchors, 4)
label (Tensor): classification target of each anchor with
shape (num_anchors,)
label_weight (Tensor): Classification loss weight of each
anchor with shape (num_anchors).
bbox_target (dict): Regression target of each anchor with
shape (num_anchors, 4).
bbox_weight (Tensor): Bbox weight of each anchor with shape
(num_anchors, 4).
pos_inds (Tensor): Index of all positive samples got from
first assign process.
Returns:
Tensor: Losses of all positive samples in single image.
"""
if not len(pos_inds):
return cls_score.new([]),
anchors_all_level = torch.cat(anchors, 0)
pos_scores = cls_score[pos_inds]
pos_bbox_pred = bbox_pred[pos_inds]
pos_label = label[pos_inds]
pos_label_weight = label_weight[pos_inds]
pos_bbox_target = bbox_target[pos_inds]
pos_bbox_weight = bbox_weight[pos_inds]
pos_anchors = anchors_all_level[pos_inds]
pos_bbox_pred = self.bbox_coder.decode(pos_anchors, pos_bbox_pred)
# to keep loss dimension
loss_cls = self.loss_cls(
pos_scores,
pos_label,
pos_label_weight,
avg_factor=self.loss_cls.loss_weight,
reduction_override='none')
loss_bbox = self.loss_bbox(
pos_bbox_pred,
pos_bbox_target,
pos_bbox_weight,
avg_factor=self.loss_cls.loss_weight,
reduction_override='none')
loss_cls = loss_cls.sum(-1)
pos_loss = loss_bbox + loss_cls
return pos_loss,
def paa_reassign(self, pos_losses, label, label_weight, bbox_weight,
pos_inds, pos_gt_inds, anchors):
"""Fit loss to GMM distribution and separate positive, ignore, negative
samples again with GMM model.
Args:
pos_losses (Tensor): Losses of all positive samples in
single image.
label (Tensor): classification target of each anchor with
shape (num_anchors,)
label_weight (Tensor): Classification loss weight of each
anchor with shape (num_anchors).
bbox_weight (Tensor): Bbox weight of each anchor with shape
(num_anchors, 4).
pos_inds (Tensor): Index of all positive samples got from
first assign process.
pos_gt_inds (Tensor): Gt_index of all positive samples got
from first assign process.
anchors (list[Tensor]): Anchors of each scale.
Returns:
tuple: Usually returns a tuple containing learning targets.
- label (Tensor): classification target of each anchor after
paa assign, with shape (num_anchors,)
- label_weight (Tensor): Classification loss weight of each
anchor after paa assign, with shape (num_anchors).
- bbox_weight (Tensor): Bbox weight of each anchor with shape
(num_anchors, 4).
- num_pos (int): The number of positive samples after paa
assign.
"""
if not len(pos_inds):
return label, label_weight, bbox_weight, 0
label = label.clone()
label_weight = label_weight.clone()
bbox_weight = bbox_weight.clone()
num_gt = pos_gt_inds.max() + 1
num_level = len(anchors)
num_anchors_each_level = [item.size(0) for item in anchors]
num_anchors_each_level.insert(0, 0)
inds_level_interval = np.cumsum(num_anchors_each_level)
pos_level_mask = []
for i in range(num_level):
mask = (pos_inds >= inds_level_interval[i]) & (
pos_inds < inds_level_interval[i + 1])
pos_level_mask.append(mask)
pos_inds_after_paa = [label.new_tensor([])]
ignore_inds_after_paa = [label.new_tensor([])]
for gt_ind in range(num_gt):
pos_inds_gmm = []
pos_loss_gmm = []
gt_mask = pos_gt_inds == gt_ind
for level in range(num_level):
level_mask = pos_level_mask[level]
level_gt_mask = level_mask & gt_mask
value, topk_inds = pos_losses[level_gt_mask].topk(
min(level_gt_mask.sum(), self.topk), largest=False)
pos_inds_gmm.append(pos_inds[level_gt_mask][topk_inds])
pos_loss_gmm.append(value)
pos_inds_gmm = torch.cat(pos_inds_gmm)
pos_loss_gmm = torch.cat(pos_loss_gmm)
# fix gmm need at least two sample
if len(pos_inds_gmm) < 2:
continue
device = pos_inds_gmm.device
pos_loss_gmm, sort_inds = pos_loss_gmm.sort()
pos_inds_gmm = pos_inds_gmm[sort_inds]
pos_loss_gmm = pos_loss_gmm.view(-1, 1).cpu().numpy()
min_loss, max_loss = pos_loss_gmm.min(), pos_loss_gmm.max()
means_init = np.array([min_loss, max_loss]).reshape(2, 1)
weights_init = np.array([0.5, 0.5])
precisions_init = np.array([1.0, 1.0]).reshape(2, 1, 1) # full
if self.covariance_type == 'spherical':
precisions_init = precisions_init.reshape(2)
elif self.covariance_type == 'diag':
precisions_init = precisions_init.reshape(2, 1)
elif self.covariance_type == 'tied':
precisions_init = np.array([[1.0]])
if skm is None:
raise ImportError('Please run "pip install sklearn" '
'to install sklearn first.')
gmm = skm.GaussianMixture(
2,
weights_init=weights_init,
means_init=means_init,
precisions_init=precisions_init,
covariance_type=self.covariance_type)
gmm.fit(pos_loss_gmm)
gmm_assignment = gmm.predict(pos_loss_gmm)
scores = gmm.score_samples(pos_loss_gmm)
gmm_assignment = torch.from_numpy(gmm_assignment).to(device)
scores = torch.from_numpy(scores).to(device)
pos_inds_temp, ignore_inds_temp = self.gmm_separation_scheme(
gmm_assignment, scores, pos_inds_gmm)
pos_inds_after_paa.append(pos_inds_temp)
ignore_inds_after_paa.append(ignore_inds_temp)
pos_inds_after_paa = torch.cat(pos_inds_after_paa)
ignore_inds_after_paa = torch.cat(ignore_inds_after_paa)
reassign_mask = (pos_inds.unsqueeze(1) != pos_inds_after_paa).all(1)
reassign_ids = pos_inds[reassign_mask]
label[reassign_ids] = self.num_classes
label_weight[ignore_inds_after_paa] = 0
bbox_weight[reassign_ids] = 0
num_pos = len(pos_inds_after_paa)
return label, label_weight, bbox_weight, num_pos
def gmm_separation_scheme(self, gmm_assignment, scores, pos_inds_gmm):
"""A general separation scheme for gmm model.
It separates a GMM distribution of candidate samples into three
parts, 0 1 and uncertain areas, and you can implement other
separation schemes by rewriting this function.
Args:
gmm_assignment (Tensor): The prediction of GMM which is of shape
(num_samples,). The 0/1 value indicates the distribution
that each sample comes from.
scores (Tensor): The probability of sample coming from the
fit GMM distribution. The tensor is of shape (num_samples,).
pos_inds_gmm (Tensor): All the indexes of samples which are used
to fit GMM model. The tensor is of shape (num_samples,)
Returns:
tuple[Tensor]: The indices of positive and ignored samples.
- pos_inds_temp (Tensor): Indices of positive samples.
- ignore_inds_temp (Tensor): Indices of ignore samples.
"""
# The implementation is (c) in Fig.3 in origin paper intead of (b).
# You can refer to issues such as
# https://github.com/kkhoot/PAA/issues/8 and
# https://github.com/kkhoot/PAA/issues/9.
fgs = gmm_assignment == 0
pos_inds_temp = fgs.new_tensor([], dtype=torch.long)
ignore_inds_temp = fgs.new_tensor([], dtype=torch.long)
if fgs.nonzero().numel():
_, pos_thr_ind = scores[fgs].topk(1)
pos_inds_temp = pos_inds_gmm[fgs][:pos_thr_ind + 1]
ignore_inds_temp = pos_inds_gmm.new_tensor([])
return pos_inds_temp, ignore_inds_temp
def get_targets(
self,
anchor_list,
valid_flag_list,
gt_bboxes_list,
img_metas,
gt_bboxes_ignore_list=None,
gt_labels_list=None,
label_channels=1,
unmap_outputs=True,
):
"""Get targets for PAA head.
This method is almost the same as `AnchorHead.get_targets()`. We direct
return the results from _get_targets_single instead map it to levels
by images_to_levels function.
Args:
anchor_list (list[list[Tensor]]): Multi level anchors of each
image. The outer list indicates images, and the inner list
corresponds to feature levels of the image. Each element of
the inner list is a tensor of shape (num_anchors, 4).
valid_flag_list (list[list[Tensor]]): Multi level valid flags of
each image. The outer list indicates images, and the inner list
corresponds to feature levels of the image. Each element of
the inner list is a tensor of shape (num_anchors, )
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
img_metas (list[dict]): Meta info of each image.
gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
ignored.
gt_labels_list (list[Tensor]): Ground truth labels of each box.
label_channels (int): Channel of label.
unmap_outputs (bool): Whether to map outputs back to the original
set of anchors.
Returns:
tuple: Usually returns a tuple containing learning targets.
- labels (list[Tensor]): Labels of all anchors, each with
shape (num_anchors,).
- label_weights (list[Tensor]): Label weights of all anchor.
each with shape (num_anchors,).
- bbox_targets (list[Tensor]): BBox targets of all anchors.
each with shape (num_anchors, 4).
- bbox_weights (list[Tensor]): BBox weights of all anchors.
each with shape (num_anchors, 4).
- pos_inds (list[Tensor]): Contains all index of positive
sample in all anchor.
- gt_inds (list[Tensor]): Contains all gt_index of positive
sample in all anchor.
"""
num_imgs = len(img_metas)
assert len(anchor_list) == len(valid_flag_list) == num_imgs
concat_anchor_list = []
concat_valid_flag_list = []
for i in range(num_imgs):
assert len(anchor_list[i]) == len(valid_flag_list[i])
concat_anchor_list.append(torch.cat(anchor_list[i]))
concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))
# compute targets for each image
if gt_bboxes_ignore_list is None:
gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
if gt_labels_list is None:
gt_labels_list = [None for _ in range(num_imgs)]
results = multi_apply(
self._get_targets_single,
concat_anchor_list,
concat_valid_flag_list,
gt_bboxes_list,
gt_bboxes_ignore_list,
gt_labels_list,
img_metas,
label_channels=label_channels,
unmap_outputs=unmap_outputs)
(labels, label_weights, bbox_targets, bbox_weights, valid_pos_inds,
valid_neg_inds, sampling_result) = results
# Due to valid flag of anchors, we have to calculate the real pos_inds
# in origin anchor set.
pos_inds = []
for i, single_labels in enumerate(labels):
pos_mask = (0 <= single_labels) & (
single_labels < self.num_classes)
pos_inds.append(pos_mask.nonzero().view(-1))
gt_inds = [item.pos_assigned_gt_inds for item in sampling_result]
return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
gt_inds)
def _get_targets_single(self,
flat_anchors,
valid_flags,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
img_meta,
label_channels=1,
unmap_outputs=True):
"""Compute regression and classification targets for anchors in a
single image.
This method is same as `AnchorHead._get_targets_single()`.
"""
assert unmap_outputs, 'We must map outputs back to the original' \
'set of anchors in PAAhead'
return super(ATSSHead, self)._get_targets_single(
flat_anchors,
valid_flags,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
img_meta,
label_channels=1,
unmap_outputs=True)
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
iou_preds,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False,
with_nms=True):
"""Transform outputs for a single batch item into labeled boxes.
This method is almost same as `ATSSHead._get_bboxes_single()`.
We use sqrt(iou_preds * cls_scores) in NMS process instead of just
cls_scores. Besides, score voting is used when `` score_voting``
is set to True.
"""
assert with_nms, 'PAA only supports "with_nms=True" now'
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
mlvl_iou_preds = []
for cls_score, bbox_pred, iou_preds, anchors in zip(
cls_scores, bbox_preds, iou_preds, mlvl_anchors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
iou_preds = iou_preds.permute(1, 2, 0).reshape(-1).sigmoid()
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
max_scores, _ = (scores * iou_preds[:, None]).sqrt().max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
iou_preds = iou_preds[topk_inds]
bboxes = self.bbox_coder.decode(
anchors, bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_iou_preds.append(iou_preds)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
# Add a dummy background class to the backend when using sigmoid
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
mlvl_iou_preds = torch.cat(mlvl_iou_preds)
mlvl_nms_scores = (mlvl_scores * mlvl_iou_preds[:, None]).sqrt()
det_bboxes, det_labels = multiclass_nms(
mlvl_bboxes,
mlvl_nms_scores,
cfg.score_thr,
cfg.nms,
cfg.max_per_img,
score_factors=None)
if self.with_score_voting and len(det_bboxes) > 0:
det_bboxes, det_labels = self.score_voting(det_bboxes, det_labels,
mlvl_bboxes,
mlvl_nms_scores,
cfg.score_thr)
return det_bboxes, det_labels
def score_voting(self, det_bboxes, det_labels, mlvl_bboxes,
mlvl_nms_scores, score_thr):
"""Implementation of score voting method works on each remaining boxes
after NMS procedure.
Args:
det_bboxes (Tensor): Remaining boxes after NMS procedure,
with shape (k, 5), each dimension means
(x1, y1, x2, y2, score).
det_labels (Tensor): The label of remaining boxes, with shape
(k, 1),Labels are 0-based.
mlvl_bboxes (Tensor): All boxes before the NMS procedure,
with shape (num_anchors,4).
mlvl_nms_scores (Tensor): The scores of all boxes which is used
in the NMS procedure, with shape (num_anchors, num_class)
mlvl_iou_preds (Tensot): The predictions of IOU of all boxes
before the NMS procedure, with shape (num_anchors, 1)
score_thr (float): The score threshold of bboxes.
Returns:
tuple: Usually returns a tuple containing voting results.
- det_bboxes_voted (Tensor): Remaining boxes after
score voting procedure, with shape (k, 5), each
dimension means (x1, y1, x2, y2, score).
- det_labels_voted (Tensor): Label of remaining bboxes
after voting, with shape (num_anchors,).
"""
candidate_mask = mlvl_nms_scores > score_thr
candidate_mask_nozeros = candidate_mask.nonzero()
candidate_inds = candidate_mask_nozeros[:, 0]
candidate_labels = candidate_mask_nozeros[:, 1]
candidate_bboxes = mlvl_bboxes[candidate_inds]
candidate_scores = mlvl_nms_scores[candidate_mask]
det_bboxes_voted = []
det_labels_voted = []
for cls in range(self.cls_out_channels):
candidate_cls_mask = candidate_labels == cls
if not candidate_cls_mask.any():
continue
candidate_cls_scores = candidate_scores[candidate_cls_mask]
candidate_cls_bboxes = candidate_bboxes[candidate_cls_mask]
det_cls_mask = det_labels == cls
det_cls_bboxes = det_bboxes[det_cls_mask].view(
-1, det_bboxes.size(-1))
det_candidate_ious = bbox_overlaps(det_cls_bboxes[:, :4],
candidate_cls_bboxes)
for det_ind in range(len(det_cls_bboxes)):
single_det_ious = det_candidate_ious[det_ind]
pos_ious_mask = single_det_ious > 0.01
pos_ious = single_det_ious[pos_ious_mask]
pos_bboxes = candidate_cls_bboxes[pos_ious_mask]
pos_scores = candidate_cls_scores[pos_ious_mask]
pis = (torch.exp(-(1 - pos_ious)**2 / 0.025) *
pos_scores)[:, None]
voted_box = torch.sum(
pis * pos_bboxes, dim=0) / torch.sum(
pis, dim=0)
voted_score = det_cls_bboxes[det_ind][-1:][None, :]
det_bboxes_voted.append(
torch.cat((voted_box[None, :], voted_score), dim=1))
det_labels_voted.append(cls)
det_bboxes_voted = torch.cat(det_bboxes_voted, dim=0)
det_labels_voted = det_labels.new_tensor(det_labels_voted)
return det_bboxes_voted, det_labels_voted
import torch
from mmcv.runner import force_fp32
from mmdet.core import images_to_levels
from ..builder import HEADS
from ..losses import carl_loss, isr_p
from .retina_head import RetinaHead
@HEADS.register_module()
class PISARetinaHead(RetinaHead):
"""PISA Retinanet Head.
The head owns the same structure with Retinanet Head, but differs in two
aspects:
1. Importance-based Sample Reweighting Positive (ISR-P) is applied to
change the positive loss weights.
2. Classification-aware regression loss is adopted as a third loss.
"""
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute losses of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
gt_bboxes (list[Tensor]): Ground truth bboxes of each image
with shape (num_obj, 4).
gt_labels (list[Tensor]): Ground truth labels of each image
with shape (num_obj, 4).
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (list[Tensor]): Ignored gt bboxes of each image.
Default: None.
Returns:
dict: Loss dict, comprise classification loss, regression loss and
carl loss.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.get_targets(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels,
return_sampling_results=True)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg, sampling_results_list) = cls_reg_targets
num_total_samples = (
num_total_pos + num_total_neg if self.sampling else num_total_pos)
# anchor number of multi levels
num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
# concat all level anchors and flags to a single tensor
concat_anchor_list = []
for i in range(len(anchor_list)):
concat_anchor_list.append(torch.cat(anchor_list[i]))
all_anchor_list = images_to_levels(concat_anchor_list,
num_level_anchors)
num_imgs = len(img_metas)
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, label_channels)
for cls_score in cls_scores
]
flatten_cls_scores = torch.cat(
flatten_cls_scores, dim=1).reshape(-1,
flatten_cls_scores[0].size(-1))
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
for bbox_pred in bbox_preds
]
flatten_bbox_preds = torch.cat(
flatten_bbox_preds, dim=1).view(-1, flatten_bbox_preds[0].size(-1))
flatten_labels = torch.cat(labels_list, dim=1).reshape(-1)
flatten_label_weights = torch.cat(
label_weights_list, dim=1).reshape(-1)
flatten_anchors = torch.cat(all_anchor_list, dim=1).reshape(-1, 4)
flatten_bbox_targets = torch.cat(
bbox_targets_list, dim=1).reshape(-1, 4)
flatten_bbox_weights = torch.cat(
bbox_weights_list, dim=1).reshape(-1, 4)
# Apply ISR-P
isr_cfg = self.train_cfg.get('isr', None)
if isr_cfg is not None:
all_targets = (flatten_labels, flatten_label_weights,
flatten_bbox_targets, flatten_bbox_weights)
with torch.no_grad():
all_targets = isr_p(
flatten_cls_scores,
flatten_bbox_preds,
all_targets,
flatten_anchors,
sampling_results_list,
bbox_coder=self.bbox_coder,
loss_cls=self.loss_cls,
num_class=self.num_classes,
**self.train_cfg.isr)
(flatten_labels, flatten_label_weights, flatten_bbox_targets,
flatten_bbox_weights) = all_targets
# For convenience we compute loss once instead separating by fpn level,
# so that we don't need to separate the weights by level again.
# The result should be the same
losses_cls = self.loss_cls(
flatten_cls_scores,
flatten_labels,
flatten_label_weights,
avg_factor=num_total_samples)
losses_bbox = self.loss_bbox(
flatten_bbox_preds,
flatten_bbox_targets,
flatten_bbox_weights,
avg_factor=num_total_samples)
loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
# CARL Loss
carl_cfg = self.train_cfg.get('carl', None)
if carl_cfg is not None:
loss_carl = carl_loss(
flatten_cls_scores,
flatten_labels,
flatten_bbox_preds,
flatten_bbox_targets,
self.loss_bbox,
**self.train_cfg.carl,
avg_factor=num_total_pos,
sigmoid=True,
num_class=self.num_classes)
loss_dict.update(loss_carl)
return loss_dict
import torch
from mmdet.core import multi_apply
from ..builder import HEADS
from ..losses import CrossEntropyLoss, SmoothL1Loss, carl_loss, isr_p
from .ssd_head import SSDHead
# TODO: add loss evaluator for SSD
@HEADS.register_module()
class PISASSDHead(SSDHead):
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute losses of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
gt_bboxes (list[Tensor]): Ground truth bboxes of each image
with shape (num_obj, 4).
gt_labels (list[Tensor]): Ground truth labels of each image
with shape (num_obj, 4).
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (list[Tensor]): Ignored gt bboxes of each image.
Default: None.
Returns:
dict: Loss dict, comprise classification loss regression loss and
carl loss.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
cls_reg_targets = self.get_targets(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=1,
unmap_outputs=False,
return_sampling_results=True)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg, sampling_results_list) = cls_reg_targets
num_images = len(img_metas)
all_cls_scores = torch.cat([
s.permute(0, 2, 3, 1).reshape(
num_images, -1, self.cls_out_channels) for s in cls_scores
], 1)
all_labels = torch.cat(labels_list, -1).view(num_images, -1)
all_label_weights = torch.cat(label_weights_list,
-1).view(num_images, -1)
all_bbox_preds = torch.cat([
b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
for b in bbox_preds
], -2)
all_bbox_targets = torch.cat(bbox_targets_list,
-2).view(num_images, -1, 4)
all_bbox_weights = torch.cat(bbox_weights_list,
-2).view(num_images, -1, 4)
# concat all level anchors to a single tensor
all_anchors = []
for i in range(num_images):
all_anchors.append(torch.cat(anchor_list[i]))
isr_cfg = self.train_cfg.get('isr', None)
all_targets = (all_labels.view(-1), all_label_weights.view(-1),
all_bbox_targets.view(-1,
4), all_bbox_weights.view(-1, 4))
# apply ISR-P
if isr_cfg is not None:
all_targets = isr_p(
all_cls_scores.view(-1, all_cls_scores.size(-1)),
all_bbox_preds.view(-1, 4),
all_targets,
torch.cat(all_anchors),
sampling_results_list,
loss_cls=CrossEntropyLoss(),
bbox_coder=self.bbox_coder,
**self.train_cfg.isr,
num_class=self.num_classes)
(new_labels, new_label_weights, new_bbox_targets,
new_bbox_weights) = all_targets
all_labels = new_labels.view(all_labels.shape)
all_label_weights = new_label_weights.view(all_label_weights.shape)
all_bbox_targets = new_bbox_targets.view(all_bbox_targets.shape)
all_bbox_weights = new_bbox_weights.view(all_bbox_weights.shape)
# add CARL loss
carl_loss_cfg = self.train_cfg.get('carl', None)
if carl_loss_cfg is not None:
loss_carl = carl_loss(
all_cls_scores.view(-1, all_cls_scores.size(-1)),
all_targets[0],
all_bbox_preds.view(-1, 4),
all_targets[2],
SmoothL1Loss(beta=1.),
**self.train_cfg.carl,
avg_factor=num_total_pos,
num_class=self.num_classes)
# check NaN and Inf
assert torch.isfinite(all_cls_scores).all().item(), \
'classification scores become infinite or NaN!'
assert torch.isfinite(all_bbox_preds).all().item(), \
'bbox predications become infinite or NaN!'
losses_cls, losses_bbox = multi_apply(
self.loss_single,
all_cls_scores,
all_bbox_preds,
all_anchors,
all_labels,
all_label_weights,
all_bbox_targets,
all_bbox_weights,
num_total_samples=num_total_pos)
loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
if carl_loss_cfg is not None:
loss_dict.update(loss_carl)
return loss_dict
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.ops import DeformConv2d
from mmdet.core import (PointGenerator, build_assigner, build_sampler,
images_to_levels, multi_apply, multiclass_nms, unmap)
from ..builder import HEADS, build_loss
from .anchor_free_head import AnchorFreeHead
@HEADS.register_module()
class RepPointsHead(AnchorFreeHead):
"""RepPoint head.
Args:
point_feat_channels (int): Number of channels of points features.
gradient_mul (float): The multiplier to gradients from
points refinement and recognition.
point_strides (Iterable): points strides.
point_base_scale (int): bbox scale for assigning labels.
loss_cls (dict): Config of classification loss.
loss_bbox_init (dict): Config of initial points loss.
loss_bbox_refine (dict): Config of points loss in refinement.
use_grid_points (bool): If we use bounding box representation, the
reppoints is represented as grid points on the bounding box.
center_init (bool): Whether to use center point assignment.
transform_method (str): The methods to transform RepPoints to bbox.
""" # noqa: W605
def __init__(self,
num_classes,
in_channels,
point_feat_channels=256,
num_points=9,
gradient_mul=0.1,
point_strides=[8, 16, 32, 64, 128],
point_base_scale=4,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox_init=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5),
loss_bbox_refine=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
use_grid_points=False,
center_init=True,
transform_method='moment',
moment_mul=0.01,
**kwargs):
self.num_points = num_points
self.point_feat_channels = point_feat_channels
self.use_grid_points = use_grid_points
self.center_init = center_init
# we use deform conv to extract points features
self.dcn_kernel = int(np.sqrt(num_points))
self.dcn_pad = int((self.dcn_kernel - 1) / 2)
assert self.dcn_kernel * self.dcn_kernel == num_points, \
'The points number should be a square number.'
assert self.dcn_kernel % 2 == 1, \
'The points number should be an odd square number.'
dcn_base = np.arange(-self.dcn_pad,
self.dcn_pad + 1).astype(np.float64)
dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
(-1))
self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)
super().__init__(num_classes, in_channels, loss_cls=loss_cls, **kwargs)
self.gradient_mul = gradient_mul
self.point_base_scale = point_base_scale
self.point_strides = point_strides
self.point_generators = [PointGenerator() for _ in self.point_strides]
self.sampling = loss_cls['type'] not in ['FocalLoss']
if self.train_cfg:
self.init_assigner = build_assigner(self.train_cfg.init.assigner)
self.refine_assigner = build_assigner(
self.train_cfg.refine.assigner)
# use PseudoSampler when sampling is False
if self.sampling and hasattr(self.train_cfg, 'sampler'):
sampler_cfg = self.train_cfg.sampler
else:
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.transform_method = transform_method
if self.transform_method == 'moment':
self.moment_transfer = nn.Parameter(
data=torch.zeros(2), requires_grad=True)
self.moment_mul = moment_mul
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
if self.use_sigmoid_cls:
self.cls_out_channels = self.num_classes
else:
self.cls_out_channels = self.num_classes + 1
self.loss_bbox_init = build_loss(loss_bbox_init)
self.loss_bbox_refine = build_loss(loss_bbox_refine)
def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points
self.reppoints_cls_conv = DeformConv2d(self.feat_channels,
self.point_feat_channels,
self.dcn_kernel, 1,
self.dcn_pad)
self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels,
self.cls_out_channels, 1, 1, 0)
self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels,
self.point_feat_channels, 3,
1, 1)
self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels,
pts_out_dim, 1, 1, 0)
self.reppoints_pts_refine_conv = DeformConv2d(self.feat_channels,
self.point_feat_channels,
self.dcn_kernel, 1,
self.dcn_pad)
self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels,
pts_out_dim, 1, 1, 0)
def init_weights(self):
"""Initialize weights of the head."""
for m in self.cls_convs:
normal_init(m.conv, std=0.01)
for m in self.reg_convs:
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.reppoints_cls_conv, std=0.01)
normal_init(self.reppoints_cls_out, std=0.01, bias=bias_cls)
normal_init(self.reppoints_pts_init_conv, std=0.01)
normal_init(self.reppoints_pts_init_out, std=0.01)
normal_init(self.reppoints_pts_refine_conv, std=0.01)
normal_init(self.reppoints_pts_refine_out, std=0.01)
def points2bbox(self, pts, y_first=True):
"""Converting the points set into bounding box.
:param pts: the input points sets (fields), each points
set (fields) is represented as 2n scalar.
:param y_first: if y_fisrt=True, the point set is represented as
[y1, x1, y2, x2 ... yn, xn], otherwise the point set is
represented as [x1, y1, x2, y2 ... xn, yn].
:return: each points set is converting to a bbox [x1, y1, x2, y2].
"""
pts_reshape = pts.view(pts.shape[0], -1, 2, *pts.shape[2:])
pts_y = pts_reshape[:, :, 0, ...] if y_first else pts_reshape[:, :, 1,
...]
pts_x = pts_reshape[:, :, 1, ...] if y_first else pts_reshape[:, :, 0,
...]
if self.transform_method == 'minmax':
bbox_left = pts_x.min(dim=1, keepdim=True)[0]
bbox_right = pts_x.max(dim=1, keepdim=True)[0]
bbox_up = pts_y.min(dim=1, keepdim=True)[0]
bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
dim=1)
elif self.transform_method == 'partial_minmax':
pts_y = pts_y[:, :4, ...]
pts_x = pts_x[:, :4, ...]
bbox_left = pts_x.min(dim=1, keepdim=True)[0]
bbox_right = pts_x.max(dim=1, keepdim=True)[0]
bbox_up = pts_y.min(dim=1, keepdim=True)[0]
bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
dim=1)
elif self.transform_method == 'moment':
pts_y_mean = pts_y.mean(dim=1, keepdim=True)
pts_x_mean = pts_x.mean(dim=1, keepdim=True)
pts_y_std = torch.std(pts_y - pts_y_mean, dim=1, keepdim=True)
pts_x_std = torch.std(pts_x - pts_x_mean, dim=1, keepdim=True)
moment_transfer = (self.moment_transfer * self.moment_mul) + (
self.moment_transfer.detach() * (1 - self.moment_mul))
moment_width_transfer = moment_transfer[0]
moment_height_transfer = moment_transfer[1]
half_width = pts_x_std * torch.exp(moment_width_transfer)
half_height = pts_y_std * torch.exp(moment_height_transfer)
bbox = torch.cat([
pts_x_mean - half_width, pts_y_mean - half_height,
pts_x_mean + half_width, pts_y_mean + half_height
],
dim=1)
else:
raise NotImplementedError
return bbox
def gen_grid_from_reg(self, reg, previous_boxes):
"""Base on the previous bboxes and regression values, we compute the
regressed bboxes and generate the grids on the bboxes.
:param reg: the regression value to previous bboxes.
:param previous_boxes: previous bboxes.
:return: generate grids on the regressed bboxes.
"""
b, _, h, w = reg.shape
bxy = (previous_boxes[:, :2, ...] + previous_boxes[:, 2:, ...]) / 2.
bwh = (previous_boxes[:, 2:, ...] -
previous_boxes[:, :2, ...]).clamp(min=1e-6)
grid_topleft = bxy + bwh * reg[:, :2, ...] - 0.5 * bwh * torch.exp(
reg[:, 2:, ...])
grid_wh = bwh * torch.exp(reg[:, 2:, ...])
grid_left = grid_topleft[:, [0], ...]
grid_top = grid_topleft[:, [1], ...]
grid_width = grid_wh[:, [0], ...]
grid_height = grid_wh[:, [1], ...]
intervel = torch.linspace(0., 1., self.dcn_kernel).view(
1, self.dcn_kernel, 1, 1).type_as(reg)
grid_x = grid_left + grid_width * intervel
grid_x = grid_x.unsqueeze(1).repeat(1, self.dcn_kernel, 1, 1, 1)
grid_x = grid_x.view(b, -1, h, w)
grid_y = grid_top + grid_height * intervel
grid_y = grid_y.unsqueeze(2).repeat(1, 1, self.dcn_kernel, 1, 1)
grid_y = grid_y.view(b, -1, h, w)
grid_yx = torch.stack([grid_y, grid_x], dim=2)
grid_yx = grid_yx.view(b, -1, h, w)
regressed_bbox = torch.cat([
grid_left, grid_top, grid_left + grid_width, grid_top + grid_height
], 1)
return grid_yx, regressed_bbox
def forward(self, feats):
return multi_apply(self.forward_single, feats)
def forward_single(self, x):
"""Forward feature map of a single FPN level."""
dcn_base_offset = self.dcn_base_offset.type_as(x)
# If we use center_init, the initial reppoints is from center points.
# If we use bounding bbox representation, the initial reppoints is
# from regular grid placed on a pre-defined bbox.
if self.use_grid_points or not self.center_init:
scale = self.point_base_scale / 2
points_init = dcn_base_offset / dcn_base_offset.max() * scale
bbox_init = x.new_tensor([-scale, -scale, scale,
scale]).view(1, 4, 1, 1)
else:
points_init = 0
cls_feat = x
pts_feat = x
for cls_conv in self.cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in self.reg_convs:
pts_feat = reg_conv(pts_feat)
# initialize reppoints
pts_out_init = self.reppoints_pts_init_out(
self.relu(self.reppoints_pts_init_conv(pts_feat)))
if self.use_grid_points:
pts_out_init, bbox_out_init = self.gen_grid_from_reg(
pts_out_init, bbox_init.detach())
else:
pts_out_init = pts_out_init + points_init
# refine and classify reppoints
pts_out_init_grad_mul = (1 - self.gradient_mul) * pts_out_init.detach(
) + self.gradient_mul * pts_out_init
dcn_offset = pts_out_init_grad_mul - dcn_base_offset
cls_out = self.reppoints_cls_out(
self.relu(self.reppoints_cls_conv(cls_feat, dcn_offset)))
pts_out_refine = self.reppoints_pts_refine_out(
self.relu(self.reppoints_pts_refine_conv(pts_feat, dcn_offset)))
if self.use_grid_points:
pts_out_refine, bbox_out_refine = self.gen_grid_from_reg(
pts_out_refine, bbox_out_init.detach())
else:
pts_out_refine = pts_out_refine + pts_out_init.detach()
return cls_out, pts_out_init, pts_out_refine
def get_points(self, featmap_sizes, img_metas, device):
"""Get points according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
img_metas (list[dict]): Image meta info.
Returns:
tuple: points of each image, valid flags of each image
"""
num_imgs = len(img_metas)
num_levels = len(featmap_sizes)
# since feature map sizes of all images are the same, we only compute
# points center for one time
multi_level_points = []
for i in range(num_levels):
points = self.point_generators[i].grid_points(
featmap_sizes[i], self.point_strides[i], device)
multi_level_points.append(points)
points_list = [[point.clone() for point in multi_level_points]
for _ in range(num_imgs)]
# for each image, we compute valid flags of multi level grids
valid_flag_list = []
for img_id, img_meta in enumerate(img_metas):
multi_level_flags = []
for i in range(num_levels):
point_stride = self.point_strides[i]
feat_h, feat_w = featmap_sizes[i]
h, w = img_meta['pad_shape'][:2]
valid_feat_h = min(int(np.ceil(h / point_stride)), feat_h)
valid_feat_w = min(int(np.ceil(w / point_stride)), feat_w)
flags = self.point_generators[i].valid_flags(
(feat_h, feat_w), (valid_feat_h, valid_feat_w), device)
multi_level_flags.append(flags)
valid_flag_list.append(multi_level_flags)
return points_list, valid_flag_list
def centers_to_bboxes(self, point_list):
"""Get bboxes according to center points.
Only used in :class:`MaxIoUAssigner`.
"""
bbox_list = []
for i_img, point in enumerate(point_list):
bbox = []
for i_lvl in range(len(self.point_strides)):
scale = self.point_base_scale * self.point_strides[i_lvl] * 0.5
bbox_shift = torch.Tensor([-scale, -scale, scale,
scale]).view(1, 4).type_as(point[0])
bbox_center = torch.cat(
[point[i_lvl][:, :2], point[i_lvl][:, :2]], dim=1)
bbox.append(bbox_center + bbox_shift)
bbox_list.append(bbox)
return bbox_list
def offset_to_pts(self, center_list, pred_list):
"""Change from point offset to point coordinate."""
pts_list = []
for i_lvl in range(len(self.point_strides)):
pts_lvl = []
for i_img in range(len(center_list)):
pts_center = center_list[i_img][i_lvl][:, :2].repeat(
1, self.num_points)
pts_shift = pred_list[i_lvl][i_img]
yx_pts_shift = pts_shift.permute(1, 2, 0).view(
-1, 2 * self.num_points)
y_pts_shift = yx_pts_shift[..., 0::2]
x_pts_shift = yx_pts_shift[..., 1::2]
xy_pts_shift = torch.stack([x_pts_shift, y_pts_shift], -1)
xy_pts_shift = xy_pts_shift.view(*yx_pts_shift.shape[:-1], -1)
pts = xy_pts_shift * self.point_strides[i_lvl] + pts_center
pts_lvl.append(pts)
pts_lvl = torch.stack(pts_lvl, 0)
pts_list.append(pts_lvl)
return pts_list
def _point_target_single(self,
flat_proposals,
valid_flags,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
label_channels=1,
stage='init',
unmap_outputs=True):
inside_flags = valid_flags
if not inside_flags.any():
return (None, ) * 7
# assign gt and sample proposals
proposals = flat_proposals[inside_flags, :]
if stage == 'init':
assigner = self.init_assigner
pos_weight = self.train_cfg.init.pos_weight
else:
assigner = self.refine_assigner
pos_weight = self.train_cfg.refine.pos_weight
assign_result = assigner.assign(proposals, gt_bboxes, gt_bboxes_ignore,
None if self.sampling else gt_labels)
sampling_result = self.sampler.sample(assign_result, proposals,
gt_bboxes)
num_valid_proposals = proposals.shape[0]
bbox_gt = proposals.new_zeros([num_valid_proposals, 4])
pos_proposals = torch.zeros_like(proposals)
proposals_weights = proposals.new_zeros([num_valid_proposals, 4])
labels = proposals.new_full((num_valid_proposals, ),
self.num_classes,
dtype=torch.long)
label_weights = proposals.new_zeros(
num_valid_proposals, dtype=torch.float)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
if len(pos_inds) > 0:
pos_gt_bboxes = sampling_result.pos_gt_bboxes
bbox_gt[pos_inds, :] = pos_gt_bboxes
pos_proposals[pos_inds, :] = proposals[pos_inds, :]
proposals_weights[pos_inds, :] = 1.0
if gt_labels is None:
# Only rpn gives gt_labels as None
# Foreground is the first class
labels[pos_inds] = 0
else:
labels[pos_inds] = gt_labels[
sampling_result.pos_assigned_gt_inds]
if pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
# map up to original set of proposals
if unmap_outputs:
num_total_proposals = flat_proposals.size(0)
labels = unmap(labels, num_total_proposals, inside_flags)
label_weights = unmap(label_weights, num_total_proposals,
inside_flags)
bbox_gt = unmap(bbox_gt, num_total_proposals, inside_flags)
pos_proposals = unmap(pos_proposals, num_total_proposals,
inside_flags)
proposals_weights = unmap(proposals_weights, num_total_proposals,
inside_flags)
return (labels, label_weights, bbox_gt, pos_proposals,
proposals_weights, pos_inds, neg_inds)
def get_targets(self,
proposals_list,
valid_flag_list,
gt_bboxes_list,
img_metas,
gt_bboxes_ignore_list=None,
gt_labels_list=None,
stage='init',
label_channels=1,
unmap_outputs=True):
"""Compute corresponding GT box and classification targets for
proposals.
Args:
proposals_list (list[list]): Multi level points/bboxes of each
image.
valid_flag_list (list[list]): Multi level valid flags of each
image.
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
img_metas (list[dict]): Meta info of each image.
gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
ignored.
gt_bboxes_list (list[Tensor]): Ground truth labels of each box.
stage (str): `init` or `refine`. Generate target for init stage or
refine stage
label_channels (int): Channel of label.
unmap_outputs (bool): Whether to map outputs back to the original
set of anchors.
Returns:
tuple:
- labels_list (list[Tensor]): Labels of each level.
- label_weights_list (list[Tensor]): Label weights of each level. # noqa: E501
- bbox_gt_list (list[Tensor]): Ground truth bbox of each level.
- proposal_list (list[Tensor]): Proposals(points/bboxes) of each level. # noqa: E501
- proposal_weights_list (list[Tensor]): Proposal weights of each level. # noqa: E501
- num_total_pos (int): Number of positive samples in all images. # noqa: E501
- num_total_neg (int): Number of negative samples in all images. # noqa: E501
"""
assert stage in ['init', 'refine']
num_imgs = len(img_metas)
assert len(proposals_list) == len(valid_flag_list) == num_imgs
# points number of multi levels
num_level_proposals = [points.size(0) for points in proposals_list[0]]
# concat all level points and flags to a single tensor
for i in range(num_imgs):
assert len(proposals_list[i]) == len(valid_flag_list[i])
proposals_list[i] = torch.cat(proposals_list[i])
valid_flag_list[i] = torch.cat(valid_flag_list[i])
# compute targets for each image
if gt_bboxes_ignore_list is None:
gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
if gt_labels_list is None:
gt_labels_list = [None for _ in range(num_imgs)]
(all_labels, all_label_weights, all_bbox_gt, all_proposals,
all_proposal_weights, pos_inds_list, neg_inds_list) = multi_apply(
self._point_target_single,
proposals_list,
valid_flag_list,
gt_bboxes_list,
gt_bboxes_ignore_list,
gt_labels_list,
stage=stage,
label_channels=label_channels,
unmap_outputs=unmap_outputs)
# no valid points
if any([labels is None for labels in all_labels]):
return None
# sampled points of all images
num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
labels_list = images_to_levels(all_labels, num_level_proposals)
label_weights_list = images_to_levels(all_label_weights,
num_level_proposals)
bbox_gt_list = images_to_levels(all_bbox_gt, num_level_proposals)
proposals_list = images_to_levels(all_proposals, num_level_proposals)
proposal_weights_list = images_to_levels(all_proposal_weights,
num_level_proposals)
return (labels_list, label_weights_list, bbox_gt_list, proposals_list,
proposal_weights_list, num_total_pos, num_total_neg)
def loss_single(self, cls_score, pts_pred_init, pts_pred_refine, labels,
label_weights, bbox_gt_init, bbox_weights_init,
bbox_gt_refine, bbox_weights_refine, stride,
num_total_samples_init, num_total_samples_refine):
# classification loss
labels = labels.reshape(-1)
label_weights = label_weights.reshape(-1)
cls_score = cls_score.permute(0, 2, 3,
1).reshape(-1, self.cls_out_channels)
cls_score = cls_score.contiguous()
loss_cls = self.loss_cls(
cls_score,
labels,
label_weights,
avg_factor=num_total_samples_refine)
# points loss
bbox_gt_init = bbox_gt_init.reshape(-1, 4)
bbox_weights_init = bbox_weights_init.reshape(-1, 4)
bbox_pred_init = self.points2bbox(
pts_pred_init.reshape(-1, 2 * self.num_points), y_first=False)
bbox_gt_refine = bbox_gt_refine.reshape(-1, 4)
bbox_weights_refine = bbox_weights_refine.reshape(-1, 4)
bbox_pred_refine = self.points2bbox(
pts_pred_refine.reshape(-1, 2 * self.num_points), y_first=False)
normalize_term = self.point_base_scale * stride
loss_pts_init = self.loss_bbox_init(
bbox_pred_init / normalize_term,
bbox_gt_init / normalize_term,
bbox_weights_init,
avg_factor=num_total_samples_init)
loss_pts_refine = self.loss_bbox_refine(
bbox_pred_refine / normalize_term,
bbox_gt_refine / normalize_term,
bbox_weights_refine,
avg_factor=num_total_samples_refine)
return loss_cls, loss_pts_init, loss_pts_refine
def loss(self,
cls_scores,
pts_preds_init,
pts_preds_refine,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == len(self.point_generators)
device = cls_scores[0].device
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
# target for initial stage
center_list, valid_flag_list = self.get_points(featmap_sizes,
img_metas, device)
pts_coordinate_preds_init = self.offset_to_pts(center_list,
pts_preds_init)
if self.train_cfg.init.assigner['type'] == 'PointAssigner':
# Assign target for center list
candidate_list = center_list
else:
# transform center list to bbox list and
# assign target for bbox list
bbox_list = self.centers_to_bboxes(center_list)
candidate_list = bbox_list
cls_reg_targets_init = self.get_targets(
candidate_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
stage='init',
label_channels=label_channels)
(*_, bbox_gt_list_init, candidate_list_init, bbox_weights_list_init,
num_total_pos_init, num_total_neg_init) = cls_reg_targets_init
num_total_samples_init = (
num_total_pos_init +
num_total_neg_init if self.sampling else num_total_pos_init)
# target for refinement stage
center_list, valid_flag_list = self.get_points(featmap_sizes,
img_metas, device)
pts_coordinate_preds_refine = self.offset_to_pts(
center_list, pts_preds_refine)
bbox_list = []
for i_img, center in enumerate(center_list):
bbox = []
for i_lvl in range(len(pts_preds_refine)):
bbox_preds_init = self.points2bbox(
pts_preds_init[i_lvl].detach())
bbox_shift = bbox_preds_init * self.point_strides[i_lvl]
bbox_center = torch.cat(
[center[i_lvl][:, :2], center[i_lvl][:, :2]], dim=1)
bbox.append(bbox_center +
bbox_shift[i_img].permute(1, 2, 0).reshape(-1, 4))
bbox_list.append(bbox)
cls_reg_targets_refine = self.get_targets(
bbox_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
stage='refine',
label_channels=label_channels)
(labels_list, label_weights_list, bbox_gt_list_refine,
candidate_list_refine, bbox_weights_list_refine, num_total_pos_refine,
num_total_neg_refine) = cls_reg_targets_refine
num_total_samples_refine = (
num_total_pos_refine +
num_total_neg_refine if self.sampling else num_total_pos_refine)
# compute loss
losses_cls, losses_pts_init, losses_pts_refine = multi_apply(
self.loss_single,
cls_scores,
pts_coordinate_preds_init,
pts_coordinate_preds_refine,
labels_list,
label_weights_list,
bbox_gt_list_init,
bbox_weights_list_init,
bbox_gt_list_refine,
bbox_weights_list_refine,
self.point_strides,
num_total_samples_init=num_total_samples_init,
num_total_samples_refine=num_total_samples_refine)
loss_dict_all = {
'loss_cls': losses_cls,
'loss_pts_init': losses_pts_init,
'loss_pts_refine': losses_pts_refine
}
return loss_dict_all
def get_bboxes(self,
cls_scores,
pts_preds_init,
pts_preds_refine,
img_metas,
cfg=None,
rescale=False,
with_nms=True):
assert len(cls_scores) == len(pts_preds_refine)
device = cls_scores[0].device
bbox_preds_refine = [
self.points2bbox(pts_pred_refine)
for pts_pred_refine in pts_preds_refine
]
num_levels = len(cls_scores)
mlvl_points = [
self.point_generators[i].grid_points(cls_scores[i].size()[-2:],
self.point_strides[i], device)
for i in range(num_levels)
]
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds_refine[i][img_id].detach()
for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list,
mlvl_points, img_shape,
scale_factor, cfg, rescale,
with_nms)
result_list.append(proposals)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_points,
img_shape,
scale_factor,
cfg,
rescale=False,
with_nms=True):
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
mlvl_bboxes = []
mlvl_scores = []
for i_lvl, (cls_score, bbox_pred, points) in enumerate(
zip(cls_scores, bbox_preds, mlvl_points)):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.cls_out_channels)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
# remind that we set FG labels to [0, num_class-1]
# since mmdet v2.0
# BG cat_id: num_class
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
points = points[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
bbox_pos_center = torch.cat([points[:, :2], points[:, :2]], dim=1)
bboxes = bbox_pred * self.point_strides[i_lvl] + bbox_pos_center
x1 = bboxes[:, 0].clamp(min=0, max=img_shape[1])
y1 = bboxes[:, 1].clamp(min=0, max=img_shape[0])
x2 = bboxes[:, 2].clamp(min=0, max=img_shape[1])
y2 = bboxes[:, 3].clamp(min=0, max=img_shape[0])
bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
if self.use_sigmoid_cls:
# Add a dummy background class to the backend when using sigmoid
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
if with_nms:
det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
cfg.score_thr, cfg.nms,
cfg.max_per_img)
return det_bboxes, det_labels
else:
return mlvl_bboxes, mlvl_scores
import torch.nn as nn
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from ..builder import HEADS
from .anchor_head import AnchorHead
@HEADS.register_module()
class RetinaHead(AnchorHead):
r"""An anchor-based head used in `RetinaNet
<https://arxiv.org/pdf/1708.02002.pdf>`_.
The head contains two subnetworks. The first classifies anchor boxes and
the second regresses deltas for the anchors.
Example:
>>> import torch
>>> self = RetinaHead(11, 7)
>>> x = torch.rand(1, 7, 32, 32)
>>> cls_score, bbox_pred = self.forward_single(x)
>>> # Each anchor predicts a score for each class except background
>>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
>>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
>>> assert cls_per_anchor == (self.num_classes)
>>> assert box_per_anchor == 4
"""
def __init__(self,
num_classes,
in_channels,
stacked_convs=4,
conv_cfg=None,
norm_cfg=None,
anchor_generator=dict(
type='AnchorGenerator',
octave_base_scale=4,
scales_per_octave=3,
ratios=[0.5, 1.0, 2.0],
strides=[8, 16, 32, 64, 128]),
**kwargs):
self.stacked_convs = stacked_convs
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
super(RetinaHead, self).__init__(
num_classes,
in_channels,
anchor_generator=anchor_generator,
**kwargs)
def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.retina_cls = nn.Conv2d(
self.feat_channels,
self.num_anchors * self.cls_out_channels,
3,
padding=1)
self.retina_reg = nn.Conv2d(
self.feat_channels, self.num_anchors * 4, 3, padding=1)
def init_weights(self):
"""Initialize weights of the head."""
for m in self.cls_convs:
normal_init(m.conv, std=0.01)
for m in self.reg_convs:
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.retina_cls, std=0.01, bias=bias_cls)
normal_init(self.retina_reg, std=0.01)
def forward_single(self, x):
"""Forward feature of a single scale level.
Args:
x (Tensor): Features of a single scale level.
Returns:
tuple:
cls_score (Tensor): Cls scores for a single scale level
the channels number is num_anchors * num_classes.
bbox_pred (Tensor): Box energies / deltas for a single scale
level, the channels number is num_anchors * 4.
"""
cls_feat = x
reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in self.reg_convs:
reg_feat = reg_conv(reg_feat)
cls_score = self.retina_cls(cls_feat)
bbox_pred = self.retina_reg(reg_feat)
return cls_score, bbox_pred
import torch.nn as nn
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from ..builder import HEADS
from .anchor_head import AnchorHead
@HEADS.register_module()
class RetinaSepBNHead(AnchorHead):
""""RetinaHead with separate BN.
In RetinaHead, conv/norm layers are shared across different FPN levels,
while in RetinaSepBNHead, conv layers are shared across different FPN
levels, but BN layers are separated.
"""
def __init__(self,
num_classes,
num_ins,
in_channels,
stacked_convs=4,
conv_cfg=None,
norm_cfg=None,
**kwargs):
self.stacked_convs = stacked_convs
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.num_ins = num_ins
super(RetinaSepBNHead, self).__init__(num_classes, in_channels,
**kwargs)
def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.num_ins):
cls_convs = nn.ModuleList()
reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.cls_convs.append(cls_convs)
self.reg_convs.append(reg_convs)
for i in range(self.stacked_convs):
for j in range(1, self.num_ins):
self.cls_convs[j][i].conv = self.cls_convs[0][i].conv
self.reg_convs[j][i].conv = self.reg_convs[0][i].conv
self.retina_cls = nn.Conv2d(
self.feat_channels,
self.num_anchors * self.cls_out_channels,
3,
padding=1)
self.retina_reg = nn.Conv2d(
self.feat_channels, self.num_anchors * 4, 3, padding=1)
def init_weights(self):
"""Initialize weights of the head."""
for m in self.cls_convs[0]:
normal_init(m.conv, std=0.01)
for m in self.reg_convs[0]:
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.retina_cls, std=0.01, bias=bias_cls)
normal_init(self.retina_reg, std=0.01)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple: Usually a tuple of classification scores and bbox prediction
cls_scores (list[Tensor]): Classification scores for all scale
levels, each is a 4D-tensor, the channels number is
num_anchors * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for all scale
levels, each is a 4D-tensor, the channels number is
num_anchors * 4.
"""
cls_scores = []
bbox_preds = []
for i, x in enumerate(feats):
cls_feat = feats[i]
reg_feat = feats[i]
for cls_conv in self.cls_convs[i]:
cls_feat = cls_conv(cls_feat)
for reg_conv in self.reg_convs[i]:
reg_feat = reg_conv(reg_feat)
cls_score = self.retina_cls(cls_feat)
bbox_pred = self.retina_reg(reg_feat)
cls_scores.append(cls_score)
bbox_preds.append(bbox_pred)
return cls_scores, bbox_preds
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import normal_init
from mmcv.ops import batched_nms
from ..builder import HEADS
from .anchor_head import AnchorHead
from .rpn_test_mixin import RPNTestMixin
@HEADS.register_module()
class RPNHead(RPNTestMixin, AnchorHead):
"""RPN head.
Args:
in_channels (int): Number of channels in the input feature map.
""" # noqa: W605
def __init__(self, in_channels, **kwargs):
super(RPNHead, self).__init__(1, in_channels, **kwargs)
def _init_layers(self):
"""Initialize layers of the head."""
self.rpn_conv = nn.Conv2d(
self.in_channels, self.feat_channels, 3, padding=1)
self.rpn_cls = nn.Conv2d(self.feat_channels,
self.num_anchors * self.cls_out_channels, 1)
self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
def init_weights(self):
"""Initialize weights of the head."""
normal_init(self.rpn_conv, std=0.01)
normal_init(self.rpn_cls, std=0.01)
normal_init(self.rpn_reg, std=0.01)
def forward_single(self, x):
"""Forward feature map of a single scale level."""
x = self.rpn_conv(x)
x = F.relu(x, inplace=True)
rpn_cls_score = self.rpn_cls(x)
rpn_bbox_pred = self.rpn_reg(x)
return rpn_cls_score, rpn_bbox_pred
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
img_metas,
gt_bboxes_ignore=None):
"""Compute losses of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
losses = super(RPNHead, self).loss(
cls_scores,
bbox_preds,
gt_bboxes,
None,
img_metas,
gt_bboxes_ignore=gt_bboxes_ignore)
return dict(
loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox'])
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False):
"""Transform outputs for a single batch item into bbox predictions.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (num_anchors * num_classes, H, W).
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (num_anchors * 4, H, W).
mlvl_anchors (list[Tensor]): Box reference for each scale level
with shape (num_total_anchors, 4).
img_shape (tuple[int]): Shape of the input image,
(height, width, 3).
scale_factor (ndarray): Scale factor of the image arange as
(w_scale, h_scale, w_scale, h_scale).
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Returns:
Tensor: Labeled boxes in shape (n, 5), where the first 4 columns
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
5-th column is a score between 0 and 1.
"""
cfg = self.test_cfg if cfg is None else cfg
# bboxes from different level should be independent during NMS,
# level_ids are used as labels for batched NMS to separate them
level_ids = []
mlvl_scores = []
mlvl_bbox_preds = []
mlvl_valid_anchors = []
for idx in range(len(cls_scores)):
rpn_cls_score = cls_scores[idx]
rpn_bbox_pred = bbox_preds[idx]
assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
if self.use_sigmoid_cls:
rpn_cls_score = rpn_cls_score.reshape(-1)
scores = rpn_cls_score.sigmoid()
else:
rpn_cls_score = rpn_cls_score.reshape(-1, 2)
# We set FG labels to [0, num_class-1] and BG label to
# num_class in RPN head since mmdet v2.5, which is unified to
# be consistent with other head since mmdet v2.0. In mmdet v2.0
# to v2.4 we keep BG label as 0 and FG label as 1 in rpn head.
scores = rpn_cls_score.softmax(dim=1)[:, 0]
rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
anchors = mlvl_anchors[idx]
if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
# sort is faster than topk
# _, topk_inds = scores.topk(cfg.nms_pre)
ranked_scores, rank_inds = scores.sort(descending=True)
topk_inds = rank_inds[:cfg.nms_pre]
scores = ranked_scores[:cfg.nms_pre]
rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
anchors = anchors[topk_inds, :]
mlvl_scores.append(scores)
mlvl_bbox_preds.append(rpn_bbox_pred)
mlvl_valid_anchors.append(anchors)
level_ids.append(
scores.new_full((scores.size(0), ), idx, dtype=torch.long))
scores = torch.cat(mlvl_scores)
anchors = torch.cat(mlvl_valid_anchors)
rpn_bbox_pred = torch.cat(mlvl_bbox_preds)
proposals = self.bbox_coder.decode(
anchors, rpn_bbox_pred, max_shape=img_shape)
ids = torch.cat(level_ids)
if cfg.min_bbox_size > 0:
w = proposals[:, 2] - proposals[:, 0]
h = proposals[:, 3] - proposals[:, 1]
valid_inds = torch.nonzero(
(w >= cfg.min_bbox_size)
& (h >= cfg.min_bbox_size),
as_tuple=False).squeeze()
if valid_inds.sum().item() != len(proposals):
proposals = proposals[valid_inds, :]
scores = scores[valid_inds]
ids = ids[valid_inds]
# TODO: remove the hard coded nms type
nms_cfg = dict(type='nms', iou_threshold=cfg.nms_thr)
dets, keep = batched_nms(proposals, scores, ids, nms_cfg)
return dets[:cfg.nms_post]
import sys
from mmdet.core import merge_aug_proposals
if sys.version_info >= (3, 7):
from mmdet.utils.contextmanagers import completed
class RPNTestMixin(object):
"""Test methods of RPN."""
if sys.version_info >= (3, 7):
async def async_simple_test_rpn(self, x, img_metas):
sleep_interval = self.test_cfg.pop('async_sleep_interval', 0.025)
async with completed(
__name__, 'rpn_head_forward',
sleep_interval=sleep_interval):
rpn_outs = self(x)
proposal_list = self.get_bboxes(*rpn_outs, img_metas)
return proposal_list
def simple_test_rpn(self, x, img_metas):
"""Test without augmentation.
Args:
x (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): Meta info of each image.
Returns:
list[Tensor]: Proposals of each image.
"""
rpn_outs = self(x)
proposal_list = self.get_bboxes(*rpn_outs, img_metas)
return proposal_list
def aug_test_rpn(self, feats, img_metas):
samples_per_gpu = len(img_metas[0])
aug_proposals = [[] for _ in range(samples_per_gpu)]
for x, img_meta in zip(feats, img_metas):
proposal_list = self.simple_test_rpn(x, img_meta)
for i, proposals in enumerate(proposal_list):
aug_proposals[i].append(proposals)
# reorganize the order of 'img_metas' to match the dimensions
# of 'aug_proposals'
aug_img_metas = []
for i in range(samples_per_gpu):
aug_img_meta = []
for j in range(len(img_metas)):
aug_img_meta.append(img_metas[j][i])
aug_img_metas.append(aug_img_meta)
# after merging, proposals will be rescaled to the original image size
merged_proposals = [
merge_aug_proposals(proposals, aug_img_meta, self.test_cfg)
for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas)
]
return merged_proposals
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from mmdet.core import (build_anchor_generator, build_assigner,
build_bbox_coder, build_sampler, images_to_levels,
multi_apply, multiclass_nms, unmap)
from ..builder import HEADS, build_loss
from .base_dense_head import BaseDenseHead
from .guided_anchor_head import GuidedAnchorHead
@HEADS.register_module()
class SABLRetinaHead(BaseDenseHead):
"""Side-Aware Boundary Localization (SABL) for RetinaNet.
The anchor generation, assigning and sampling in SABLRetinaHead
are the same as GuidedAnchorHead for guided anchoring.
Please refer to https://arxiv.org/abs/1912.04260 for more details.
Args:
num_classes (int): Number of classes.
in_channels (int): Number of channels in the input feature map.
stacked_convs (int): Number of Convs for classification \
and regression branches. Defaults to 4.
feat_channels (int): Number of hidden channels. \
Defaults to 256.
approx_anchor_generator (dict): Config dict for approx generator.
square_anchor_generator (dict): Config dict for square generator.
conv_cfg (dict): Config dict for ConvModule. Defaults to None.
norm_cfg (dict): Config dict for Norm Layer. Defaults to None.
bbox_coder (dict): Config dict for bbox coder.
reg_decoded_bbox (bool): If true, the regression loss would be
applied directly on decoded bounding boxes, converting both
the predicted boxes and regression targets to absolute
coordinates format. Default False. It should be `True` when
using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
train_cfg (dict): Training config of SABLRetinaHead.
test_cfg (dict): Testing config of SABLRetinaHead.
loss_cls (dict): Config of classification loss.
loss_bbox_cls (dict): Config of classification loss for bbox branch.
loss_bbox_reg (dict): Config of regression loss for bbox branch.
"""
def __init__(self,
num_classes,
in_channels,
stacked_convs=4,
feat_channels=256,
approx_anchor_generator=dict(
type='AnchorGenerator',
octave_base_scale=4,
scales_per_octave=3,
ratios=[0.5, 1.0, 2.0],
strides=[8, 16, 32, 64, 128]),
square_anchor_generator=dict(
type='AnchorGenerator',
ratios=[1.0],
scales=[4],
strides=[8, 16, 32, 64, 128]),
conv_cfg=None,
norm_cfg=None,
bbox_coder=dict(
type='BucketingBBoxCoder',
num_buckets=14,
scale_factor=3.0),
reg_decoded_bbox=False,
train_cfg=None,
test_cfg=None,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.5),
loss_bbox_reg=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)):
super(SABLRetinaHead, self).__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.feat_channels = feat_channels
self.num_buckets = bbox_coder['num_buckets']
self.side_num = int(np.ceil(self.num_buckets / 2))
assert (approx_anchor_generator['octave_base_scale'] ==
square_anchor_generator['scales'][0])
assert (approx_anchor_generator['strides'] ==
square_anchor_generator['strides'])
self.approx_anchor_generator = build_anchor_generator(
approx_anchor_generator)
self.square_anchor_generator = build_anchor_generator(
square_anchor_generator)
self.approxs_per_octave = (
self.approx_anchor_generator.num_base_anchors[0])
# one anchor per location
self.num_anchors = 1
self.stacked_convs = stacked_convs
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.reg_decoded_bbox = reg_decoded_bbox
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
self.sampling = loss_cls['type'] not in [
'FocalLoss', 'GHMC', 'QualityFocalLoss'
]
if self.use_sigmoid_cls:
self.cls_out_channels = num_classes
else:
self.cls_out_channels = num_classes + 1
self.bbox_coder = build_bbox_coder(bbox_coder)
self.loss_cls = build_loss(loss_cls)
self.loss_bbox_cls = build_loss(loss_bbox_cls)
self.loss_bbox_reg = build_loss(loss_bbox_reg)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if self.train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
# use PseudoSampler when sampling is False
if self.sampling and hasattr(self.train_cfg, 'sampler'):
sampler_cfg = self.train_cfg.sampler
else:
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.fp16_enabled = False
self._init_layers()
def _init_layers(self):
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.retina_cls = nn.Conv2d(
self.feat_channels, self.cls_out_channels, 3, padding=1)
self.retina_bbox_reg = nn.Conv2d(
self.feat_channels, self.side_num * 4, 3, padding=1)
self.retina_bbox_cls = nn.Conv2d(
self.feat_channels, self.side_num * 4, 3, padding=1)
def init_weights(self):
for m in self.cls_convs:
normal_init(m.conv, std=0.01)
for m in self.reg_convs:
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.retina_cls, std=0.01, bias=bias_cls)
normal_init(self.retina_bbox_reg, std=0.01)
normal_init(self.retina_bbox_cls, std=0.01)
def forward_single(self, x):
cls_feat = x
reg_feat = x
for cls_conv in self.cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in self.reg_convs:
reg_feat = reg_conv(reg_feat)
cls_score = self.retina_cls(cls_feat)
bbox_cls_pred = self.retina_bbox_cls(reg_feat)
bbox_reg_pred = self.retina_bbox_reg(reg_feat)
bbox_pred = (bbox_cls_pred, bbox_reg_pred)
return cls_score, bbox_pred
def forward(self, feats):
return multi_apply(self.forward_single, feats)
def get_anchors(self, featmap_sizes, img_metas, device='cuda'):
"""Get squares according to feature map sizes and guided anchors.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
img_metas (list[dict]): Image meta info.
device (torch.device | str): device for returned tensors
Returns:
tuple: square approxs of each image
"""
num_imgs = len(img_metas)
# since feature map sizes of all images are the same, we only compute
# squares for one time
multi_level_squares = self.square_anchor_generator.grid_anchors(
featmap_sizes, device=device)
squares_list = [multi_level_squares for _ in range(num_imgs)]
return squares_list
def get_target(self,
approx_list,
inside_flag_list,
square_list,
gt_bboxes_list,
img_metas,
gt_bboxes_ignore_list=None,
gt_labels_list=None,
label_channels=None,
sampling=True,
unmap_outputs=True):
"""Compute bucketing targets.
Args:
approx_list (list[list]): Multi level approxs of each image.
inside_flag_list (list[list]): Multi level inside flags of each
image.
square_list (list[list]): Multi level squares of each image.
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
img_metas (list[dict]): Meta info of each image.
gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes.
gt_bboxes_list (list[Tensor]): Gt bboxes of each image.
label_channels (int): Channel of label.
sampling (bool): Sample Anchors or not.
unmap_outputs (bool): unmap outputs or not.
Returns:
tuple: Returns a tuple containing learning targets.
- labels_list (list[Tensor]): Labels of each level.
- label_weights_list (list[Tensor]): Label weights of each \
level.
- bbox_cls_targets_list (list[Tensor]): BBox cls targets of \
each level.
- bbox_cls_weights_list (list[Tensor]): BBox cls weights of \
each level.
- bbox_reg_targets_list (list[Tensor]): BBox reg targets of \
each level.
- bbox_reg_weights_list (list[Tensor]): BBox reg weights of \
each level.
- num_total_pos (int): Number of positive samples in all \
images.
- num_total_neg (int): Number of negative samples in all \
images.
"""
num_imgs = len(img_metas)
assert len(approx_list) == len(inside_flag_list) == len(
square_list) == num_imgs
# anchor number of multi levels
num_level_squares = [squares.size(0) for squares in square_list[0]]
# concat all level anchors and flags to a single tensor
inside_flag_flat_list = []
approx_flat_list = []
square_flat_list = []
for i in range(num_imgs):
assert len(square_list[i]) == len(inside_flag_list[i])
inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
approx_flat_list.append(torch.cat(approx_list[i]))
square_flat_list.append(torch.cat(square_list[i]))
# compute targets for each image
if gt_bboxes_ignore_list is None:
gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
if gt_labels_list is None:
gt_labels_list = [None for _ in range(num_imgs)]
(all_labels, all_label_weights, all_bbox_cls_targets,
all_bbox_cls_weights, all_bbox_reg_targets, all_bbox_reg_weights,
pos_inds_list, neg_inds_list) = multi_apply(
self._get_target_single,
approx_flat_list,
inside_flag_flat_list,
square_flat_list,
gt_bboxes_list,
gt_bboxes_ignore_list,
gt_labels_list,
img_metas,
label_channels=label_channels,
sampling=sampling,
unmap_outputs=unmap_outputs)
# no valid anchors
if any([labels is None for labels in all_labels]):
return None
# sampled anchors of all images
num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
# split targets to a list w.r.t. multiple levels
labels_list = images_to_levels(all_labels, num_level_squares)
label_weights_list = images_to_levels(all_label_weights,
num_level_squares)
bbox_cls_targets_list = images_to_levels(all_bbox_cls_targets,
num_level_squares)
bbox_cls_weights_list = images_to_levels(all_bbox_cls_weights,
num_level_squares)
bbox_reg_targets_list = images_to_levels(all_bbox_reg_targets,
num_level_squares)
bbox_reg_weights_list = images_to_levels(all_bbox_reg_weights,
num_level_squares)
return (labels_list, label_weights_list, bbox_cls_targets_list,
bbox_cls_weights_list, bbox_reg_targets_list,
bbox_reg_weights_list, num_total_pos, num_total_neg)
def _get_target_single(self,
flat_approxs,
inside_flags,
flat_squares,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
img_meta,
label_channels=None,
sampling=True,
unmap_outputs=True):
"""Compute regression and classification targets for anchors in a
single image.
Args:
flat_approxs (Tensor): flat approxs of a single image,
shape (n, 4)
inside_flags (Tensor): inside flags of a single image,
shape (n, ).
flat_squares (Tensor): flat squares of a single image,
shape (approxs_per_octave * n, 4)
gt_bboxes (Tensor): Ground truth bboxes of a single image, \
shape (num_gts, 4).
gt_bboxes_ignore (Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
gt_labels (Tensor): Ground truth labels of each box,
shape (num_gts,).
img_meta (dict): Meta info of the image.
label_channels (int): Channel of label.
sampling (bool): Sample Anchors or not.
unmap_outputs (bool): unmap outputs or not.
Returns:
tuple:
- labels_list (Tensor): Labels in a single image
- label_weights (Tensor): Label weights in a single image
- bbox_cls_targets (Tensor): BBox cls targets in a single image
- bbox_cls_weights (Tensor): BBox cls weights in a single image
- bbox_reg_targets (Tensor): BBox reg targets in a single image
- bbox_reg_weights (Tensor): BBox reg weights in a single image
- num_total_pos (int): Number of positive samples \
in a single image
- num_total_neg (int): Number of negative samples \
in a single image
"""
if not inside_flags.any():
return (None, ) * 8
# assign gt and sample anchors
expand_inside_flags = inside_flags[:, None].expand(
-1, self.approxs_per_octave).reshape(-1)
approxs = flat_approxs[expand_inside_flags, :]
squares = flat_squares[inside_flags, :]
assign_result = self.assigner.assign(approxs, squares,
self.approxs_per_octave,
gt_bboxes, gt_bboxes_ignore)
sampling_result = self.sampler.sample(assign_result, squares,
gt_bboxes)
num_valid_squares = squares.shape[0]
bbox_cls_targets = squares.new_zeros(
(num_valid_squares, self.side_num * 4))
bbox_cls_weights = squares.new_zeros(
(num_valid_squares, self.side_num * 4))
bbox_reg_targets = squares.new_zeros(
(num_valid_squares, self.side_num * 4))
bbox_reg_weights = squares.new_zeros(
(num_valid_squares, self.side_num * 4))
labels = squares.new_full((num_valid_squares, ),
self.num_classes,
dtype=torch.long)
label_weights = squares.new_zeros(num_valid_squares, dtype=torch.float)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
if len(pos_inds) > 0:
(pos_bbox_reg_targets, pos_bbox_reg_weights, pos_bbox_cls_targets,
pos_bbox_cls_weights) = self.bbox_coder.encode(
sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
bbox_cls_targets[pos_inds, :] = pos_bbox_cls_targets
bbox_reg_targets[pos_inds, :] = pos_bbox_reg_targets
bbox_cls_weights[pos_inds, :] = pos_bbox_cls_weights
bbox_reg_weights[pos_inds, :] = pos_bbox_reg_weights
if gt_labels is None:
# Only rpn gives gt_labels as None
# Foreground is the first class
labels[pos_inds] = 0
else:
labels[pos_inds] = gt_labels[
sampling_result.pos_assigned_gt_inds]
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
# map up to original set of anchors
if unmap_outputs:
num_total_anchors = flat_squares.size(0)
labels = unmap(
labels, num_total_anchors, inside_flags, fill=self.num_classes)
label_weights = unmap(label_weights, num_total_anchors,
inside_flags)
bbox_cls_targets = unmap(bbox_cls_targets, num_total_anchors,
inside_flags)
bbox_cls_weights = unmap(bbox_cls_weights, num_total_anchors,
inside_flags)
bbox_reg_targets = unmap(bbox_reg_targets, num_total_anchors,
inside_flags)
bbox_reg_weights = unmap(bbox_reg_weights, num_total_anchors,
inside_flags)
return (labels, label_weights, bbox_cls_targets, bbox_cls_weights,
bbox_reg_targets, bbox_reg_weights, pos_inds, neg_inds)
def loss_single(self, cls_score, bbox_pred, labels, label_weights,
bbox_cls_targets, bbox_cls_weights, bbox_reg_targets,
bbox_reg_weights, num_total_samples):
# classification loss
labels = labels.reshape(-1)
label_weights = label_weights.reshape(-1)
cls_score = cls_score.permute(0, 2, 3,
1).reshape(-1, self.cls_out_channels)
loss_cls = self.loss_cls(
cls_score, labels, label_weights, avg_factor=num_total_samples)
# regression loss
bbox_cls_targets = bbox_cls_targets.reshape(-1, self.side_num * 4)
bbox_cls_weights = bbox_cls_weights.reshape(-1, self.side_num * 4)
bbox_reg_targets = bbox_reg_targets.reshape(-1, self.side_num * 4)
bbox_reg_weights = bbox_reg_weights.reshape(-1, self.side_num * 4)
(bbox_cls_pred, bbox_reg_pred) = bbox_pred
bbox_cls_pred = bbox_cls_pred.permute(0, 2, 3, 1).reshape(
-1, self.side_num * 4)
bbox_reg_pred = bbox_reg_pred.permute(0, 2, 3, 1).reshape(
-1, self.side_num * 4)
loss_bbox_cls = self.loss_bbox_cls(
bbox_cls_pred,
bbox_cls_targets.long(),
bbox_cls_weights,
avg_factor=num_total_samples * 4 * self.side_num)
loss_bbox_reg = self.loss_bbox_reg(
bbox_reg_pred,
bbox_reg_targets,
bbox_reg_weights,
avg_factor=num_total_samples * 4 * self.bbox_coder.offset_topk)
return loss_cls, loss_bbox_cls, loss_bbox_reg
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.approx_anchor_generator.num_levels
device = cls_scores[0].device
# get sampled approxes
approxs_list, inside_flag_list = GuidedAnchorHead.get_sampled_approxs(
self, featmap_sizes, img_metas, device=device)
square_list = self.get_anchors(featmap_sizes, img_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.get_target(
approxs_list,
inside_flag_list,
square_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels,
sampling=self.sampling)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_cls_targets_list,
bbox_cls_weights_list, bbox_reg_targets_list, bbox_reg_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
num_total_samples = (
num_total_pos + num_total_neg if self.sampling else num_total_pos)
losses_cls, losses_bbox_cls, losses_bbox_reg = multi_apply(
self.loss_single,
cls_scores,
bbox_preds,
labels_list,
label_weights_list,
bbox_cls_targets_list,
bbox_cls_weights_list,
bbox_reg_targets_list,
bbox_reg_weights_list,
num_total_samples=num_total_samples)
return dict(
loss_cls=losses_cls,
loss_bbox_cls=losses_bbox_cls,
loss_bbox_reg=losses_bbox_reg)
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def get_bboxes(self,
cls_scores,
bbox_preds,
img_metas,
cfg=None,
rescale=False):
assert len(cls_scores) == len(bbox_preds)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
device = cls_scores[0].device
mlvl_anchors = self.get_anchors(
featmap_sizes, img_metas, device=device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_cls_pred_list = [
bbox_preds[i][0][img_id].detach() for i in range(num_levels)
]
bbox_reg_pred_list = [
bbox_preds[i][1][img_id].detach() for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
proposals = self.get_bboxes_single(cls_score_list,
bbox_cls_pred_list,
bbox_reg_pred_list,
mlvl_anchors[img_id], img_shape,
scale_factor, cfg, rescale)
result_list.append(proposals)
return result_list
def get_bboxes_single(self,
cls_scores,
bbox_cls_preds,
bbox_reg_preds,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False):
cfg = self.test_cfg if cfg is None else cfg
mlvl_bboxes = []
mlvl_scores = []
mlvl_confids = []
assert len(cls_scores) == len(bbox_cls_preds) == len(
bbox_reg_preds) == len(mlvl_anchors)
for cls_score, bbox_cls_pred, bbox_reg_pred, anchors in zip(
cls_scores, bbox_cls_preds, bbox_reg_preds, mlvl_anchors):
assert cls_score.size()[-2:] == bbox_cls_pred.size(
)[-2:] == bbox_reg_pred.size()[-2::]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.cls_out_channels)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_cls_pred = bbox_cls_pred.permute(1, 2, 0).reshape(
-1, self.side_num * 4)
bbox_reg_pred = bbox_reg_pred.permute(1, 2, 0).reshape(
-1, self.side_num * 4)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_cls_pred = bbox_cls_pred[topk_inds, :]
bbox_reg_pred = bbox_reg_pred[topk_inds, :]
scores = scores[topk_inds, :]
bbox_preds = [
bbox_cls_pred.contiguous(),
bbox_reg_pred.contiguous()
]
bboxes, confids = self.bbox_coder.decode(
anchors.contiguous(), bbox_preds, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_confids.append(confids)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
mlvl_confids = torch.cat(mlvl_confids)
if self.use_sigmoid_cls:
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
det_bboxes, det_labels = multiclass_nms(
mlvl_bboxes,
mlvl_scores,
cfg.score_thr,
cfg.nms,
cfg.max_per_img,
score_factors=mlvl_confids)
return det_bboxes, det_labels
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment