"vscode:/vscode.git/clone" did not exist on "e7a7caf9115f6b06dcfd18a105817ab74713346f"
Commit a4372e17 authored by huchen's avatar huchen
Browse files

Merge branch 'hepj_work' into 'main'

Add conformer code

See merge request dcutoolkit/deeplearing/dlexamples_new!7
parents 7f99c1c3 142dcf29
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
from mmcv.runner import force_fp32
from mmdet.core import (build_anchor_generator, build_assigner,
build_bbox_coder, build_sampler, multi_apply)
from ..builder import HEADS
from ..losses import smooth_l1_loss
from .anchor_head import AnchorHead
# TODO: add loss evaluator for SSD
@HEADS.register_module()
class SSDHead(AnchorHead):
"""SSD head used in https://arxiv.org/abs/1512.02325.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
anchor_generator (dict): Config dict for anchor generator
bbox_coder (dict): Config of bounding box coder.
reg_decoded_bbox (bool): If true, the regression loss would be
applied directly on decoded bounding boxes, converting both
the predicted boxes and regression targets to absolute
coordinates format. Default False. It should be `True` when
using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
train_cfg (dict): Training config of anchor head.
test_cfg (dict): Testing config of anchor head.
""" # noqa: W605
def __init__(self,
num_classes=80,
in_channels=(512, 1024, 512, 256, 256, 256),
anchor_generator=dict(
type='SSDAnchorGenerator',
scale_major=False,
input_size=300,
strides=[8, 16, 32, 64, 100, 300],
ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]),
basesize_ratio_range=(0.1, 0.9)),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
clip_border=True,
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0],
),
reg_decoded_bbox=False,
train_cfg=None,
test_cfg=None):
super(AnchorHead, self).__init__()
self.num_classes = num_classes
self.in_channels = in_channels
self.cls_out_channels = num_classes + 1 # add background class
self.anchor_generator = build_anchor_generator(anchor_generator)
num_anchors = self.anchor_generator.num_base_anchors
reg_convs = []
cls_convs = []
for i in range(len(in_channels)):
reg_convs.append(
nn.Conv2d(
in_channels[i],
num_anchors[i] * 4,
kernel_size=3,
padding=1))
cls_convs.append(
nn.Conv2d(
in_channels[i],
num_anchors[i] * (num_classes + 1),
kernel_size=3,
padding=1))
self.reg_convs = nn.ModuleList(reg_convs)
self.cls_convs = nn.ModuleList(cls_convs)
self.bbox_coder = build_bbox_coder(bbox_coder)
self.reg_decoded_bbox = reg_decoded_bbox
self.use_sigmoid_cls = False
self.cls_focal_loss = False
self.train_cfg = train_cfg
self.test_cfg = test_cfg
        # set sampling=False for anchor_target
self.sampling = False
if self.train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
# SSD sampling=False so use PseudoSampler
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.fp16_enabled = False
def init_weights(self):
"""Initialize weights of the head."""
for m in self.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform', bias=0)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Classification scores for all scale
levels, each is a 4D-tensor, the channels number is
num_anchors * num_classes.
bbox_preds (list[Tensor]): Box energies / deltas for all scale
levels, each is a 4D-tensor, the channels number is
num_anchors * 4.
"""
cls_scores = []
bbox_preds = []
for feat, reg_conv, cls_conv in zip(feats, self.reg_convs,
self.cls_convs):
cls_scores.append(cls_conv(feat))
bbox_preds.append(reg_conv(feat))
return cls_scores, bbox_preds
def loss_single(self, cls_score, bbox_pred, anchor, labels, label_weights,
bbox_targets, bbox_weights, num_total_samples):
"""Compute loss of a single image.
Args:
            cls_score (Tensor): Box scores for each image,
                has shape (num_total_anchors, num_classes).
            bbox_pred (Tensor): Box energies / deltas for each image
                level with shape (num_total_anchors, 4).
            anchor (Tensor): Box reference for each scale level with shape
                (num_total_anchors, 4).
            labels (Tensor): Labels of each anchor with shape
                (num_total_anchors,).
            label_weights (Tensor): Label weights of each anchor with shape
                (num_total_anchors,)
            bbox_targets (Tensor): BBox regression targets of each anchor with
                shape (num_total_anchors, 4).
bbox_weights (Tensor): BBox regression loss weights of each anchor
with shape (num_total_anchors, 4).
            num_total_samples (int): If sampling, the total number of samples
                equals the number of total anchors; otherwise, it is the
                number of positive anchors.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
loss_cls_all = F.cross_entropy(
cls_score, labels, reduction='none') * label_weights
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
pos_inds = ((labels >= 0) &
(labels < self.num_classes)).nonzero().reshape(-1)
neg_inds = (labels == self.num_classes).nonzero().view(-1)
num_pos_samples = pos_inds.size(0)
num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples
if num_neg_samples > neg_inds.size(0):
num_neg_samples = neg_inds.size(0)
topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
loss_cls_pos = loss_cls_all[pos_inds].sum()
loss_cls_neg = topk_loss_cls_neg.sum()
loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples
if self.reg_decoded_bbox:
            # When the regression loss (e.g. `IoULoss`, `GIoULoss`)
            # is applied directly to the decoded bounding boxes, the
            # already encoded coordinates are first decoded to absolute format.
bbox_pred = self.bbox_coder.decode(anchor, bbox_pred)
loss_bbox = smooth_l1_loss(
bbox_pred,
bbox_targets,
bbox_weights,
beta=self.train_cfg.smoothl1_beta,
avg_factor=num_total_samples)
return loss_cls[None], loss_bbox
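    # A tiny sketch of the hard negative mining performed in `loss_single`
    # above (values are made up; it assumes `neg_pos_ratio=3` in `train_cfg`):
    # >>> loss_cls_all = torch.tensor([0.9, 0.1, 0.5, 0.7, 0.2, 0.05])
    # >>> pos_inds = torch.tensor([0])
    # >>> neg_inds = torch.tensor([1, 2, 3, 4, 5])
    # >>> num_neg = min(3 * pos_inds.numel(), neg_inds.numel())
    # >>> loss_cls_all[neg_inds].topk(num_neg).values  # the 3 hardest negatives
    # tensor([0.7000, 0.5000, 0.2000])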
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute losses of the head.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
            gt_bboxes (list[Tensor]): Each item is the ground-truth boxes of an
                image, in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
cls_reg_targets = self.get_targets(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=1,
unmap_outputs=False)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
num_images = len(img_metas)
all_cls_scores = torch.cat([
s.permute(0, 2, 3, 1).reshape(
num_images, -1, self.cls_out_channels) for s in cls_scores
], 1)
all_labels = torch.cat(labels_list, -1).view(num_images, -1)
all_label_weights = torch.cat(label_weights_list,
-1).view(num_images, -1)
all_bbox_preds = torch.cat([
b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
for b in bbox_preds
], -2)
all_bbox_targets = torch.cat(bbox_targets_list,
-2).view(num_images, -1, 4)
all_bbox_weights = torch.cat(bbox_weights_list,
-2).view(num_images, -1, 4)
# concat all level anchors to a single tensor
all_anchors = []
for i in range(num_images):
all_anchors.append(torch.cat(anchor_list[i]))
# check NaN and Inf
assert torch.isfinite(all_cls_scores).all().item(), \
'classification scores become infinite or NaN!'
assert torch.isfinite(all_bbox_preds).all().item(), \
            'bbox predictions become infinite or NaN!'
losses_cls, losses_bbox = multi_apply(
self.loss_single,
all_cls_scores,
all_bbox_preds,
all_anchors,
all_labels,
all_label_weights,
all_bbox_targets,
all_bbox_weights,
num_total_samples=num_total_pos)
return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
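# A minimal usage sketch for the head above (a sketch only, assuming mmdet and
# mmcv with the registries used here are installed; the feature-map sizes
# follow the default SSD300 configuration and are illustrative):
# >>> head = SSDHead()
# >>> feats = [torch.rand(1, c, s, s) for c, s in
# ...          zip((512, 1024, 512, 256, 256, 256), (38, 19, 10, 5, 3, 1))]
# >>> cls_scores, bbox_preds = head(feats)
# >>> len(cls_scores) == len(bbox_preds) == 6
# True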
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear, build_activation_layer
from mmcv.runner import force_fp32
from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
build_assigner, build_sampler, multi_apply,
reduce_mean)
from mmdet.models.utils import (FFN, build_positional_encoding,
build_transformer)
from ..builder import HEADS, build_loss
from .anchor_free_head import AnchorFreeHead
@HEADS.register_module()
class TransformerHead(AnchorFreeHead):
"""Implements the DETR transformer head.
See `paper: End-to-End Object Detection with Transformers
<https://arxiv.org/pdf/2005.12872>`_ for details.
Args:
num_classes (int): Number of categories excluding the background.
in_channels (int): Number of channels in the input feature map.
num_fcs (int, optional): Number of fully-connected layers used in
`FFN`, which is then used for the regression head. Default 2.
transformer (dict, optional): Config for transformer.
positional_encoding (dict, optional): Config for position encoding.
loss_cls (dict, optional): Config of the classification loss.
Default `CrossEntropyLoss`.
loss_bbox (dict, optional): Config of the regression loss.
Default `L1Loss`.
loss_iou (dict, optional): Config of the regression iou loss.
Default `GIoULoss`.
        train_cfg (dict, optional): Training config of transformer head.
test_cfg (dict, optional): Testing config of transformer head.
    Example:
        >>> import torch
        >>> self = TransformerHead(80, 2048)
        >>> feats = (torch.rand(1, 2048, 32, 32), )
        >>> img_metas = [dict(batch_input_shape=(512, 512),
        ...                   img_shape=(480, 512, 3))]
        >>> all_cls_scores_list, all_bbox_preds_list = self(feats, img_metas)
"""
def __init__(self,
num_classes,
in_channels,
num_fcs=2,
transformer=dict(
type='Transformer',
embed_dims=256,
num_heads=8,
num_encoder_layers=6,
num_decoder_layers=6,
feedforward_channels=2048,
dropout=0.1,
act_cfg=dict(type='ReLU', inplace=True),
norm_cfg=dict(type='LN'),
num_fcs=2,
pre_norm=False,
return_intermediate_dec=True),
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=128,
normalize=True),
loss_cls=dict(
type='CrossEntropyLoss',
bg_cls_weight=0.1,
use_sigmoid=False,
loss_weight=1.0,
class_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=5.0),
loss_iou=dict(type='GIoULoss', loss_weight=2.0),
train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxL1Cost', weight=5.0),
iou_cost=dict(
type='IoUCost', iou_mode='giou', weight=2.0))),
test_cfg=dict(max_per_img=100),
**kwargs):
# NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
# since it brings inconvenience when the initialization of
# `AnchorFreeHead` is called.
super(AnchorFreeHead, self).__init__()
use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
assert not use_sigmoid_cls, 'setting use_sigmoid_cls as True is ' \
'not supported in DETR, since background is needed for the ' \
'matching process.'
assert 'embed_dims' in transformer \
and 'num_feats' in positional_encoding
num_feats = positional_encoding['num_feats']
embed_dims = transformer['embed_dims']
assert num_feats * 2 == embed_dims, 'embed_dims should' \
f' be exactly 2 times of num_feats. Found {embed_dims}' \
f' and {num_feats}.'
assert test_cfg is not None and 'max_per_img' in test_cfg
class_weight = loss_cls.get('class_weight', None)
if class_weight is not None:
assert isinstance(class_weight, float), 'Expected ' \
'class_weight to have type float. Found ' \
f'{type(class_weight)}.'
            # NOTE following the official DETR repo, bg_cls_weight means the
            # relative classification weight of the no-object class.
bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
assert isinstance(bg_cls_weight, float), 'Expected ' \
'bg_cls_weight to have type float. Found ' \
f'{type(bg_cls_weight)}.'
class_weight = torch.ones(num_classes + 1) * class_weight
            # set background class as the last index
class_weight[num_classes] = bg_cls_weight
loss_cls.update({'class_weight': class_weight})
if 'bg_cls_weight' in loss_cls:
loss_cls.pop('bg_cls_weight')
self.bg_cls_weight = bg_cls_weight
if train_cfg:
assert 'assigner' in train_cfg, 'assigner should be provided '\
'when train_cfg is set.'
assigner = train_cfg['assigner']
            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
                'The classification weight for loss and matcher should be ' \
                'exactly the same.'
assert loss_bbox['loss_weight'] == assigner['reg_cost'][
'weight'], 'The regression L1 weight for loss and matcher ' \
'should be exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
                'The regression iou weight for loss and matcher should be ' \
                'exactly the same.'
self.assigner = build_assigner(assigner)
# DETR sampling=False, so use PseudoSampler
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.num_classes = num_classes
self.cls_out_channels = num_classes + 1
self.in_channels = in_channels
self.num_fcs = num_fcs
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.use_sigmoid_cls = use_sigmoid_cls
self.embed_dims = embed_dims
self.num_query = test_cfg['max_per_img']
self.fp16_enabled = False
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_iou = build_loss(loss_iou)
self.act_cfg = transformer.get('act_cfg',
dict(type='ReLU', inplace=True))
self.activate = build_activation_layer(self.act_cfg)
self.positional_encoding = build_positional_encoding(
positional_encoding)
self.transformer = build_transformer(transformer)
self._init_layers()
def _init_layers(self):
"""Initialize layers of the transformer head."""
self.input_proj = Conv2d(
self.in_channels, self.embed_dims, kernel_size=1)
self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
self.reg_ffn = FFN(
self.embed_dims,
self.embed_dims,
self.num_fcs,
self.act_cfg,
dropout=0.0,
add_residual=False)
self.fc_reg = Linear(self.embed_dims, 4)
self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)
def init_weights(self, distribution='uniform'):
"""Initialize weights of the transformer head."""
# The initialization for transformer is important
self.transformer.init_weights()
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
"""load checkpoints."""
# NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
# since `AnchorFreeHead._load_from_state_dict` should not be
# called here. Invoking the default `Module._load_from_state_dict`
# is enough.
super(AnchorFreeHead,
self)._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys,
unexpected_keys, error_msgs)
def forward(self, feats, img_metas):
"""Forward function.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): List of image information.
Returns:
tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
- all_cls_scores_list (list[Tensor]): Classification scores \
for each scale level. Each is a 4D-tensor with shape \
[nb_dec, bs, num_query, cls_out_channels]. Note \
                    `cls_out_channels` should include the background.
- all_bbox_preds_list (list[Tensor]): Sigmoid regression \
outputs for each scale level. Each is a 4D-tensor with \
normalized coordinate format (cx, cy, w, h) and shape \
[nb_dec, bs, num_query, 4].
"""
num_levels = len(feats)
img_metas_list = [img_metas for _ in range(num_levels)]
return multi_apply(self.forward_single, feats, img_metas_list)
def forward_single(self, x, img_metas):
""""Forward function for a single feature level.
Args:
x (Tensor): Input feature from backbone's single stage, shape
[bs, c, h, w].
img_metas (list[dict]): List of image information.
Returns:
all_cls_scores (Tensor): Outputs from the classification head,
shape [nb_dec, bs, num_query, cls_out_channels]. Note
                cls_out_channels should include the background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression
head with normalized coordinate format (cx, cy, w, h).
Shape [nb_dec, bs, num_query, 4].
"""
        # construct binary masks which are used for the transformer.
        # NOTE following the official DETR repo, non-zero values represent
        # ignored positions, while zero values mean valid positions.
batch_size = x.size(0)
input_img_h, input_img_w = img_metas[0]['batch_input_shape']
masks = x.new_ones((batch_size, input_img_h, input_img_w))
for img_id in range(batch_size):
img_h, img_w, _ = img_metas[img_id]['img_shape']
masks[img_id, :img_h, :img_w] = 0
x = self.input_proj(x)
# interpolate masks to have the same spatial shape with x
masks = F.interpolate(
masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
# position encoding
pos_embed = self.positional_encoding(masks) # [bs, embed_dim, h, w]
# outs_dec: [nb_dec, bs, num_query, embed_dim]
outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
pos_embed)
all_cls_scores = self.fc_cls(outs_dec)
all_bbox_preds = self.fc_reg(self.activate(
self.reg_ffn(outs_dec))).sigmoid()
return all_cls_scores, all_bbox_preds
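    # A small sketch of the padding-mask convention used in `forward_single`
    # above (illustrative only: one 2x3 image padded into a 2x4 batch canvas;
    # valid pixels become 0, padded pixels stay 1):
    # >>> masks = torch.ones(1, 2, 4)
    # >>> masks[0, :2, :3] = 0
    # >>> masks.bool()
    # tensor([[[False, False, False,  True],
    #          [False, False, False,  True]]])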
@force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
def loss(self,
all_cls_scores_list,
all_bbox_preds_list,
gt_bboxes_list,
gt_labels_list,
img_metas,
gt_bboxes_ignore=None):
""""Loss function.
Only outputs from the last feature level are used for computing
losses by default.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
        # NOTE by default only the outputs from the last feature scale are used.
all_cls_scores = all_cls_scores_list[-1]
all_bbox_preds = all_bbox_preds_list[-1]
assert gt_bboxes_ignore is None, \
            'Only supports gt_bboxes_ignore being set to None.'
num_dec_layers = len(all_cls_scores)
all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
all_gt_bboxes_ignore_list = [
gt_bboxes_ignore for _ in range(num_dec_layers)
]
img_metas_list = [img_metas for _ in range(num_dec_layers)]
losses_cls, losses_bbox, losses_iou = multi_apply(
self.loss_single, all_cls_scores, all_bbox_preds,
all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
all_gt_bboxes_ignore_list)
loss_dict = dict()
# loss from the last decoder layer
loss_dict['loss_cls'] = losses_cls[-1]
loss_dict['loss_bbox'] = losses_bbox[-1]
loss_dict['loss_iou'] = losses_iou[-1]
# loss from other decoder layers
num_dec_layer = 0
for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
losses_bbox[:-1],
losses_iou[:-1]):
loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
num_dec_layer += 1
return loss_dict
def loss_single(self,
cls_scores,
bbox_preds,
gt_bboxes_list,
gt_labels_list,
img_metas,
gt_bboxes_ignore_list=None):
""""Loss function for outputs from a single decoder layer of a single
feature level.
Args:
cls_scores (Tensor): Box score logits from a single decoder layer
for all images. Shape [bs, num_query, cls_out_channels].
bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
for all images, with normalized coordinate (cx, cy, w, h) and
shape [bs, num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components for outputs from
a single decoder layer.
"""
num_imgs = cls_scores.size(0)
cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
gt_bboxes_list, gt_labels_list,
img_metas, gt_bboxes_ignore_list)
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg) = cls_reg_targets
labels = torch.cat(labels_list, 0)
label_weights = torch.cat(label_weights_list, 0)
bbox_targets = torch.cat(bbox_targets_list, 0)
bbox_weights = torch.cat(bbox_weights_list, 0)
# classification loss
cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor = num_total_pos * 1.0 + \
num_total_neg * self.bg_cls_weight
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
num_total_pos = loss_cls.new_tensor([num_total_pos])
num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
# construct factors used for rescale bboxes
factors = []
for img_meta, bbox_pred in zip(img_metas, bbox_preds):
img_h, img_w, _ = img_meta['img_shape']
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0).repeat(
bbox_pred.size(0), 1)
factors.append(factor)
factors = torch.cat(factors, 0)
        # DETR regresses the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them back for calculating the IoU loss.
bbox_preds = bbox_preds.reshape(-1, 4)
bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
        # regression IoU loss, GIoU loss by default
loss_iou = self.loss_iou(
bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
# regression L1 loss
loss_bbox = self.loss_bbox(
bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
return loss_cls, loss_bbox, loss_iou
def get_targets(self,
cls_scores_list,
bbox_preds_list,
gt_bboxes_list,
gt_labels_list,
img_metas,
gt_bboxes_ignore_list=None):
""""Compute regression and classification targets for a batch image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_scores_list (list[Tensor]): Box score logits from a single
decoder layer for each image with shape [num_query,
cls_out_channels].
bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
decoder layer for each image, with normalized coordinate
(cx, cy, w, h) and shape [num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
tuple: a tuple containing the following targets.
- labels_list (list[Tensor]): Labels for all images.
- label_weights_list (list[Tensor]): Label weights for all \
images.
- bbox_targets_list (list[Tensor]): BBox targets for all \
images.
- bbox_weights_list (list[Tensor]): BBox weights for all \
images.
- num_total_pos (int): Number of positive samples in all \
images.
- num_total_neg (int): Number of negative samples in all \
images.
"""
assert gt_bboxes_ignore_list is None, \
            'Only supports gt_bboxes_ignore being set to None.'
num_imgs = len(cls_scores_list)
gt_bboxes_ignore_list = [
gt_bboxes_ignore_list for _ in range(num_imgs)
]
(labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
self._get_target_single, cls_scores_list, bbox_preds_list,
gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
num_total_pos = sum((inds.numel() for inds in pos_inds_list))
num_total_neg = sum((inds.numel() for inds in neg_inds_list))
return (labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, num_total_pos, num_total_neg)
def _get_target_single(self,
cls_score,
bbox_pred,
gt_bboxes,
gt_labels,
img_meta,
gt_bboxes_ignore=None):
""""Compute regression and classification targets for one image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_score (Tensor): Box score logits from a single decoder layer
for one image. Shape [num_query, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
for one image, with normalized coordinate (cx, cy, w, h) and
shape [num_query, 4].
gt_bboxes (Tensor): Ground truth bboxes for one image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (Tensor): Ground truth class indices for one image
with shape (num_gts, ).
img_meta (dict): Meta information for one image.
gt_bboxes_ignore (Tensor, optional): Bounding boxes
which can be ignored. Default None.
Returns:
tuple[Tensor]: a tuple containing the following for one image.
- labels (Tensor): Labels of each image.
                - label_weights (Tensor): Label weights of each image.
- bbox_targets (Tensor): BBox targets of each image.
- bbox_weights (Tensor): BBox weights of each image.
- pos_inds (Tensor): Sampled positive indices for each image.
- neg_inds (Tensor): Sampled negative indices for each image.
"""
num_bboxes = bbox_pred.size(0)
# assigner and sampler
assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
gt_labels, img_meta,
gt_bboxes_ignore)
sampling_result = self.sampler.sample(assign_result, bbox_pred,
gt_bboxes)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
# label targets
labels = gt_bboxes.new_full((num_bboxes, ),
self.num_classes,
dtype=torch.long)
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
label_weights = gt_bboxes.new_ones(num_bboxes)
# bbox targets
bbox_targets = torch.zeros_like(bbox_pred)
bbox_weights = torch.zeros_like(bbox_pred)
bbox_weights[pos_inds] = 1.0
img_h, img_w, _ = img_meta['img_shape']
        # DETR regresses the relative position of boxes (cxcywh) in the image.
        # Thus the learning target should be normalized by the image size, and
        # the box format should be converted from the default x1y1x2y2 to cxcywh.
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0)
pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
bbox_targets[pos_inds] = pos_gt_bboxes_targets
return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
neg_inds)
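    # A tiny worked example of the normalization above (a sketch assuming a
    # 640x480 image and one matched ground-truth box in x1y1x2y2 format):
    # >>> factor = torch.tensor([[640., 480., 640., 480.]])
    # >>> gt = torch.tensor([[32., 48., 96., 144.]])
    # >>> bbox_xyxy_to_cxcywh(gt / factor)
    # tensor([[0.1000, 0.2000, 0.1000, 0.2000]])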
    # overridden because img_metas are needed as inputs for bbox_head.
def forward_train(self,
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_ignore=None,
proposal_cfg=None,
**kwargs):
"""Forward function for training mode.
Args:
x (list[Tensor]): Features from backbone.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (Tensor): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (Tensor): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_ignore (Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert proposal_cfg is None, '"proposal_cfg" must be None'
outs = self(x, img_metas)
if gt_labels is None:
loss_inputs = outs + (gt_bboxes, img_metas)
else:
loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
@force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
def get_bboxes(self,
all_cls_scores_list,
all_bbox_preds_list,
img_metas,
rescale=False):
"""Transform network outputs for a batch into bbox predictions.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
img_metas (list[dict]): Meta information of each image.
rescale (bool, optional): If True, return boxes in original
                image space. Default False.
Returns:
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
The first item is an (n, 5) tensor, where the first 4 columns \
are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
5-th column is a score between 0 and 1. The second item is a \
(n,) tensor where each item is the predicted class label of \
the corresponding box.
"""
        # NOTE by default only outputs from the last feature level are used,
        # and only the outputs from the last decoder layer are used.
cls_scores = all_cls_scores_list[-1][-1]
bbox_preds = all_bbox_preds_list[-1][-1]
result_list = []
for img_id in range(len(img_metas)):
cls_score = cls_scores[img_id]
bbox_pred = bbox_preds[img_id]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
proposals = self._get_bboxes_single(cls_score, bbox_pred,
img_shape, scale_factor,
rescale)
result_list.append(proposals)
return result_list
def _get_bboxes_single(self,
cls_score,
bbox_pred,
img_shape,
scale_factor,
rescale=False):
"""Transform outputs from the last decoder layer into bbox predictions
for each image.
Args:
cls_score (Tensor): Box score logits from the last decoder layer
for each image. Shape [num_query, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
for each image, with coordinate format (cx, cy, w, h) and
shape [num_query, 4].
img_shape (tuple[int]): Shape of input image, (height, width, 3).
            scale_factor (ndarray, optional): Scale factor of the image,
                arranged as (w_scale, h_scale, w_scale, h_scale).
rescale (bool, optional): If True, return boxes in original image
space. Default False.
Returns:
tuple[Tensor]: Results of detected bboxes and labels.
- det_bboxes: Predicted bboxes with shape [num_query, 5], \
where the first 4 columns are bounding box positions \
                (tl_x, tl_y, br_x, br_y) and the 5-th column is the score \
between 0 and 1.
- det_labels: Predicted labels of the corresponding box with \
shape [num_query].
"""
assert len(cls_score) == len(bbox_pred)
# exclude background
scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
if rescale:
det_bboxes /= det_bboxes.new_tensor(scale_factor)
det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)
return det_bboxes, det_labels
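# A minimal sketch of the score/label extraction used in `_get_bboxes_single`
# above (illustrative only: three foreground classes plus background and a
# single query; the logits are made up):
# >>> logits = torch.tensor([[0.2, 2.0, 0.1, 0.5]])  # last column = background
# >>> scores, labels = F.softmax(logits, dim=-1)[..., :-1].max(-1)
# >>> labels
# tensor([1])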
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, Scale, bias_init_with_prob, normal_init
from mmcv.ops import DeformConv2d
from mmcv.runner import force_fp32
from mmdet.core import (bbox2distance, bbox_overlaps, build_anchor_generator,
build_assigner, build_sampler, distance2bbox,
multi_apply, multiclass_nms, reduce_mean)
from ..builder import HEADS, build_loss
from .atss_head import ATSSHead
from .fcos_head import FCOSHead
INF = 1e8
@HEADS.register_module()
class VFNetHead(ATSSHead, FCOSHead):
"""Head of `VarifocalNet (VFNet): An IoU-aware Dense Object
Detector.<https://arxiv.org/abs/2008.13367>`_.
The VFNet predicts IoU-aware classification scores which mix the
object presence confidence and object localization accuracy as the
detection score. It is built on the FCOS architecture and uses ATSS
for defining positive/negative training examples. The VFNet is trained
    with Varifocal Loss and employs star-shaped deformable convolution to
extract features for a bbox.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
regress_ranges (tuple[tuple[int, int]]): Regress range of multiple
level points.
center_sampling (bool): If true, use center sampling. Default: False.
center_sample_radius (float): Radius of center sampling. Default: 1.5.
sync_num_pos (bool): If true, synchronize the number of positive
examples across GPUs. Default: True
gradient_mul (float): The multiplier to gradients from bbox refinement
and recognition. Default: 0.1.
bbox_norm_type (str): The bbox normalization type, 'reg_denom' or
'stride'. Default: reg_denom
loss_cls_fl (dict): Config of focal loss.
use_vfl (bool): If true, use varifocal loss for training.
Default: True.
loss_cls (dict): Config of varifocal loss.
loss_bbox (dict): Config of localization loss, GIoU Loss.
        loss_bbox_refine (dict): Config of localization refinement loss,
            GIoU Loss.
norm_cfg (dict): dictionary to construct and config norm layer.
Default: norm_cfg=dict(type='GN', num_groups=32,
requires_grad=True).
use_atss (bool): If true, use ATSS to define positive/negative
examples. Default: True.
anchor_generator (dict): Config of anchor generator for ATSS.
Example:
>>> self = VFNetHead(11, 7)
>>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]]
        >>> cls_score, bbox_pred, bbox_pred_refine = self.forward(feats)
>>> assert len(cls_score) == len(self.scales)
""" # noqa: E501
def __init__(self,
num_classes,
in_channels,
regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
(512, INF)),
center_sampling=False,
center_sample_radius=1.5,
sync_num_pos=True,
gradient_mul=0.1,
bbox_norm_type='reg_denom',
loss_cls_fl=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
use_vfl=True,
loss_cls=dict(
type='VarifocalLoss',
use_sigmoid=True,
alpha=0.75,
gamma=2.0,
iou_weighted=True,
loss_weight=1.0),
loss_bbox=dict(type='GIoULoss', loss_weight=1.5),
loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0),
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
use_atss=True,
anchor_generator=dict(
type='AnchorGenerator',
ratios=[1.0],
octave_base_scale=8,
scales_per_octave=1,
center_offset=0.0,
strides=[8, 16, 32, 64, 128]),
**kwargs):
# dcn base offsets, adapted from reppoints_head.py
self.num_dconv_points = 9
self.dcn_kernel = int(np.sqrt(self.num_dconv_points))
self.dcn_pad = int((self.dcn_kernel - 1) / 2)
dcn_base = np.arange(-self.dcn_pad,
self.dcn_pad + 1).astype(np.float64)
dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(
(-1))
self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1)
super(FCOSHead, self).__init__(
num_classes, in_channels, norm_cfg=norm_cfg, **kwargs)
self.regress_ranges = regress_ranges
self.reg_denoms = [
regress_range[-1] for regress_range in regress_ranges
]
self.reg_denoms[-1] = self.reg_denoms[-2] * 2
self.center_sampling = center_sampling
self.center_sample_radius = center_sample_radius
self.sync_num_pos = sync_num_pos
self.bbox_norm_type = bbox_norm_type
self.gradient_mul = gradient_mul
self.use_vfl = use_vfl
if self.use_vfl:
self.loss_cls = build_loss(loss_cls)
else:
self.loss_cls = build_loss(loss_cls_fl)
self.loss_bbox = build_loss(loss_bbox)
self.loss_bbox_refine = build_loss(loss_bbox_refine)
# for getting ATSS targets
self.use_atss = use_atss
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
self.anchor_generator = build_anchor_generator(anchor_generator)
self.anchor_center_offset = anchor_generator['center_offset']
self.num_anchors = self.anchor_generator.num_base_anchors[0]
self.sampling = False
if self.train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
def _init_layers(self):
"""Initialize layers of the head."""
super(FCOSHead, self)._init_cls_convs()
super(FCOSHead, self)._init_reg_convs()
self.relu = nn.ReLU(inplace=True)
self.vfnet_reg_conv = ConvModule(
self.feat_channels,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.conv_bias)
self.vfnet_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
self.vfnet_reg_refine_dconv = DeformConv2d(
self.feat_channels,
self.feat_channels,
self.dcn_kernel,
1,
padding=self.dcn_pad)
self.vfnet_reg_refine = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
self.scales_refine = nn.ModuleList([Scale(1.0) for _ in self.strides])
self.vfnet_cls_dconv = DeformConv2d(
self.feat_channels,
self.feat_channels,
self.dcn_kernel,
1,
padding=self.dcn_pad)
self.vfnet_cls = nn.Conv2d(
self.feat_channels, self.cls_out_channels, 3, padding=1)
def init_weights(self):
"""Initialize weights of the head."""
for m in self.cls_convs:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
for m in self.reg_convs:
if isinstance(m.conv, nn.Conv2d):
normal_init(m.conv, std=0.01)
normal_init(self.vfnet_reg_conv.conv, std=0.01)
normal_init(self.vfnet_reg, std=0.01)
normal_init(self.vfnet_reg_refine_dconv, std=0.01)
normal_init(self.vfnet_reg_refine, std=0.01)
normal_init(self.vfnet_cls_dconv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.vfnet_cls, std=0.01, bias=bias_cls)
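    # Note: `bias_init_with_prob(0.01)` returns -log((1 - p) / p) ~= -4.6, so
    # the classification branch starts out predicting roughly a 1% foreground
    # probability, the usual prior-probability initialization for focal-style
    # losses.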
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple:
cls_scores (list[Tensor]): Box iou-aware scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box offsets for each
scale level, each is a 4D-tensor, the channel number is
num_points * 4.
bbox_preds_refine (list[Tensor]): Refined Box offsets for
each scale level, each is a 4D-tensor, the channel
number is num_points * 4.
"""
return multi_apply(self.forward_single, feats, self.scales,
self.scales_refine, self.strides, self.reg_denoms)
def forward_single(self, x, scale, scale_refine, stride, reg_denom):
"""Forward features of a single scale level.
Args:
x (Tensor): FPN feature maps of the specified stride.
scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
the bbox prediction.
scale_refine (:obj: `mmcv.cnn.Scale`): Learnable scale module to
resize the refined bbox prediction.
stride (int): The corresponding stride for feature maps,
used to normalize the bbox prediction when
bbox_norm_type = 'stride'.
reg_denom (int): The corresponding regression range for feature
maps, only used to normalize the bbox prediction when
bbox_norm_type = 'reg_denom'.
Returns:
tuple: iou-aware cls scores for each box, bbox predictions and
refined bbox predictions of input feature maps.
"""
cls_feat = x
reg_feat = x
for cls_layer in self.cls_convs:
cls_feat = cls_layer(cls_feat)
for reg_layer in self.reg_convs:
reg_feat = reg_layer(reg_feat)
# predict the bbox_pred of different level
reg_feat_init = self.vfnet_reg_conv(reg_feat)
if self.bbox_norm_type == 'reg_denom':
bbox_pred = scale(
self.vfnet_reg(reg_feat_init)).float().exp() * reg_denom
elif self.bbox_norm_type == 'stride':
bbox_pred = scale(
self.vfnet_reg(reg_feat_init)).float().exp() * stride
else:
raise NotImplementedError
# compute star deformable convolution offsets
        # convert dcn_offset to reg_feat.dtype so that VFNet can be
        # trained with FP16
dcn_offset = self.star_dcn_offset(bbox_pred, self.gradient_mul,
stride).to(reg_feat.dtype)
# refine the bbox_pred
reg_feat = self.relu(self.vfnet_reg_refine_dconv(reg_feat, dcn_offset))
bbox_pred_refine = scale_refine(
self.vfnet_reg_refine(reg_feat)).float().exp()
bbox_pred_refine = bbox_pred_refine * bbox_pred.detach()
# predict the iou-aware cls score
cls_feat = self.relu(self.vfnet_cls_dconv(cls_feat, dcn_offset))
cls_score = self.vfnet_cls(cls_feat)
return cls_score, bbox_pred, bbox_pred_refine
def star_dcn_offset(self, bbox_pred, gradient_mul, stride):
"""Compute the star deformable conv offsets.
Args:
bbox_pred (Tensor): Predicted bbox distance offsets (l, r, t, b).
gradient_mul (float): Gradient multiplier.
stride (int): The corresponding stride for feature maps,
used to project the bbox onto the feature map.
Returns:
dcn_offsets (Tensor): The offsets for deformable convolution.
"""
dcn_base_offset = self.dcn_base_offset.type_as(bbox_pred)
bbox_pred_grad_mul = (1 - gradient_mul) * bbox_pred.detach() + \
gradient_mul * bbox_pred
# map to the feature map scale
bbox_pred_grad_mul = bbox_pred_grad_mul / stride
N, C, H, W = bbox_pred.size()
x1 = bbox_pred_grad_mul[:, 0, :, :]
y1 = bbox_pred_grad_mul[:, 1, :, :]
x2 = bbox_pred_grad_mul[:, 2, :, :]
y2 = bbox_pred_grad_mul[:, 3, :, :]
bbox_pred_grad_mul_offset = bbox_pred.new_zeros(
N, 2 * self.num_dconv_points, H, W)
bbox_pred_grad_mul_offset[:, 0, :, :] = -1.0 * y1 # -y1
bbox_pred_grad_mul_offset[:, 1, :, :] = -1.0 * x1 # -x1
bbox_pred_grad_mul_offset[:, 2, :, :] = -1.0 * y1 # -y1
bbox_pred_grad_mul_offset[:, 4, :, :] = -1.0 * y1 # -y1
bbox_pred_grad_mul_offset[:, 5, :, :] = x2 # x2
bbox_pred_grad_mul_offset[:, 7, :, :] = -1.0 * x1 # -x1
bbox_pred_grad_mul_offset[:, 11, :, :] = x2 # x2
bbox_pred_grad_mul_offset[:, 12, :, :] = y2 # y2
bbox_pred_grad_mul_offset[:, 13, :, :] = -1.0 * x1 # -x1
bbox_pred_grad_mul_offset[:, 14, :, :] = y2 # y2
bbox_pred_grad_mul_offset[:, 16, :, :] = y2 # y2
bbox_pred_grad_mul_offset[:, 17, :, :] = x2 # x2
dcn_offset = bbox_pred_grad_mul_offset - dcn_base_offset
return dcn_offset
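    # Layout of the star offsets built above, for one location: x1, y1, x2, y2
    # are the stride-normalized distances to the left/top/right/bottom borders
    # of the predicted box. The nine 3x3 sampling points are moved onto the
    # following (y, x) positions:
    #
    #   (-y1, -x1)  (-y1, 0)  (-y1, x2)
    #   (  0, -x1)  (  0, 0)  (  0, x2)
    #   ( y2, -x1)  ( y2, 0)  ( y2, x2)
    #
    # i.e. the corners, edge midpoints and centre of the box; `dcn_base_offset`
    # is then subtracted so the result is relative to the default 3x3 grid.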
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'bbox_preds_refine'))
def loss(self,
cls_scores,
bbox_preds,
bbox_preds_refine,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
cls_scores (list[Tensor]): Box iou-aware scores for each scale
level, each is a 4D-tensor, the channel number is
num_points * num_classes.
bbox_preds (list[Tensor]): Box offsets for each
scale level, each is a 4D-tensor, the channel number is
num_points * 4.
bbox_preds_refine (list[Tensor]): Refined Box offsets for
each scale level, each is a 4D-tensor, the channel
number is num_points * 4.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Default: None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
labels, label_weights, bbox_targets, bbox_weights = self.get_targets(
cls_scores, all_level_points, gt_bboxes, gt_labels, img_metas,
gt_bboxes_ignore)
num_imgs = cls_scores[0].size(0)
# flatten cls_scores, bbox_preds and bbox_preds_refine
flatten_cls_scores = [
cls_score.permute(0, 2, 3,
1).reshape(-1,
self.cls_out_channels).contiguous()
for cls_score in cls_scores
]
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
for bbox_pred in bbox_preds
]
flatten_bbox_preds_refine = [
bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous()
for bbox_pred_refine in bbox_preds_refine
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine)
flatten_labels = torch.cat(labels)
flatten_bbox_targets = torch.cat(bbox_targets)
# repeat points to align with bbox_preds
flatten_points = torch.cat(
[points.repeat(num_imgs, 1) for points in all_level_points])
# FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
bg_class_ind = self.num_classes
pos_inds = torch.where(
((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0]
num_pos = len(pos_inds)
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds]
pos_labels = flatten_labels[pos_inds]
# sync num_pos across all gpus
if self.sync_num_pos:
num_pos_avg_per_gpu = reduce_mean(
pos_inds.new_tensor(num_pos).float()).item()
num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0)
else:
num_pos_avg_per_gpu = num_pos
if num_pos > 0:
pos_bbox_targets = flatten_bbox_targets[pos_inds]
pos_points = flatten_points[pos_inds]
pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
pos_decoded_target_preds = distance2bbox(pos_points,
pos_bbox_targets)
iou_targets_ini = bbox_overlaps(
pos_decoded_bbox_preds,
pos_decoded_target_preds.detach(),
is_aligned=True).clamp(min=1e-6)
bbox_weights_ini = iou_targets_ini.clone().detach()
iou_targets_ini_avg_per_gpu = reduce_mean(
bbox_weights_ini.sum()).item()
bbox_avg_factor_ini = max(iou_targets_ini_avg_per_gpu, 1.0)
loss_bbox = self.loss_bbox(
pos_decoded_bbox_preds,
pos_decoded_target_preds.detach(),
weight=bbox_weights_ini,
avg_factor=bbox_avg_factor_ini)
pos_decoded_bbox_preds_refine = \
distance2bbox(pos_points, pos_bbox_preds_refine)
iou_targets_rf = bbox_overlaps(
pos_decoded_bbox_preds_refine,
pos_decoded_target_preds.detach(),
is_aligned=True).clamp(min=1e-6)
bbox_weights_rf = iou_targets_rf.clone().detach()
iou_targets_rf_avg_per_gpu = reduce_mean(
bbox_weights_rf.sum()).item()
bbox_avg_factor_rf = max(iou_targets_rf_avg_per_gpu, 1.0)
loss_bbox_refine = self.loss_bbox_refine(
pos_decoded_bbox_preds_refine,
pos_decoded_target_preds.detach(),
weight=bbox_weights_rf,
avg_factor=bbox_avg_factor_rf)
# build IoU-aware cls_score targets
if self.use_vfl:
pos_ious = iou_targets_rf.clone().detach()
cls_iou_targets = torch.zeros_like(flatten_cls_scores)
cls_iou_targets[pos_inds, pos_labels] = pos_ious
else:
loss_bbox = pos_bbox_preds.sum() * 0
loss_bbox_refine = pos_bbox_preds_refine.sum() * 0
if self.use_vfl:
cls_iou_targets = torch.zeros_like(flatten_cls_scores)
if self.use_vfl:
loss_cls = self.loss_cls(
flatten_cls_scores,
cls_iou_targets,
avg_factor=num_pos_avg_per_gpu)
else:
loss_cls = self.loss_cls(
flatten_cls_scores,
flatten_labels,
weight=label_weights,
avg_factor=num_pos_avg_per_gpu)
return dict(
loss_cls=loss_cls,
loss_bbox=loss_bbox,
loss_bbox_rf=loss_bbox_refine)
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'bbox_preds_refine'))
def get_bboxes(self,
cls_scores,
bbox_preds,
bbox_preds_refine,
img_metas,
cfg=None,
rescale=None,
with_nms=True):
"""Transform network outputs for a batch into bbox predictions.
Args:
cls_scores (list[Tensor]): Box iou-aware scores for each scale
level with shape (N, num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box offsets for each scale
level with shape (N, num_points * 4, H, W).
bbox_preds_refine (list[Tensor]): Refined Box offsets for
each scale level with shape (N, num_points * 4, H, W).
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used. Default: None.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before returning boxes.
Default: True.
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
The first item is an (n, 5) tensor, where the first 4 columns
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
5-th column is a score between 0 and 1. The second item is a
(n,) tensor where each item is the predicted class label of
the corresponding box.
"""
assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds_refine[i][img_id].detach()
for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
det_bboxes = self._get_bboxes_single(cls_score_list,
bbox_pred_list, mlvl_points,
img_shape, scale_factor, cfg,
rescale, with_nms)
result_list.append(det_bboxes)
return result_list
def _get_bboxes_single(self,
cls_scores,
bbox_preds,
mlvl_points,
img_shape,
scale_factor,
cfg,
rescale=False,
with_nms=True):
"""Transform outputs for a single batch item into bbox predictions.
Args:
cls_scores (list[Tensor]): Box iou-aware scores for a single scale
level with shape (num_points * num_classes, H, W).
bbox_preds (list[Tensor]): Box offsets for a single scale
level with shape (num_points * 4, H, W).
mlvl_points (list[Tensor]): Box reference for a single scale level
with shape (num_total_points, 4).
img_shape (tuple[int]): Shape of the input image,
(height, width, 3).
            scale_factor (ndarray): Scale factor of the image, arranged as
(w_scale, h_scale, w_scale, h_scale).
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before returning boxes.
Default: True.
Returns:
tuple(Tensor):
det_bboxes (Tensor): BBox predictions in shape (n, 5), where
the first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column is a score
between 0 and 1.
det_labels (Tensor): A (n,) tensor where each item is the
predicted class label of the corresponding box.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
mlvl_bboxes = []
mlvl_scores = []
for cls_score, bbox_pred, points in zip(cls_scores, bbox_preds,
mlvl_points):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).contiguous().sigmoid()
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4).contiguous()
nms_pre = cfg.get('nms_pre', -1)
if 0 < nms_pre < scores.shape[0]:
max_scores, _ = scores.max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
points = points[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
        # remember that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
if with_nms:
det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
cfg.score_thr, cfg.nms,
cfg.max_per_img)
return det_bboxes, det_labels
else:
return mlvl_bboxes, mlvl_scores
def _get_points_single(self,
featmap_size,
stride,
dtype,
device,
flatten=False):
"""Get points according to feature map sizes."""
h, w = featmap_size
x_range = torch.arange(
0, w * stride, stride, dtype=dtype, device=device)
y_range = torch.arange(
0, h * stride, stride, dtype=dtype, device=device)
y, x = torch.meshgrid(y_range, x_range)
# to be compatible with anchor points in ATSS
if self.use_atss:
points = torch.stack(
(x.reshape(-1), y.reshape(-1)), dim=-1) + \
stride * self.anchor_center_offset
else:
points = torch.stack(
(x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2
return points
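    # Worked example for a 2x2 feature map with stride 8: the raw grid is
    # [[0, 0], [8, 0], [0, 8], [8, 8]]; with the default ATSS
    # `center_offset=0.0` it is kept as is, while the FCOS branch shifts it by
    # stride // 2 to [[4, 4], [12, 4], [4, 12], [12, 12]].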
def get_targets(self, cls_scores, mlvl_points, gt_bboxes, gt_labels,
img_metas, gt_bboxes_ignore):
"""A wrapper for computing ATSS and FCOS targets for points in multiple
images.
Args:
cls_scores (list[Tensor]): Box iou-aware scores for each scale
level with shape (N, num_points * num_classes, H, W).
mlvl_points (list[Tensor]): Points of each fpn level, each has
shape (num_points, 2).
gt_bboxes (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
Returns:
tuple:
labels_list (list[Tensor]): Labels of each level.
label_weights (Tensor/None): Label weights of all levels.
bbox_targets_list (list[Tensor]): Regression targets of each
level, (l, t, r, b).
bbox_weights (Tensor/None): Bbox weights of all levels.
"""
if self.use_atss:
return self.get_atss_targets(cls_scores, mlvl_points, gt_bboxes,
gt_labels, img_metas,
gt_bboxes_ignore)
else:
self.norm_on_bbox = False
return self.get_fcos_targets(mlvl_points, gt_bboxes, gt_labels)
def _get_target_single(self, *args, **kwargs):
"""Avoid ambiguity in multiple inheritance."""
if self.use_atss:
return ATSSHead._get_target_single(self, *args, **kwargs)
else:
return FCOSHead._get_target_single(self, *args, **kwargs)
def get_fcos_targets(self, points, gt_bboxes_list, gt_labels_list):
"""Compute FCOS regression and classification targets for points in
multiple images.
Args:
points (list[Tensor]): Points of each fpn level, each has shape
(num_points, 2).
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels_list (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
Returns:
tuple:
labels (list[Tensor]): Labels of each level.
label_weights: None, to be compatible with ATSS targets.
bbox_targets (list[Tensor]): BBox targets of each level.
bbox_weights: None, to be compatible with ATSS targets.
"""
labels, bbox_targets = FCOSHead.get_targets(self, points,
gt_bboxes_list,
gt_labels_list)
label_weights = None
bbox_weights = None
return labels, label_weights, bbox_targets, bbox_weights
def get_atss_targets(self,
cls_scores,
mlvl_points,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""A wrapper for computing ATSS targets for points in multiple images.
Args:
cls_scores (list[Tensor]): Box iou-aware scores for each scale
level with shape (N, num_points * num_classes, H, W).
mlvl_points (list[Tensor]): Points of each fpn level, each has
shape (num_points, 2).
gt_bboxes (list[Tensor]): Ground truth bboxes of each image,
each has shape (num_gt, 4).
gt_labels (list[Tensor]): Ground truth labels of each box,
each has shape (num_gt,).
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4). Default: None.
Returns:
tuple:
labels_list (list[Tensor]): Labels of each level.
label_weights (Tensor): Label weights of all levels.
bbox_targets_list (list[Tensor]): Regression targets of each
level, (l, t, r, b).
bbox_weights (Tensor): Bbox weights of all levels.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = ATSSHead.get_targets(
self,
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels,
unmap_outputs=True)
if cls_reg_targets is None:
return None
(anchor_list, labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
bbox_targets_list = [
bbox_targets.reshape(-1, 4) for bbox_targets in bbox_targets_list
]
num_imgs = len(img_metas)
# transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format
bbox_targets_list = self.transform_bbox_targets(
bbox_targets_list, mlvl_points, num_imgs)
labels_list = [labels.reshape(-1) for labels in labels_list]
label_weights_list = [
label_weights.reshape(-1) for label_weights in label_weights_list
]
bbox_weights_list = [
bbox_weights.reshape(-1) for bbox_weights in bbox_weights_list
]
label_weights = torch.cat(label_weights_list)
bbox_weights = torch.cat(bbox_weights_list)
return labels_list, label_weights, bbox_targets_list, bbox_weights
def transform_bbox_targets(self, decoded_bboxes, mlvl_points, num_imgs):
"""Transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format.
Args:
decoded_bboxes (list[Tensor]): Regression targets of each level,
in the form of (x1, y1, x2, y2).
mlvl_points (list[Tensor]): Points of each fpn level, each has
shape (num_points, 2).
num_imgs (int): the number of images in a batch.
Returns:
bbox_targets (list[Tensor]): Regression targets of each level in
the form of (l, t, r, b).
"""
# TODO: Re-implemented in Class PointCoder
assert len(decoded_bboxes) == len(mlvl_points)
num_levels = len(decoded_bboxes)
mlvl_points = [points.repeat(num_imgs, 1) for points in mlvl_points]
bbox_targets = []
for i in range(num_levels):
bbox_target = bbox2distance(mlvl_points[i], decoded_bboxes[i])
bbox_targets.append(bbox_target)
return bbox_targets
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
"""Override the method in the parent class to avoid changing para's
name."""
pass
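# --- Illustrative sketch (not part of the original source) ---
# ``transform_bbox_targets`` above relies on ``bbox2distance`` to convert
# (x1, y1, x2, y2) boxes into (l, t, r, b) distances measured from each
# sampling point. A minimal stand-alone rehearsal of that conversion,
# assuming the usual FCOS-style definition (distances from the point to the
# four box sides); the helper name ``_bbox2distance_demo`` is ours:
import torch

def _bbox2distance_demo(points, bboxes):
    # points: (N, 2) as (px, py); bboxes: (N, 4) as (x1, y1, x2, y2)
    left = points[:, 0] - bboxes[:, 0]
    top = points[:, 1] - bboxes[:, 1]
    right = bboxes[:, 2] - points[:, 0]
    bottom = bboxes[:, 3] - points[:, 1]
    return torch.stack((left, top, right, bottom), dim=-1)

# A point at (50, 60) inside the box (10, 20, 110, 100) yields (40, 40, 60, 40).
assert _bbox2distance_demo(
    torch.tensor([[50., 60.]]),
    torch.tensor([[10., 20., 110., 100.]])).tolist() == [[40., 40., 60., 40.]]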
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, xavier_init
from mmcv.runner import force_fp32
from mmdet.core import build_sampler, fast_nms, images_to_levels, multi_apply
from ..builder import HEADS, build_loss
from .anchor_head import AnchorHead
@HEADS.register_module()
class YOLACTHead(AnchorHead):
"""YOLACT box head used in https://arxiv.org/abs/1904.02689.
Note that YOLACT head is a light version of RetinaNet head.
Four differences are described as follows:
1. YOLACT box head has three-times fewer anchors.
2. YOLACT box head shares the convs for box and cls branches.
3. YOLACT box head uses OHEM instead of Focal loss.
4. YOLACT box head predicts a set of mask coefficients for each box.
Args:
num_classes (int): Number of categories excluding the background
category.
in_channels (int): Number of channels in the input feature map.
anchor_generator (dict): Config dict for anchor generator
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
num_head_convs (int): Number of the conv layers shared by
box and cls branches.
num_protos (int): Number of the mask coefficients.
use_ohem (bool): If true, ``loss_single_OHEM`` will be used for
cls loss calculation. If false, ``loss_single`` will be used.
conv_cfg (dict): Dictionary to construct and config conv layer.
norm_cfg (dict): Dictionary to construct and config norm layer.
"""
def __init__(self,
num_classes,
in_channels,
anchor_generator=dict(
type='AnchorGenerator',
octave_base_scale=3,
scales_per_octave=1,
ratios=[0.5, 1.0, 2.0],
strides=[8, 16, 32, 64, 128]),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
reduction='none',
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0, loss_weight=1.5),
num_head_convs=1,
num_protos=32,
use_ohem=True,
conv_cfg=None,
norm_cfg=None,
**kwargs):
self.num_head_convs = num_head_convs
self.num_protos = num_protos
self.use_ohem = use_ohem
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
super(YOLACTHead, self).__init__(
num_classes,
in_channels,
loss_cls=loss_cls,
loss_bbox=loss_bbox,
anchor_generator=anchor_generator,
**kwargs)
if self.use_ohem:
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.sampling = False
def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.head_convs = nn.ModuleList()
for i in range(self.num_head_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.head_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg))
self.conv_cls = nn.Conv2d(
self.feat_channels,
self.num_anchors * self.cls_out_channels,
3,
padding=1)
self.conv_reg = nn.Conv2d(
self.feat_channels, self.num_anchors * 4, 3, padding=1)
self.conv_coeff = nn.Conv2d(
self.feat_channels,
self.num_anchors * self.num_protos,
3,
padding=1)
def init_weights(self):
"""Initialize weights of the head."""
for m in self.head_convs:
xavier_init(m.conv, distribution='uniform', bias=0)
xavier_init(self.conv_cls, distribution='uniform', bias=0)
xavier_init(self.conv_reg, distribution='uniform', bias=0)
xavier_init(self.conv_coeff, distribution='uniform', bias=0)
def forward_single(self, x):
"""Forward feature of a single scale level.
Args:
x (Tensor): Features of a single scale level.
Returns:
tuple:
cls_score (Tensor): Cls scores for a single scale level \
the channels number is num_anchors * num_classes.
bbox_pred (Tensor): Box energies / deltas for a single scale \
level, the channels number is num_anchors * 4.
coeff_pred (Tensor): Mask coefficients for a single scale \
level, the channels number is num_anchors * num_protos.
"""
for head_conv in self.head_convs:
x = head_conv(x)
cls_score = self.conv_cls(x)
bbox_pred = self.conv_reg(x)
coeff_pred = self.conv_coeff(x).tanh()
return cls_score, bbox_pred, coeff_pred
@force_fp32(apply_to=('cls_scores', 'bbox_preds'))
def loss(self,
cls_scores,
bbox_preds,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""A combination of the func:``AnchorHead.loss`` and
func:``SSDHead.loss``.
When ``self.use_ohem == True``, it functions like ``SSDHead.loss``,
otherwise, it follows ``AnchorHead.loss``. Besides, it additionally
returns ``sampling_results``.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
Has shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): Class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
boxes can be ignored when computing the loss. Default: None
Returns:
tuple:
dict[str, Tensor]: A dictionary of loss components.
List[:obj:``SamplingResult``]: Sampler results for each image.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list, valid_flag_list = self.get_anchors(
featmap_sizes, img_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.get_targets(
anchor_list,
valid_flag_list,
gt_bboxes,
img_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
label_channels=label_channels,
unmap_outputs=not self.use_ohem,
return_sampling_results=True)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
num_total_pos, num_total_neg, sampling_results) = cls_reg_targets
if self.use_ohem:
num_images = len(img_metas)
all_cls_scores = torch.cat([
s.permute(0, 2, 3, 1).reshape(
num_images, -1, self.cls_out_channels) for s in cls_scores
], 1)
all_labels = torch.cat(labels_list, -1).view(num_images, -1)
all_label_weights = torch.cat(label_weights_list,
-1).view(num_images, -1)
all_bbox_preds = torch.cat([
b.permute(0, 2, 3, 1).reshape(num_images, -1, 4)
for b in bbox_preds
], -2)
all_bbox_targets = torch.cat(bbox_targets_list,
-2).view(num_images, -1, 4)
all_bbox_weights = torch.cat(bbox_weights_list,
-2).view(num_images, -1, 4)
# concat all level anchors to a single tensor
all_anchors = []
for i in range(num_images):
all_anchors.append(torch.cat(anchor_list[i]))
# check NaN and Inf
assert torch.isfinite(all_cls_scores).all().item(), \
'classification scores become infinite or NaN!'
assert torch.isfinite(all_bbox_preds).all().item(), \
'bbox predictions become infinite or NaN!'
losses_cls, losses_bbox = multi_apply(
self.loss_single_OHEM,
all_cls_scores,
all_bbox_preds,
all_anchors,
all_labels,
all_label_weights,
all_bbox_targets,
all_bbox_weights,
num_total_samples=num_total_pos)
else:
num_total_samples = (
num_total_pos +
num_total_neg if self.sampling else num_total_pos)
# anchor number of multi levels
num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
# concat all level anchors and flags to a single tensor
concat_anchor_list = []
for i in range(len(anchor_list)):
concat_anchor_list.append(torch.cat(anchor_list[i]))
all_anchor_list = images_to_levels(concat_anchor_list,
num_level_anchors)
losses_cls, losses_bbox = multi_apply(
self.loss_single,
cls_scores,
bbox_preds,
all_anchor_list,
labels_list,
label_weights_list,
bbox_targets_list,
bbox_weights_list,
num_total_samples=num_total_samples)
return dict(
loss_cls=losses_cls, loss_bbox=losses_bbox), sampling_results
def loss_single_OHEM(self, cls_score, bbox_pred, anchors, labels,
label_weights, bbox_targets, bbox_weights,
num_total_samples):
""""See func:``SSDHead.loss``."""
loss_cls_all = self.loss_cls(cls_score, labels, label_weights)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
pos_inds = ((labels >= 0) &
(labels < self.num_classes)).nonzero().reshape(-1)
neg_inds = (labels == self.num_classes).nonzero().view(-1)
num_pos_samples = pos_inds.size(0)
if num_pos_samples == 0:
num_neg_samples = neg_inds.size(0)
else:
num_neg_samples = self.train_cfg.neg_pos_ratio * num_pos_samples
if num_neg_samples > neg_inds.size(0):
num_neg_samples = neg_inds.size(0)
topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples)
loss_cls_pos = loss_cls_all[pos_inds].sum()
loss_cls_neg = topk_loss_cls_neg.sum()
loss_cls = (loss_cls_pos + loss_cls_neg) / num_total_samples
if self.reg_decoded_bbox:
# When the regression loss (e.g. `IouLoss`, `GIouLoss`)
# is applied directly on the decoded bounding boxes, it
# decodes the already encoded coordinates to absolute format.
bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
loss_bbox = self.loss_bbox(
bbox_pred,
bbox_targets,
bbox_weights,
avg_factor=num_total_samples)
return loss_cls[None], loss_bbox
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'coeff_preds'))
def get_bboxes(self,
cls_scores,
bbox_preds,
coeff_preds,
img_metas,
cfg=None,
rescale=False):
""""Similiar to func:``AnchorHead.get_bboxes``, but additionally
processes coeff_preds.
Args:
cls_scores (list[Tensor]): Box scores for each scale level
with shape (N, num_anchors * num_classes, H, W)
bbox_preds (list[Tensor]): Box energies / deltas for each scale
level with shape (N, num_anchors * 4, H, W)
coeff_preds (list[Tensor]): Mask coefficients for each scale
level with shape (N, num_anchors * num_protos, H, W)
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used
rescale (bool): If True, return boxes in original image space.
Default: False.
Returns:
list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is
a 3-tuple. The first item is an (n, 5) tensor, where the
first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column is a score
between 0 and 1. The second item is an (n,) tensor where each
item is the predicted class label of the corresponding box.
The third item is an (n, num_protos) tensor where each item
is the predicted mask coefficients of instance inside the
corresponding box.
"""
assert len(cls_scores) == len(bbox_preds)
num_levels = len(cls_scores)
device = cls_scores[0].device
featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
mlvl_anchors = self.anchor_generator.grid_anchors(
featmap_sizes, device=device)
det_bboxes = []
det_labels = []
det_coeffs = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
coeff_pred_list = [
coeff_preds[i][img_id].detach() for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
bbox_res = self._get_bboxes_single(cls_score_list, bbox_pred_list,
coeff_pred_list, mlvl_anchors,
img_shape, scale_factor, cfg,
rescale)
det_bboxes.append(bbox_res[0])
det_labels.append(bbox_res[1])
det_coeffs.append(bbox_res[2])
return det_bboxes, det_labels, det_coeffs
def _get_bboxes_single(self,
cls_score_list,
bbox_pred_list,
coeff_preds_list,
mlvl_anchors,
img_shape,
scale_factor,
cfg,
rescale=False):
""""Similiar to func:``AnchorHead._get_bboxes_single``, but
additionally processes coeff_preds_list and uses fast NMS instead of
traditional NMS.
Args:
cls_score_list (list[Tensor]): Box scores for a single scale level
Has shape (num_anchors * num_classes, H, W).
bbox_pred_list (list[Tensor]): Box energies / deltas for a single
scale level with shape (num_anchors * 4, H, W).
coeff_preds_list (list[Tensor]): Mask coefficients for a single
scale level with shape (num_anchors * num_protos, H, W).
mlvl_anchors (list[Tensor]): Box reference for a single scale level
with shape (num_total_anchors, 4).
img_shape (tuple[int]): Shape of the input image,
(height, width, 3).
scale_factor (ndarray): Scale factor of the image, arranged as
(w_scale, h_scale, w_scale, h_scale).
cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Returns:
tuple[Tensor, Tensor, Tensor]: The first item is an (n, 5) tensor,
where the first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column is a score between
0 and 1. The second item is an (n,) tensor where each item is
the predicted class label of the corresponding box. The third
item is an (n, num_protos) tensor where each item is the
predicted mask coefficients of instance inside the
corresponding box.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
mlvl_coeffs = []
for cls_score, bbox_pred, coeff_pred, anchors in \
zip(cls_score_list, bbox_pred_list,
coeff_preds_list, mlvl_anchors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.cls_out_channels)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
coeff_pred = coeff_pred.permute(1, 2,
0).reshape(-1, self.num_protos)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
# Get maximum scores for foreground classes.
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
# remind that we set FG labels to [0, num_class-1]
# since mmdet v2.0
# BG cat_id: num_class
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
coeff_pred = coeff_pred[topk_inds, :]
bboxes = self.bbox_coder.decode(
anchors, bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_coeffs.append(coeff_pred)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
mlvl_coeffs = torch.cat(mlvl_coeffs)
if self.use_sigmoid_cls:
# Add a dummy background class to the backend when using sigmoid
# remind that we set FG labels to [0, num_class-1] since mmdet v2.0
# BG cat_id: num_class
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
det_bboxes, det_labels, det_coeffs = fast_nms(mlvl_bboxes, mlvl_scores,
mlvl_coeffs,
cfg.score_thr,
cfg.iou_thr, cfg.top_k,
cfg.max_per_img)
return det_bboxes, det_labels, det_coeffs
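# --- Illustrative sketch (not part of the original source) ---
# ``_get_bboxes_single`` above delegates suppression to ``fast_nms`` from
# ``mmdet.core``. The idea behind Fast NMS (from the YOLACT paper) is
# sketched below for the single-class case, under our own simplifying
# assumptions: sort boxes by score, build the full pairwise IoU matrix, and
# drop any box that overlaps a higher-scoring box by more than ``iou_thr``.
# This is fully vectorised (no sequential suppression loop) at the cost of
# occasionally removing a few extra boxes compared to traditional NMS.
# We assume ``bbox_overlaps`` is exposed by ``mmdet.core`` as in mmdet 2.x.
import torch
from mmdet.core import bbox_overlaps

def _fast_nms_single_class_demo(bboxes, scores, iou_thr=0.5):
    # bboxes: (N, 4) in (x1, y1, x2, y2); scores: (N,)
    scores, order = scores.sort(descending=True)
    bboxes = bboxes[order]
    iou = bbox_overlaps(bboxes, bboxes)  # (N, N) pairwise IoU
    iou.triu_(diagonal=1)                # keep only higher-score vs lower-score pairs
    max_iou, _ = iou.max(dim=0)          # worst overlap with any better-scoring box
    keep = max_iou <= iou_thr
    return bboxes[keep], scores[keep]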
@HEADS.register_module()
class YOLACTSegmHead(nn.Module):
"""YOLACT segmentation head used in https://arxiv.org/abs/1904.02689.
Apply a semantic segmentation loss on feature space using layers that are
only evaluated during training to increase performance with no speed
penalty.
Args:
in_channels (int): Number of channels in the input feature map.
num_classes (int): Number of categories excluding the background
category.
loss_segm (dict): Config of semantic segmentation loss.
"""
def __init__(self,
num_classes,
in_channels=256,
loss_segm=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0)):
super(YOLACTSegmHead, self).__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.loss_segm = build_loss(loss_segm)
self._init_layers()
self.fp16_enabled = False
def _init_layers(self):
"""Initialize layers of the head."""
self.segm_conv = nn.Conv2d(
self.in_channels, self.num_classes, kernel_size=1)
def init_weights(self):
"""Initialize weights of the head."""
xavier_init(self.segm_conv, distribution='uniform')
def forward(self, x):
"""Forward feature from the upstream network.
Args:
x (Tensor): Feature from the upstream network, which is
a 4D-tensor.
Returns:
Tensor: Predicted semantic segmentation map with shape
(N, num_classes, H, W).
"""
return self.segm_conv(x)
@force_fp32(apply_to=('segm_pred', ))
def loss(self, segm_pred, gt_masks, gt_labels):
"""Compute loss of the head.
Args:
segm_pred (Tensor): Predicted semantic segmentation map with shape
(N, num_classes, H, W).
gt_masks (list[Tensor]): Ground truth masks for each image, with
the same shape as the input image.
gt_labels (list[Tensor]): Class indices corresponding to each box.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
loss_segm = []
num_imgs, num_classes, mask_h, mask_w = segm_pred.size()
for idx in range(num_imgs):
cur_segm_pred = segm_pred[idx]
cur_gt_masks = gt_masks[idx].float()
cur_gt_labels = gt_labels[idx]
segm_targets = self.get_targets(cur_segm_pred, cur_gt_masks,
cur_gt_labels)
if segm_targets is None:
loss = self.loss_segm(cur_segm_pred,
torch.zeros_like(cur_segm_pred),
torch.zeros_like(cur_segm_pred))
else:
loss = self.loss_segm(
cur_segm_pred,
segm_targets,
avg_factor=num_imgs * mask_h * mask_w)
loss_segm.append(loss)
return dict(loss_segm=loss_segm)
def get_targets(self, segm_pred, gt_masks, gt_labels):
"""Compute semantic segmentation targets for each image.
Args:
segm_pred (Tensor): Predicted semantic segmentation map
with shape (num_classes, H, W).
gt_masks (Tensor): Ground truth masks for each image, with the same
shape as the input image.
gt_labels (Tensor): Class indices corresponding to each box.
Returns:
Tensor: Semantic segmentation targets with shape
(num_classes, H, W).
"""
if gt_masks.size(0) == 0:
return None
num_classes, mask_h, mask_w = segm_pred.size()
with torch.no_grad():
downsampled_masks = F.interpolate(
gt_masks.unsqueeze(0), (mask_h, mask_w),
mode='bilinear',
align_corners=False).squeeze(0)
downsampled_masks = downsampled_masks.gt(0.5).float()
segm_targets = torch.zeros_like(segm_pred, requires_grad=False)
for obj_idx in range(downsampled_masks.size(0)):
segm_targets[gt_labels[obj_idx] - 1] = torch.max(
segm_targets[gt_labels[obj_idx] - 1],
downsampled_masks[obj_idx])
return segm_targets
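# --- Illustrative sketch (not part of the original source) ---
# ``YOLACTSegmHead.get_targets`` above builds a per-class semantic target by
# taking an element-wise maximum over the (downsampled, binarised) instance
# masks that share the same class. A minimal stand-alone version of that
# reduction; labels are assumed to be 0-based class indices here, and the
# helper name is ours:
import torch

def _semantic_targets_demo(instance_masks, labels, num_classes):
    # instance_masks: (num_gts, H, W) binary masks at the prediction
    # resolution; labels: (num_gts,) class indices.
    h, w = instance_masks.shape[-2:]
    targets = instance_masks.new_zeros(num_classes, h, w)
    for mask, label in zip(instance_masks, labels):
        targets[label] = torch.max(targets[label], mask)
    return targets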
@HEADS.register_module()
class YOLACTProtonet(nn.Module):
"""YOLACT mask head used in https://arxiv.org/abs/1904.02689.
This head outputs the mask prototypes for YOLACT.
Args:
in_channels (int): Number of channels in the input feature map.
proto_channels (tuple[int]): Output channels of protonet convs.
proto_kernel_sizes (tuple[int]): Kernel sizes of protonet convs.
include_last_relu (bool): Whether to keep the last ReLU of the protonet.
num_protos (int): Number of prototypes.
num_classes (int): Number of categories excluding the background
category.
loss_mask_weight (float): Reweight the mask loss by this factor.
max_masks_to_train (int): Maximum number of masks to train for
each image.
"""
def __init__(self,
num_classes,
in_channels=256,
proto_channels=(256, 256, 256, None, 256, 32),
proto_kernel_sizes=(3, 3, 3, -2, 3, 1),
include_last_relu=True,
num_protos=32,
loss_mask_weight=1.0,
max_masks_to_train=100):
super(YOLACTProtonet, self).__init__()
self.in_channels = in_channels
self.proto_channels = proto_channels
self.proto_kernel_sizes = proto_kernel_sizes
self.include_last_relu = include_last_relu
self.protonet = self._init_layers()
self.loss_mask_weight = loss_mask_weight
self.num_protos = num_protos
self.num_classes = num_classes
self.max_masks_to_train = max_masks_to_train
self.fp16_enabled = False
def _init_layers(self):
"""A helper function to take a config setting and turn it into a
network."""
# Possible patterns:
# ( 256, 3) -> conv
# ( 256,-2) -> deconv
# (None,-2) -> bilinear interpolate
in_channels = self.in_channels
protonets = nn.ModuleList()
for num_channels, kernel_size in zip(self.proto_channels,
self.proto_kernel_sizes):
if kernel_size > 0:
layer = nn.Conv2d(
in_channels,
num_channels,
kernel_size,
padding=kernel_size // 2)
else:
if num_channels is None:
layer = InterpolateModule(
scale_factor=-kernel_size,
mode='bilinear',
align_corners=False)
else:
layer = nn.ConvTranspose2d(
in_channels,
num_channels,
-kernel_size,
padding=kernel_size // 2)
protonets.append(layer)
protonets.append(nn.ReLU(inplace=True))
in_channels = num_channels if num_channels is not None \
else in_channels
if not self.include_last_relu:
protonets = protonets[:-1]
return nn.Sequential(*protonets)
def init_weights(self):
"""Initialize weights of the head."""
for m in self.protonet:
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform')
def forward(self, x, coeff_pred, bboxes, img_meta, sampling_results=None):
"""Forward feature from the upstream network to get prototypes and
linearly combine the prototypes, using masks coefficients, into
instance masks. Finally, crop the instance masks with given bboxes.
Args:
x (Tensor): Feature from the upstream network, which is
a 4D-tensor.
coeff_pred (list[Tensor]): Mask coefficients for each scale
level with shape (N, num_anchors * num_protos, H, W).
bboxes (list[Tensor]): Box used for cropping with shape
(N, num_anchors * 4, H, W). During training, they are
ground truth boxes. During testing, they are predicted
boxes.
img_meta (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
sampling_results (List[:obj:``SamplingResult``]): Sampler results
for each image.
Returns:
list[Tensor]: Predicted instance segmentation masks.
"""
prototypes = self.protonet(x)
prototypes = prototypes.permute(0, 2, 3, 1).contiguous()
num_imgs = x.size(0)
# Training state
if self.training:
coeff_pred_list = []
for coeff_pred_per_level in coeff_pred:
coeff_pred_per_level = \
coeff_pred_per_level.permute(0, 2, 3, 1)\
.reshape(num_imgs, -1, self.num_protos)
coeff_pred_list.append(coeff_pred_per_level)
coeff_pred = torch.cat(coeff_pred_list, dim=1)
mask_pred_list = []
for idx in range(num_imgs):
cur_prototypes = prototypes[idx]
cur_coeff_pred = coeff_pred[idx]
cur_bboxes = bboxes[idx]
cur_img_meta = img_meta[idx]
# Testing state
if not self.training:
bboxes_for_cropping = cur_bboxes
else:
cur_sampling_results = sampling_results[idx]
pos_assigned_gt_inds = \
cur_sampling_results.pos_assigned_gt_inds
bboxes_for_cropping = cur_bboxes[pos_assigned_gt_inds].clone()
pos_inds = cur_sampling_results.pos_inds
cur_coeff_pred = cur_coeff_pred[pos_inds]
# Linearly combine the prototypes with the mask coefficients
mask_pred = cur_prototypes @ cur_coeff_pred.t()
mask_pred = torch.sigmoid(mask_pred)
h, w = cur_img_meta['img_shape'][:2]
bboxes_for_cropping[:, 0] /= w
bboxes_for_cropping[:, 1] /= h
bboxes_for_cropping[:, 2] /= w
bboxes_for_cropping[:, 3] /= h
mask_pred = self.crop(mask_pred, bboxes_for_cropping)
mask_pred = mask_pred.permute(2, 0, 1).contiguous()
mask_pred_list.append(mask_pred)
return mask_pred_list
@force_fp32(apply_to=('mask_pred', ))
def loss(self, mask_pred, gt_masks, gt_bboxes, img_meta, sampling_results):
"""Compute loss of the head.
Args:
mask_pred (list[Tensor]): Predicted instance masks of each image,
each with shape (num_pos, H, W).
gt_masks (list[Tensor]): Ground truth masks for each image, with
the same shape as the input image.
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
img_meta (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
sampling_results (List[:obj:``SamplingResult``]): Sampler results
for each image.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
loss_mask = []
num_imgs = len(mask_pred)
total_pos = 0
for idx in range(num_imgs):
cur_mask_pred = mask_pred[idx]
cur_gt_masks = gt_masks[idx].float()
cur_gt_bboxes = gt_bboxes[idx]
cur_img_meta = img_meta[idx]
cur_sampling_results = sampling_results[idx]
pos_assigned_gt_inds = cur_sampling_results.pos_assigned_gt_inds
num_pos = pos_assigned_gt_inds.size(0)
# Since we're producing (near) full image masks,
# it'd take too much vram to backprop on every single mask.
# Thus we select only a subset.
if num_pos > self.max_masks_to_train:
perm = torch.randperm(num_pos)
select = perm[:self.max_masks_to_train]
cur_mask_pred = cur_mask_pred[select]
pos_assigned_gt_inds = pos_assigned_gt_inds[select]
num_pos = self.max_masks_to_train
total_pos += num_pos
gt_bboxes_for_reweight = cur_gt_bboxes[pos_assigned_gt_inds]
mask_targets = self.get_targets(cur_mask_pred, cur_gt_masks,
pos_assigned_gt_inds)
if num_pos == 0:
loss = cur_mask_pred.sum() * 0.
elif mask_targets is None:
loss = F.binary_cross_entropy(cur_mask_pred,
torch.zeros_like(cur_mask_pred),
torch.zeros_like(cur_mask_pred))
else:
cur_mask_pred = torch.clamp(cur_mask_pred, 0, 1)
loss = F.binary_cross_entropy(
cur_mask_pred, mask_targets,
reduction='none') * self.loss_mask_weight
h, w = cur_img_meta['img_shape'][:2]
gt_bboxes_width = (gt_bboxes_for_reweight[:, 2] -
gt_bboxes_for_reweight[:, 0]) / w
gt_bboxes_height = (gt_bboxes_for_reweight[:, 3] -
gt_bboxes_for_reweight[:, 1]) / h
loss = loss.mean(dim=(1,
2)) / gt_bboxes_width / gt_bboxes_height
loss = torch.sum(loss)
loss_mask.append(loss)
if total_pos == 0:
total_pos += 1 # avoid nan
loss_mask = [x / total_pos for x in loss_mask]
return dict(loss_mask=loss_mask)
def get_targets(self, mask_pred, gt_masks, pos_assigned_gt_inds):
"""Compute instance segmentation targets for each image.
Args:
mask_pred (Tensor): Predicted instance masks with shape
(num_pos, H, W).
gt_masks (Tensor): Ground truth masks for the image, with the same
shape as the input image.
pos_assigned_gt_inds (Tensor): GT indices of the corresponding
positive samples.
Returns:
Tensor: Instance segmentation targets with shape
(num_instances, H, W).
"""
if gt_masks.size(0) == 0:
return None
mask_h, mask_w = mask_pred.shape[-2:]
gt_masks = F.interpolate(
gt_masks.unsqueeze(0), (mask_h, mask_w),
mode='bilinear',
align_corners=False).squeeze(0)
gt_masks = gt_masks.gt(0.5).float()
mask_targets = gt_masks[pos_assigned_gt_inds]
return mask_targets
def get_seg_masks(self, mask_pred, label_pred, img_meta, rescale):
"""Resize, binarize, and format the instance mask predictions.
Args:
mask_pred (Tensor): shape (N, H, W).
label_pred (Tensor): shape (N, ).
img_meta (dict): Meta information of each image, e.g.,
image size, scaling factor, etc.
rescale (bool): If rescale is False, then returned masks will
fit the scale of imgs[0].
Returns:
list[ndarray]: Mask predictions grouped by their predicted classes.
"""
ori_shape = img_meta['ori_shape']
scale_factor = img_meta['scale_factor']
if rescale:
img_h, img_w = ori_shape[:2]
else:
img_h = np.round(ori_shape[0] * scale_factor[1]).astype(np.int32)
img_w = np.round(ori_shape[1] * scale_factor[0]).astype(np.int32)
cls_segms = [[] for _ in range(self.num_classes)]
if mask_pred.size(0) == 0:
return cls_segms
mask_pred = F.interpolate(
mask_pred.unsqueeze(0), (img_h, img_w),
mode='bilinear',
align_corners=False).squeeze(0) > 0.5
mask_pred = mask_pred.cpu().numpy().astype(np.uint8)
for m, l in zip(mask_pred, label_pred):
cls_segms[l].append(m)
return cls_segms
def crop(self, masks, boxes, padding=1):
"""Crop predicted masks by zeroing out everything not in the predicted
bbox.
Args:
masks (Tensor): shape [H, W, N].
boxes (Tensor): bbox coords in relative point form with
shape [N, 4].
Return:
Tensor: The cropped masks.
"""
h, w, n = masks.size()
x1, x2 = self.sanitize_coordinates(
boxes[:, 0], boxes[:, 2], w, padding, cast=False)
y1, y2 = self.sanitize_coordinates(
boxes[:, 1], boxes[:, 3], h, padding, cast=False)
rows = torch.arange(
w, device=masks.device, dtype=x1.dtype).view(1, -1,
1).expand(h, w, n)
cols = torch.arange(
h, device=masks.device, dtype=x1.dtype).view(-1, 1,
1).expand(h, w, n)
masks_left = rows >= x1.view(1, 1, -1)
masks_right = rows < x2.view(1, 1, -1)
masks_up = cols >= y1.view(1, 1, -1)
masks_down = cols < y2.view(1, 1, -1)
crop_mask = masks_left * masks_right * masks_up * masks_down
return masks * crop_mask.float()
def sanitize_coordinates(self, x1, x2, img_size, padding=0, cast=True):
"""Sanitizes the input coordinates so that x1 < x2, x1 != x2, x1 >= 0,
and x2 <= image_size. Also converts from relative to absolute
coordinates and casts the results to long tensors.
Warning: this does things in-place behind the scenes so
copy if necessary.
Args:
_x1 (Tensor): shape (N, ).
_x2 (Tensor): shape (N, ).
img_size (int): Size of the input image.
padding (int): x1 >= padding, x2 <= image_size-padding.
cast (bool): If cast is false, the result won't be cast to longs.
Returns:
tuple:
x1 (Tensor): Sanitized _x1.
x2 (Tensor): Sanitized _x2.
"""
x1 = x1 * img_size
x2 = x2 * img_size
if cast:
x1 = x1.long()
x2 = x2.long()
x1 = torch.min(x1, x2)
x2 = torch.max(x1, x2)
x1 = torch.clamp(x1 - padding, min=0)
x2 = torch.clamp(x2 + padding, max=img_size)
return x1, x2
class InterpolateModule(nn.Module):
"""This is a module version of F.interpolate.
Any arguments you give it just get passed along for the ride.
"""
def __init__(self, *args, **kwargs):
super().__init__()
self.args = args
self.kwargs = kwargs
def forward(self, x):
"""Forward features from the upstream network."""
return F.interpolate(x, *self.args, **self.kwargs)
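# --- Illustrative sketch (not part of the original source) ---
# The core assembly step in ``YOLACTProtonet.forward`` above is the linear
# combination of prototypes with per-box mask coefficients,
# masks = sigmoid(P @ C^T), followed by cropping with the boxes. A tiny
# stand-alone illustration of the shapes involved, using random data:
import torch

h, w, num_protos, num_boxes = 34, 34, 32, 5
prototypes = torch.randn(h, w, num_protos)         # protonet output, (H, W, k)
coefficients = torch.randn(num_boxes, num_protos)  # one k-vector per box
masks = torch.sigmoid(prototypes @ coefficients.t())  # (H, W, num_boxes)
masks = masks.permute(2, 0, 1).contiguous()        # (num_boxes, H, W)
assert masks.shape == (num_boxes, h, w)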
# Copyright (c) 2019 Western Digital Corporation or its affiliates.
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, normal_init
from mmcv.runner import force_fp32
from mmdet.core import (build_anchor_generator, build_assigner,
build_bbox_coder, build_sampler, images_to_levels,
multi_apply, multiclass_nms)
from ..builder import HEADS, build_loss
from .base_dense_head import BaseDenseHead
from .dense_test_mixins import BBoxTestMixin
@HEADS.register_module()
class YOLOV3Head(BaseDenseHead, BBoxTestMixin):
"""YOLOV3Head Paper link: https://arxiv.org/abs/1804.02767.
Args:
num_classes (int): The number of object classes (w/o background)
in_channels (List[int]): Number of input channels per scale.
out_channels (List[int]): The number of output channels per scale
before the final 1x1 layer. Default: (1024, 512, 256).
anchor_generator (dict): Config dict for anchor generator
bbox_coder (dict): Config of bounding box coder.
featmap_strides (List[int]): The stride of each scale.
Should be in descending order. Default: (32, 16, 8).
one_hot_smoother (float): Set a non-zero value to enable label
smoothing. Default: 0.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
loss_cls (dict): Config of classification loss.
loss_conf (dict): Config of confidence loss.
loss_xy (dict): Config of xy coordinate loss.
loss_wh (dict): Config of wh coordinate loss.
train_cfg (dict): Training config of YOLOV3 head. Default: None.
test_cfg (dict): Testing config of YOLOV3 head. Default: None.
"""
def __init__(self,
num_classes,
in_channels,
out_channels=(1024, 512, 256),
anchor_generator=dict(
type='YOLOAnchorGenerator',
base_sizes=[[(116, 90), (156, 198), (373, 326)],
[(30, 61), (62, 45), (59, 119)],
[(10, 13), (16, 30), (33, 23)]],
strides=[32, 16, 8]),
bbox_coder=dict(type='YOLOBBoxCoder'),
featmap_strides=[32, 16, 8],
one_hot_smoother=0.,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
loss_conf=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
loss_xy=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
loss_wh=dict(type='MSELoss', loss_weight=1.0),
train_cfg=None,
test_cfg=None):
super(YOLOV3Head, self).__init__()
# Check params
assert (len(in_channels) == len(out_channels) == len(featmap_strides))
self.num_classes = num_classes
self.in_channels = in_channels
self.out_channels = out_channels
self.featmap_strides = featmap_strides
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if self.train_cfg:
self.assigner = build_assigner(self.train_cfg.assigner)
if hasattr(self.train_cfg, 'sampler'):
sampler_cfg = self.train_cfg.sampler
else:
sampler_cfg = dict(type='PseudoSampler')
self.sampler = build_sampler(sampler_cfg, context=self)
self.one_hot_smoother = one_hot_smoother
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.bbox_coder = build_bbox_coder(bbox_coder)
self.anchor_generator = build_anchor_generator(anchor_generator)
self.loss_cls = build_loss(loss_cls)
self.loss_conf = build_loss(loss_conf)
self.loss_xy = build_loss(loss_xy)
self.loss_wh = build_loss(loss_wh)
# usually the numbers of anchors for each level are the same
# except SSD detectors
self.num_anchors = self.anchor_generator.num_base_anchors[0]
assert len(
self.anchor_generator.num_base_anchors) == len(featmap_strides)
self._init_layers()
@property
def num_levels(self):
return len(self.featmap_strides)
@property
def num_attrib(self):
"""int: number of attributes in pred_map, bboxes (4) +
objectness (1) + num_classes"""
return 5 + self.num_classes
def _init_layers(self):
self.convs_bridge = nn.ModuleList()
self.convs_pred = nn.ModuleList()
for i in range(self.num_levels):
conv_bridge = ConvModule(
self.in_channels[i],
self.out_channels[i],
3,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg)
conv_pred = nn.Conv2d(self.out_channels[i],
self.num_anchors * self.num_attrib, 1)
self.convs_bridge.append(conv_bridge)
self.convs_pred.append(conv_pred)
def init_weights(self):
"""Initialize weights of the head."""
for m in self.convs_pred:
normal_init(m, std=0.01)
def forward(self, feats):
"""Forward features from the upstream network.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
Returns:
tuple[Tensor]: A tuple of multi-level prediction maps, each a
4D-tensor of shape
(batch_size, num_anchors * (5 + num_classes), height, width).
"""
assert len(feats) == self.num_levels
pred_maps = []
for i in range(self.num_levels):
x = feats[i]
x = self.convs_bridge[i](x)
pred_map = self.convs_pred[i](x)
pred_maps.append(pred_map)
return tuple(pred_maps),
@force_fp32(apply_to=('pred_maps', ))
def get_bboxes(self,
pred_maps,
img_metas,
cfg=None,
rescale=False,
with_nms=True):
"""Transform network output for a batch into bbox predictions.
Args:
pred_maps (list[Tensor]): Raw predictions for a batch of images.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used. Default: None.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before return boxes.
Default: True.
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
The first item is an (n, 5) tensor, where the first 4 columns
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
5-th column is a score between 0 and 1. The second item is a
(n,) tensor where each item is the predicted class label of the
corresponding box.
"""
result_list = []
num_levels = len(pred_maps)
for img_id in range(len(img_metas)):
pred_maps_list = [
pred_maps[i][img_id].detach() for i in range(num_levels)
]
scale_factor = img_metas[img_id]['scale_factor']
proposals = self._get_bboxes_single(pred_maps_list, scale_factor,
cfg, rescale, with_nms)
result_list.append(proposals)
return result_list
def _get_bboxes_single(self,
pred_maps_list,
scale_factor,
cfg,
rescale=False,
with_nms=True):
"""Transform outputs for a single batch item into bbox predictions.
Args:
pred_maps_list (list[Tensor]): Prediction maps for the different
scales of a single image in the batch.
scale_factor (ndarray): Scale factor of the image, arranged as
(w_scale, h_scale, w_scale, h_scale).
cfg (mmcv.Config | None): Test / postprocessing configuration,
if None, test_cfg would be used.
rescale (bool): If True, return boxes in original image space.
Default: False.
with_nms (bool): If True, do nms before return boxes.
Default: True.
Returns:
tuple(Tensor):
det_bboxes (Tensor): BBox predictions in shape (n, 5), where
the first 4 columns are bounding box positions
(tl_x, tl_y, br_x, br_y) and the 5-th column is a score
between 0 and 1.
det_labels (Tensor): A (n,) tensor where each item is the
predicted class label of the corresponding box.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(pred_maps_list) == self.num_levels
multi_lvl_bboxes = []
multi_lvl_cls_scores = []
multi_lvl_conf_scores = []
num_levels = len(pred_maps_list)
featmap_sizes = [
pred_maps_list[i].shape[-2:] for i in range(num_levels)
]
multi_lvl_anchors = self.anchor_generator.grid_anchors(
featmap_sizes, pred_maps_list[0][0].device)
for i in range(self.num_levels):
# get some key info for current scale
pred_map = pred_maps_list[i]
stride = self.featmap_strides[i]
# (h, w, num_anchors*num_attrib) -> (h*w*num_anchors, num_attrib)
pred_map = pred_map.permute(1, 2, 0).reshape(-1, self.num_attrib)
pred_map[..., :2] = torch.sigmoid(pred_map[..., :2])
bbox_pred = self.bbox_coder.decode(multi_lvl_anchors[i],
pred_map[..., :4], stride)
# conf and cls
conf_pred = torch.sigmoid(pred_map[..., 4]).view(-1)
cls_pred = torch.sigmoid(pred_map[..., 5:]).view(
-1, self.num_classes) # Cls pred one-hot.
# Filtering out all predictions with conf < conf_thr
conf_thr = cfg.get('conf_thr', -1)
if conf_thr > 0:
# add as_tuple=False for compatibility in Pytorch 1.6
# flatten would create a Reshape op with constant values,
# and raise RuntimeError when doing inference in ONNX Runtime
# with a different input image (#4221).
conf_inds = conf_pred.ge(conf_thr).nonzero(
as_tuple=False).squeeze(1)
bbox_pred = bbox_pred[conf_inds, :]
cls_pred = cls_pred[conf_inds, :]
conf_pred = conf_pred[conf_inds]
# Get top-k prediction
nms_pre = cfg.get('nms_pre', -1)
if 0 < nms_pre < conf_pred.size(0) and (
not torch.onnx.is_in_onnx_export()):
_, topk_inds = conf_pred.topk(nms_pre)
bbox_pred = bbox_pred[topk_inds, :]
cls_pred = cls_pred[topk_inds, :]
conf_pred = conf_pred[topk_inds]
# Save the result of current scale
multi_lvl_bboxes.append(bbox_pred)
multi_lvl_cls_scores.append(cls_pred)
multi_lvl_conf_scores.append(conf_pred)
# Merge the results of different scales together
multi_lvl_bboxes = torch.cat(multi_lvl_bboxes)
multi_lvl_cls_scores = torch.cat(multi_lvl_cls_scores)
multi_lvl_conf_scores = torch.cat(multi_lvl_conf_scores)
if with_nms and (multi_lvl_conf_scores.size(0) == 0):
return torch.zeros((0, 5)), torch.zeros((0, ))
if rescale:
multi_lvl_bboxes /= multi_lvl_bboxes.new_tensor(scale_factor)
# In mmdet 2.x, the class_id for background is num_classes.
# i.e., the last column.
padding = multi_lvl_cls_scores.new_zeros(multi_lvl_cls_scores.shape[0],
1)
multi_lvl_cls_scores = torch.cat([multi_lvl_cls_scores, padding],
dim=1)
# Support exporting to onnx without nms
if with_nms and cfg.get('nms', None) is not None:
det_bboxes, det_labels = multiclass_nms(
multi_lvl_bboxes,
multi_lvl_cls_scores,
cfg.score_thr,
cfg.nms,
cfg.max_per_img,
score_factors=multi_lvl_conf_scores)
return det_bboxes, det_labels
else:
return (multi_lvl_bboxes, multi_lvl_cls_scores,
multi_lvl_conf_scores)
@force_fp32(apply_to=('pred_maps', ))
def loss(self,
pred_maps,
gt_bboxes,
gt_labels,
img_metas,
gt_bboxes_ignore=None):
"""Compute loss of the head.
Args:
pred_maps (list[Tensor]): Prediction map for each scale level,
shape (N, num_anchors * num_attrib, H, W)
gt_bboxes (list[Tensor]): Ground truth bboxes for each image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[Tensor]): class indices corresponding to each box
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes_ignore (None | list[Tensor]): specify which bounding
boxes can be ignored when computing the loss.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
num_imgs = len(img_metas)
device = pred_maps[0][0].device
featmap_sizes = [
pred_maps[i].shape[-2:] for i in range(self.num_levels)
]
multi_level_anchors = self.anchor_generator.grid_anchors(
featmap_sizes, device)
anchor_list = [multi_level_anchors for _ in range(num_imgs)]
responsible_flag_list = []
for img_id in range(len(img_metas)):
responsible_flag_list.append(
self.anchor_generator.responsible_flags(
featmap_sizes, gt_bboxes[img_id], device))
target_maps_list, neg_maps_list = self.get_targets(
anchor_list, responsible_flag_list, gt_bboxes, gt_labels)
losses_cls, losses_conf, losses_xy, losses_wh = multi_apply(
self.loss_single, pred_maps, target_maps_list, neg_maps_list)
return dict(
loss_cls=losses_cls,
loss_conf=losses_conf,
loss_xy=losses_xy,
loss_wh=losses_wh)
def loss_single(self, pred_map, target_map, neg_map):
"""Compute loss of a single image from a batch.
Args:
pred_map (Tensor): Raw predictions for a single level.
target_map (Tensor): The Ground-Truth target for a single level.
neg_map (Tensor): The negative masks for a single level.
Returns:
tuple:
loss_cls (Tensor): Classification loss.
loss_conf (Tensor): Confidence loss.
loss_xy (Tensor): Regression loss of x, y coordinate.
loss_wh (Tensor): Regression loss of w, h coordinate.
"""
num_imgs = len(pred_map)
pred_map = pred_map.permute(0, 2, 3,
1).reshape(num_imgs, -1, self.num_attrib)
neg_mask = neg_map.float()
pos_mask = target_map[..., 4]
pos_and_neg_mask = neg_mask + pos_mask
pos_mask = pos_mask.unsqueeze(dim=-1)
if torch.max(pos_and_neg_mask) > 1.:
warnings.warn('There is overlap between pos and neg samples.')
pos_and_neg_mask = pos_and_neg_mask.clamp(min=0., max=1.)
pred_xy = pred_map[..., :2]
pred_wh = pred_map[..., 2:4]
pred_conf = pred_map[..., 4]
pred_label = pred_map[..., 5:]
target_xy = target_map[..., :2]
target_wh = target_map[..., 2:4]
target_conf = target_map[..., 4]
target_label = target_map[..., 5:]
loss_cls = self.loss_cls(pred_label, target_label, weight=pos_mask)
loss_conf = self.loss_conf(
pred_conf, target_conf, weight=pos_and_neg_mask)
loss_xy = self.loss_xy(pred_xy, target_xy, weight=pos_mask)
loss_wh = self.loss_wh(pred_wh, target_wh, weight=pos_mask)
return loss_cls, loss_conf, loss_xy, loss_wh
def get_targets(self, anchor_list, responsible_flag_list, gt_bboxes_list,
gt_labels_list):
"""Compute target maps for anchors in multiple images.
Args:
anchor_list (list[list[Tensor]]): Multi level anchors of each
image. The outer list indicates images, and the inner list
corresponds to feature levels of the image. Each element of
the inner list is a tensor of shape (num_total_anchors, 4).
responsible_flag_list (list[list[Tensor]]): Multi level responsible
flags of each image. Each element is a tensor of shape
(num_total_anchors, )
gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
gt_labels_list (list[Tensor]): Ground truth labels of each box.
Returns:
tuple: Usually returns a tuple containing learning targets.
- target_map_list (list[Tensor]): Target map of each level.
- neg_map_list (list[Tensor]): Negative map of each level.
"""
num_imgs = len(anchor_list)
# anchor number of multi levels
num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
results = multi_apply(self._get_targets_single, anchor_list,
responsible_flag_list, gt_bboxes_list,
gt_labels_list)
all_target_maps, all_neg_maps = results
assert num_imgs == len(all_target_maps) == len(all_neg_maps)
target_maps_list = images_to_levels(all_target_maps, num_level_anchors)
neg_maps_list = images_to_levels(all_neg_maps, num_level_anchors)
return target_maps_list, neg_maps_list
def _get_targets_single(self, anchors, responsible_flags, gt_bboxes,
gt_labels):
"""Generate matching bounding box prior and converted GT.
Args:
anchors (list[Tensor]): Multi-level anchors of the image.
responsible_flags (list[Tensor]): Multi-level responsible flags of
anchors
gt_bboxes (Tensor): Ground truth bboxes of single image.
gt_labels (Tensor): Ground truth labels of single image.
Returns:
tuple:
target_map (Tensor): Prediction target map of each
scale level, shape (num_total_anchors,
5+num_classes)
neg_map (Tensor): Negative map of each scale level,
shape (num_total_anchors,)
"""
anchor_strides = []
for i in range(len(anchors)):
anchor_strides.append(
torch.tensor(self.featmap_strides[i],
device=gt_bboxes.device).repeat(len(anchors[i])))
concat_anchors = torch.cat(anchors)
concat_responsible_flags = torch.cat(responsible_flags)
anchor_strides = torch.cat(anchor_strides)
assert len(anchor_strides) == len(concat_anchors) == \
len(concat_responsible_flags)
assign_result = self.assigner.assign(concat_anchors,
concat_responsible_flags,
gt_bboxes)
sampling_result = self.sampler.sample(assign_result, concat_anchors,
gt_bboxes)
target_map = concat_anchors.new_zeros(
concat_anchors.size(0), self.num_attrib)
target_map[sampling_result.pos_inds, :4] = self.bbox_coder.encode(
sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes,
anchor_strides[sampling_result.pos_inds])
target_map[sampling_result.pos_inds, 4] = 1
gt_labels_one_hot = F.one_hot(
gt_labels, num_classes=self.num_classes).float()
if self.one_hot_smoother != 0: # label smooth
gt_labels_one_hot = gt_labels_one_hot * (
1 - self.one_hot_smoother
) + self.one_hot_smoother / self.num_classes
target_map[sampling_result.pos_inds, 5:] = gt_labels_one_hot[
sampling_result.pos_assigned_gt_inds]
neg_map = concat_anchors.new_zeros(
concat_anchors.size(0), dtype=torch.uint8)
neg_map[sampling_result.neg_inds] = 1
return target_map, neg_map
def aug_test(self, feats, img_metas, rescale=False):
"""Test function with test time augmentation.
Args:
feats (list[Tensor]): the outer list indicates test-time
augmentations and inner Tensor should have a shape NxCxHxW,
which contains features for all images in the batch.
img_metas (list[list[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch. each dict has image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[ndarray]: bbox results of each class
"""
return self.aug_test_bboxes(feats, img_metas, rescale=rescale)
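# --- Illustrative sketch (not part of the original source) ---
# ``YOLOV3Head._get_bboxes_single`` above reshapes each prediction map from
# (num_anchors * num_attrib, H, W) into (H * W * num_anchors, num_attrib),
# where num_attrib = 4 box terms + 1 objectness + num_classes, and applies
# sigmoids to the xy offsets, the objectness and the class logits. A minimal
# stand-alone rehearsal of that bookkeeping with random data:
import torch

num_classes, num_anchors, h, w = 80, 3, 13, 13
num_attrib = 5 + num_classes
pred_map = torch.randn(num_anchors * num_attrib, h, w)

flat = pred_map.permute(1, 2, 0).reshape(-1, num_attrib)
xy = torch.sigmoid(flat[..., :2])    # offsets inside each grid cell
wh = flat[..., 2:4]                  # raw log-scale width/height terms
conf = torch.sigmoid(flat[..., 4])   # objectness score
cls = torch.sigmoid(flat[..., 5:])   # independent per-class probabilities
assert flat.shape == (h * w * num_anchors, num_attrib)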
from .atss import ATSS
from .base import BaseDetector
from .cascade_rcnn import CascadeRCNN
from .cornernet import CornerNet
from .detr import DETR
from .fast_rcnn import FastRCNN
from .faster_rcnn import FasterRCNN
from .fcos import FCOS
from .fovea import FOVEA
from .fsaf import FSAF
from .gfl import GFL
from .grid_rcnn import GridRCNN
from .htc import HybridTaskCascade
from .mask_rcnn import MaskRCNN
from .mask_scoring_rcnn import MaskScoringRCNN
from .nasfcos import NASFCOS
from .paa import PAA
from .point_rend import PointRend
from .reppoints_detector import RepPointsDetector
from .retinanet import RetinaNet
from .rpn import RPN
from .single_stage import SingleStageDetector
from .sparse_rcnn import SparseRCNN
from .trident_faster_rcnn import TridentFasterRCNN
from .two_stage import TwoStageDetector
from .vfnet import VFNet
from .yolact import YOLACT
from .yolo import YOLOV3
__all__ = [
'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade',
'RetinaNet', 'FCOS', 'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector',
'FOVEA', 'FSAF', 'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA',
'YOLOV3', 'YOLACT', 'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN'
]
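# --- Illustrative sketch (not part of the original source) ---
# Every detector exported above is also registered in the ``DETECTORS``
# registry by the ``@DETECTORS.register_module()`` decorators in the modules
# that follow, which is what lets configs refer to detectors by name
# (e.g. ``type='YOLACT'``). A small, hedged check of that registration,
# assuming mmdet/mmcv are importable and expose the usual Registry.get API:
from mmdet.models.builder import DETECTORS

for name in ('ATSS', 'YOLACT', 'YOLOV3', 'CascadeRCNN'):
    # Registry.get returns the registered class, or None for unknown names.
    assert DETECTORS.get(name) is not None, f'{name} is not registered'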
from ..builder import DETECTORS
from .single_stage import SingleStageDetector
@DETECTORS.register_module()
class ATSS(SingleStageDetector):
"""Implementation of `ATSS <https://arxiv.org/abs/1912.02424>`_."""
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(ATSS, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
from abc import ABCMeta, abstractmethod
from collections import OrderedDict
import mmcv
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from mmcv.runner import auto_fp16
from mmcv.utils import print_log
from mmdet.core.visualization import imshow_det_bboxes
from mmdet.utils import get_root_logger
class BaseDetector(nn.Module, metaclass=ABCMeta):
"""Base class for detectors."""
def __init__(self):
super(BaseDetector, self).__init__()
self.fp16_enabled = False
@property
def with_neck(self):
"""bool: whether the detector has a neck"""
return hasattr(self, 'neck') and self.neck is not None
# TODO: these properties need to be carefully handled
# for both single stage & two stage detectors
@property
def with_shared_head(self):
"""bool: whether the detector has a shared head in the RoI Head"""
return hasattr(self, 'roi_head') and self.roi_head.with_shared_head
@property
def with_bbox(self):
"""bool: whether the detector has a bbox head"""
return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox)
or (hasattr(self, 'bbox_head') and self.bbox_head is not None))
@property
def with_mask(self):
"""bool: whether the detector has a mask head"""
return ((hasattr(self, 'roi_head') and self.roi_head.with_mask)
or (hasattr(self, 'mask_head') and self.mask_head is not None))
@abstractmethod
def extract_feat(self, imgs):
"""Extract features from images."""
pass
def extract_feats(self, imgs):
"""Extract features from multiple images.
Args:
imgs (list[torch.Tensor]): A list of images. The images are
augmented from the same image but in different ways.
Returns:
list[torch.Tensor]: Features of different images
"""
assert isinstance(imgs, list)
return [self.extract_feat(img) for img in imgs]
def forward_train(self, imgs, img_metas, **kwargs):
"""
Args:
imgs (list[Tensor]): List of tensors of shape (1, C, H, W).
Typically these should be mean centered and std scaled.
img_metas (list[dict]): List of image info dict where each dict
has: 'img_shape', 'scale_factor', 'flip', and may also contain
'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys, see
:class:`mmdet.datasets.pipelines.Collect`.
kwargs (keyword arguments): Specific to concrete implementation.
"""
# NOTE the batched image size information may be useful, e.g.
# in DETR, this is needed for the construction of masks, which is
# then used for the transformer_head.
batch_input_shape = tuple(imgs[0].size()[-2:])
for img_meta in img_metas:
img_meta['batch_input_shape'] = batch_input_shape
async def async_simple_test(self, img, img_metas, **kwargs):
raise NotImplementedError
@abstractmethod
def simple_test(self, img, img_metas, **kwargs):
pass
@abstractmethod
def aug_test(self, imgs, img_metas, **kwargs):
"""Test function with test time augmentation."""
pass
def init_weights(self, pretrained=None):
"""Initialize the weights in detector.
Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
if pretrained is not None:
logger = get_root_logger()
print_log(f'load model from: {pretrained}', logger=logger)
async def aforward_test(self, *, img, img_metas, **kwargs):
for var, name in [(img, 'img'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(f'{name} must be a list, but got {type(var)}')
num_augs = len(img)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(img)}) '
f'!= num of image metas ({len(img_metas)})')
# TODO: remove the restriction of samples_per_gpu == 1 when prepared
samples_per_gpu = img[0].size(0)
assert samples_per_gpu == 1
if num_augs == 1:
return await self.async_simple_test(img[0], img_metas[0], **kwargs)
else:
raise NotImplementedError
def forward_test(self, imgs, img_metas, **kwargs):
"""
Args:
imgs (List[Tensor]): the outer list indicates test-time
augmentations and inner Tensor should have a shape NxCxHxW,
which contains all images in the batch.
img_metas (List[List[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch.
"""
for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(f'{name} must be a list, but got {type(var)}')
num_augs = len(imgs)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(imgs)}) '
f'!= num of image meta ({len(img_metas)})')
# NOTE the batched image size information may be useful, e.g.
# in DETR, this is needed for the construction of masks, which is
# then used for the transformer_head.
for img, img_meta in zip(imgs, img_metas):
batch_size = len(img_meta)
for img_id in range(batch_size):
img_meta[img_id]['batch_input_shape'] = tuple(img.size()[-2:])
if num_augs == 1:
# proposals (List[List[Tensor]]): the outer list indicates
# test-time augs (multiscale, flip, etc.) and the inner list
# indicates images in a batch.
# The Tensor should have a shape Px4, where P is the number of
# proposals.
if 'proposals' in kwargs:
kwargs['proposals'] = kwargs['proposals'][0]
return self.simple_test(imgs[0], img_metas[0], **kwargs)
else:
assert imgs[0].size(0) == 1, 'aug test does not support ' \
'inference with batch size ' \
f'{imgs[0].size(0)}'
# TODO: support test augmentation for predefined proposals
assert 'proposals' not in kwargs
return self.aug_test(imgs, img_metas, **kwargs)
@auto_fp16(apply_to=('img', ))
def forward(self, img, img_metas, return_loss=True, **kwargs):
"""Calls either :func:`forward_train` or :func:`forward_test` depending
on whether ``return_loss`` is ``True``.
Note this setting will change the expected inputs. When
``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
and List[dict]), and when ``return_loss=False``, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
"""
if return_loss:
return self.forward_train(img, img_metas, **kwargs)
else:
return self.forward_test(img, img_metas, **kwargs)
def _parse_losses(self, losses):
"""Parse the raw outputs (losses) of the network.
Args:
losses (dict): Raw output of the network, which usually contains
losses and other necessary information.
Returns:
tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \
which may be a weighted sum of all losses, log_vars contains \
all the variables to be sent to the logger.
"""
log_vars = OrderedDict()
for loss_name, loss_value in losses.items():
if isinstance(loss_value, torch.Tensor):
log_vars[loss_name] = loss_value.mean()
elif isinstance(loss_value, list):
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
else:
raise TypeError(
f'{loss_name} is not a tensor or list of tensors')
loss = sum(_value for _key, _value in log_vars.items()
if 'loss' in _key)
log_vars['loss'] = loss
for loss_name, loss_value in log_vars.items():
# reduce loss when distributed training
if dist.is_available() and dist.is_initialized():
loss_value = loss_value.data.clone()
dist.all_reduce(loss_value.div_(dist.get_world_size()))
log_vars[loss_name] = loss_value.item()
return loss, log_vars
def train_step(self, data, optimizer):
"""The iteration step during training.
This method defines an iteration step during training, except for the
back propagation and optimizer updating, which are done in an optimizer
hook. Note that in some complicated cases or models, the whole process
including back propagation and optimizer updating is also defined in
this method, such as GAN.
Args:
data (dict): The output of dataloader.
optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
runner is passed to ``train_step()``. This argument is unused
and reserved.
Returns:
dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \
``num_samples``.
- ``loss`` is a tensor for back propagation, which can be a \
weighted sum of multiple losses.
- ``log_vars`` contains all the variables to be sent to the
logger.
- ``num_samples`` indicates the batch size (when the model is \
DDP, it means the batch size on each GPU), which is used for \
averaging the logs.
"""
losses = self(**data)
loss, log_vars = self._parse_losses(losses)
outputs = dict(
loss=loss, log_vars=log_vars, num_samples=len(data['img_metas']))
return outputs
def val_step(self, data, optimizer):
"""The iteration step during validation.
        This method shares the same signature as :func:`train_step`, but is
        used during val epochs. Note that the evaluation after training epochs
        is not implemented with this method, but with an evaluation hook.
"""
losses = self(**data)
loss, log_vars = self._parse_losses(losses)
outputs = dict(
loss=loss, log_vars=log_vars, num_samples=len(data['img_metas']))
return outputs
def show_result(self,
img,
result,
score_thr=0.3,
bbox_color=(72, 101, 241),
text_color=(72, 101, 241),
mask_color=None,
thickness=2,
font_scale=0.5,
font_size=13,
win_name='',
fig_size=(15, 10),
show=False,
wait_time=0,
out_file=None):
"""Draw `result` over `img`.
Args:
img (str or Tensor): The image to be displayed.
            result (Tensor or tuple): The results to draw over `img`, either
                `bbox_result` or `(bbox_result, segm_result)`.
            score_thr (float, optional): Minimum score of bboxes to be shown.
                Default: 0.3.
            bbox_color (str or tuple(int) or :obj:`Color`): Color of bbox lines.
                The tuple of color should be in BGR order.
                Default: (72, 101, 241)
            text_color (str or tuple(int) or :obj:`Color`): Color of texts.
                The tuple of color should be in BGR order.
                Default: (72, 101, 241)
mask_color (None or str or tuple(int) or :obj:`Color`):
Color of masks. The tuple of color should be in BGR order.
Default: None
thickness (int): Thickness of lines. Default: 2
font_scale (float): Font scales of texts. Default: 0.5
font_size (int): Font size of texts. Default: 13
win_name (str): The window name. Default: ''
fig_size (tuple): Figure size of the pyplot figure.
Default: (15, 10)
wait_time (float): Value of waitKey param.
Default: 0.
show (bool): Whether to show the image.
Default: False.
out_file (str or None): The filename to write the image.
Default: None.
Returns:
            img (ndarray): The image with results drawn on it, returned only
                when neither `show` nor `out_file` is set.
"""
img = mmcv.imread(img)
img = img.copy()
if isinstance(result, tuple):
bbox_result, segm_result = result
if isinstance(segm_result, tuple):
segm_result = segm_result[0] # ms rcnn
else:
bbox_result, segm_result = result, None
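        # bbox_result is a per-class list of (n, 5) arrays in
        # [x1, y1, x2, y2, score] format; flatten it into a single (N, 5)
        # array with a parallel (N,) label array for drawing.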
bboxes = np.vstack(bbox_result)
labels = [
np.full(bbox.shape[0], i, dtype=np.int32)
for i, bbox in enumerate(bbox_result)
]
labels = np.concatenate(labels)
# draw segmentation masks
segms = None
if segm_result is not None and len(labels) > 0: # non empty
segms = mmcv.concat_list(segm_result)
if isinstance(segms[0], torch.Tensor):
segms = torch.stack(segms, dim=0).detach().cpu().numpy()
else:
segms = np.stack(segms, axis=0)
# if out_file specified, do not show image in window
if out_file is not None:
show = False
# draw bounding boxes
imshow_det_bboxes(
img,
bboxes,
labels,
segms,
class_names=self.CLASSES,
score_thr=score_thr,
bbox_color=bbox_color,
text_color=text_color,
mask_color=mask_color,
thickness=thickness,
font_scale=font_scale,
font_size=font_size,
win_name=win_name,
fig_size=fig_size,
show=show,
wait_time=wait_time,
out_file=out_file)
if not (show or out_file):
return img
from ..builder import DETECTORS
from .two_stage import TwoStageDetector
@DETECTORS.register_module()
class CascadeRCNN(TwoStageDetector):
r"""Implementation of `Cascade R-CNN: Delving into High Quality Object
Detection <https://arxiv.org/abs/1906.09756>`_"""
def __init__(self,
backbone,
neck=None,
rpn_head=None,
roi_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(CascadeRCNN, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
def show_result(self, data, result, **kwargs):
"""Show prediction results of the detector."""
if self.with_mask:
ms_bbox_result, ms_segm_result = result
if isinstance(ms_bbox_result, dict):
result = (ms_bbox_result['ensemble'],
ms_segm_result['ensemble'])
else:
if isinstance(result, dict):
result = result['ensemble']
return super(CascadeRCNN, self).show_result(data, result, **kwargs)
import torch
from mmdet.core import bbox2result, bbox_mapping_back
from ..builder import DETECTORS
from .single_stage import SingleStageDetector
@DETECTORS.register_module()
class CornerNet(SingleStageDetector):
"""CornerNet.
This detector is the implementation of the paper `CornerNet: Detecting
Objects as Paired Keypoints <https://arxiv.org/abs/1808.01244>`_ .
"""
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(CornerNet, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
def merge_aug_results(self, aug_results, img_metas):
"""Merge augmented detection bboxes and score.
Args:
aug_results (list[list[Tensor]]): Det_bboxes and det_labels of each
image.
img_metas (list[list[dict]]): Meta information of each image, e.g.,
image size, scaling factor, etc.
Returns:
tuple: (bboxes, labels)
"""
recovered_bboxes, aug_labels = [], []
for bboxes_labels, img_info in zip(aug_results, img_metas):
img_shape = img_info[0]['img_shape'] # using shape before padding
scale_factor = img_info[0]['scale_factor']
flip = img_info[0]['flip']
bboxes, labels = bboxes_labels
bboxes, scores = bboxes[:, :4], bboxes[:, -1:]
bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip)
recovered_bboxes.append(torch.cat([bboxes, scores], dim=-1))
aug_labels.append(labels)
bboxes = torch.cat(recovered_bboxes, dim=0)
labels = torch.cat(aug_labels)
if bboxes.shape[0] > 0:
out_bboxes, out_labels = self.bbox_head._bboxes_nms(
bboxes, labels, self.bbox_head.test_cfg)
else:
out_bboxes, out_labels = bboxes, labels
return out_bboxes, out_labels
def aug_test(self, imgs, img_metas, rescale=False):
"""Augment testing of CornerNet.
Args:
imgs (list[Tensor]): Augmented images.
img_metas (list[list[dict]]): Meta information of each image, e.g.,
image size, scaling factor, etc.
rescale (bool): If True, return boxes in original image space.
Default: False.
Note:
            ``imgs`` must include flipped image pairs.
Returns:
list[list[np.ndarray]]: BBox results of each image and classes.
The outer list corresponds to each image. The inner list
corresponds to each class.
"""
img_inds = list(range(len(imgs)))
assert img_metas[0][0]['flip'] + img_metas[1][0]['flip'], (
'aug test must have flipped image pair')
aug_results = []
for ind, flip_ind in zip(img_inds[0::2], img_inds[1::2]):
img_pair = torch.cat([imgs[ind], imgs[flip_ind]])
x = self.extract_feat(img_pair)
outs = self.bbox_head(x)
bbox_list = self.bbox_head.get_bboxes(
*outs, [img_metas[ind], img_metas[flip_ind]], False, False)
aug_results.append(bbox_list[0])
aug_results.append(bbox_list[1])
bboxes, labels = self.merge_aug_results(aug_results, img_metas)
bbox_results = bbox2result(bboxes, labels, self.bbox_head.num_classes)
return [bbox_results]
from mmdet.core import bbox2result
from ..builder import DETECTORS
from .single_stage import SingleStageDetector
@DETECTORS.register_module()
class DETR(SingleStageDetector):
r"""Implementation of `DETR: End-to-End Object Detection with
Transformers <https://arxiv.org/pdf/2005.12872>`_"""
def __init__(self,
backbone,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(DETR, self).__init__(backbone, None, bbox_head, train_cfg,
test_cfg, pretrained)
def simple_test(self, img, img_metas, rescale=False):
"""Test function without test time augmentation.
Args:
            img (Tensor): Input images of shape (N, C, H, W).
            img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[list[np.ndarray]]: BBox results of each image and classes.
The outer list corresponds to each image. The inner list
corresponds to each class.
"""
batch_size = len(img_metas)
assert batch_size == 1, 'Currently only batch_size 1 for inference ' \
f'mode is supported. Found batch_size {batch_size}.'
x = self.extract_feat(img)
outs = self.bbox_head(x, img_metas)
bbox_list = self.bbox_head.get_bboxes(
*outs, img_metas, rescale=rescale)
bbox_results = [
bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
for det_bboxes, det_labels in bbox_list
]
return bbox_results
from ..builder import DETECTORS
from .two_stage import TwoStageDetector
@DETECTORS.register_module()
class FastRCNN(TwoStageDetector):
"""Implementation of `Fast R-CNN <https://arxiv.org/abs/1504.08083>`_"""
def __init__(self,
backbone,
roi_head,
train_cfg,
test_cfg,
neck=None,
pretrained=None):
super(FastRCNN, self).__init__(
backbone=backbone,
neck=neck,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
def forward_test(self, imgs, img_metas, proposals, **kwargs):
"""
Args:
imgs (List[Tensor]): the outer list indicates test-time
augmentations and inner Tensor should have a shape NxCxHxW,
which contains all images in the batch.
img_metas (List[List[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch.
proposals (List[List[Tensor]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch. The Tensor should have a shape Px4, where
P is the number of proposals.
"""
for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(f'{name} must be a list, but got {type(var)}')
num_augs = len(imgs)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(imgs)}) '
f'!= num of image meta ({len(img_metas)})')
if num_augs == 1:
return self.simple_test(imgs[0], img_metas[0], proposals[0],
**kwargs)
else:
# TODO: support test-time augmentation
            raise NotImplementedError(
                'test-time augmentation is not supported yet')
from ..builder import DETECTORS
from .two_stage import TwoStageDetector
@DETECTORS.register_module()
class FasterRCNN(TwoStageDetector):
"""Implementation of `Faster R-CNN <https://arxiv.org/abs/1506.01497>`_"""
def __init__(self,
backbone,
rpn_head,
roi_head,
train_cfg,
test_cfg,
neck=None,
pretrained=None):
super(FasterRCNN, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
from ..builder import DETECTORS
from .single_stage import SingleStageDetector
@DETECTORS.register_module()
class FCOS(SingleStageDetector):
"""Implementation of `FCOS <https://arxiv.org/abs/1904.01355>`_"""
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(FCOS, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
from ..builder import DETECTORS
from .single_stage import SingleStageDetector
@DETECTORS.register_module()
class FOVEA(SingleStageDetector):
"""Implementation of `FoveaBox <https://arxiv.org/abs/1904.03797>`_"""
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(FOVEA, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
from ..builder import DETECTORS
from .single_stage import SingleStageDetector
@DETECTORS.register_module()
class FSAF(SingleStageDetector):
"""Implementation of `FSAF <https://arxiv.org/abs/1903.00621>`_"""
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(FSAF, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
from ..builder import DETECTORS
from .single_stage import SingleStageDetector
@DETECTORS.register_module()
class GFL(SingleStageDetector):
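    """Implementation of `GFL <https://arxiv.org/abs/2006.04388>`_"""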
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(GFL, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
from ..builder import DETECTORS
from .two_stage import TwoStageDetector
@DETECTORS.register_module()
class GridRCNN(TwoStageDetector):
"""Grid R-CNN.
This detector is the implementation of:
- Grid R-CNN (https://arxiv.org/abs/1811.12030)
- Grid R-CNN Plus: Faster and Better (https://arxiv.org/abs/1906.05688)
"""
def __init__(self,
backbone,
rpn_head,
roi_head,
train_cfg,
test_cfg,
neck=None,
pretrained=None):
super(GridRCNN, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
from ..builder import DETECTORS
from .cascade_rcnn import CascadeRCNN
@DETECTORS.register_module()
class HybridTaskCascade(CascadeRCNN):
"""Implementation of `HTC <https://arxiv.org/abs/1901.07518>`_"""
def __init__(self, **kwargs):
super(HybridTaskCascade, self).__init__(**kwargs)
@property
def with_semantic(self):
"""bool: whether the detector has a semantic head"""
return self.roi_head.with_semantic
from ..builder import DETECTORS
from .two_stage import TwoStageDetector
@DETECTORS.register_module()
class MaskRCNN(TwoStageDetector):
"""Implementation of `Mask R-CNN <https://arxiv.org/abs/1703.06870>`_"""
def __init__(self,
backbone,
rpn_head,
roi_head,
train_cfg,
test_cfg,
neck=None,
pretrained=None):
super(MaskRCNN, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
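# Hedged usage sketch (not part of the original files): detectors registered
# with @DETECTORS.register_module() above are normally built from a config
# dict via the registry rather than instantiated directly, e.g.:
#
#     from mmdet.models import build_detector
#     model = build_detector(cfg.model,
#                            train_cfg=cfg.get('train_cfg'),
#                            test_cfg=cfg.get('test_cfg'))
#
# where `cfg` is an mmcv Config whose `model.type` names one of the classes
# above (e.g. 'FasterRCNN', 'MaskRCNN').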