"example/39_permute/permute_HxWx4_fp16.cpp" did not exist on "b8abc3e33b6c7f1757844539e5a7f2aff37ca392"
Commit a4372e17 authored by huchen

Merge branch 'hepj_work' into 'main'

Add conformer code

See merge request dcutoolkit/deeplearing/dlexamples_new!7
import torch
from ..builder import BBOX_ASSIGNERS
from ..match_costs import build_match_cost
from ..transforms import bbox_cxcywh_to_xyxy
from .assign_result import AssignResult
from .base_assigner import BaseAssigner
try:
from scipy.optimize import linear_sum_assignment
except ImportError:
linear_sum_assignment = None
@BBOX_ASSIGNERS.register_module()
class HungarianAssigner(BaseAssigner):
"""Computes one-to-one matching between predictions and ground truth.
This class computes an assignment between the targets and the predictions
based on the costs. The costs are a weighted sum of three components:
classification cost, regression L1 cost and regression IoU cost. The
targets don't include the no_object, so generally there are more
predictions than targets. After the one-to-one matching, the un-matched
are treated as backgrounds. Thus each query prediction will be assigned
with `0` or a positive integer indicating the ground truth index:
- 0: negative sample, no assigned gt
- positive integer: positive sample, index (1-based) of assigned gt
Args:
cls_cost (dict, optional): Config of the classification matching cost.
Default: dict(type='ClassificationCost', weight=1.).
reg_cost (dict, optional): Config of the regression L1 matching cost.
Default: dict(type='BBoxL1Cost', weight=1.0).
iou_cost (dict, optional): Config of the regression IoU matching cost.
Default: dict(type='IoUCost', iou_mode='giou', weight=1.0).
"""
def __init__(self,
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxL1Cost', weight=1.0),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)):
self.cls_cost = build_match_cost(cls_cost)
self.reg_cost = build_match_cost(reg_cost)
self.iou_cost = build_match_cost(iou_cost)
def assign(self,
bbox_pred,
cls_pred,
gt_bboxes,
gt_labels,
img_meta,
gt_bboxes_ignore=None,
eps=1e-7):
"""Computes one-to-one matching based on the weighted costs.
This method assigns each query prediction to a ground truth or
background. The `assigned_gt_inds` with -1 means don't care,
0 means negative sample, and positive number is the index (1-based)
of assigned gt.
The assignment is done in the following steps, the order matters.
1. assign every prediction to -1
2. compute the weighted costs
3. do Hungarian matching on CPU based on the costs
4. assign all to 0 (background) first, then for each matched pair
between predictions and gts, treat this prediction as foreground
and assign the corresponding gt index (plus 1) to it.
Args:
bbox_pred (Tensor): Predicted boxes with normalized coordinates
(cx, cy, w, h), which are all in range [0, 1]. Shape
[num_query, 4].
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_bboxes (Tensor): Ground truth boxes with unnormalized
coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
img_meta (dict): Meta information for current image.
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`. Default None.
eps (int | float, optional): A value added to the denominator for
numerical stability. Default 1e-7.
Returns:
:obj:`AssignResult`: The assigned result.
"""
assert gt_bboxes_ignore is None, \
'Only case when gt_bboxes_ignore is None is supported.'
num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
# 1. assign -1 by default
assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
assigned_labels = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
img_h, img_w, _ = img_meta['img_shape']
factor = gt_bboxes.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0)
# 2. compute the weighted costs
# classification and bboxcost.
cls_cost = self.cls_cost(cls_pred, gt_labels)
# regression L1 cost
normalize_gt_bboxes = gt_bboxes / factor
reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes)
# regression iou cost; GIoU is used by default, as in the official DETR.
bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
iou_cost = self.iou_cost(bboxes, gt_bboxes)
# weighted sum of above three costs
cost = cls_cost + reg_cost + iou_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = torch.from_numpy(matched_row_inds).to(
bbox_pred.device)
matched_col_inds = torch.from_numpy(matched_col_inds).to(
bbox_pred.device)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
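# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): step 3 of `assign`
# above boils down to a linear assignment problem on the detached
# [num_query, num_gt] cost matrix. The cost values here are made up purely
# for demonstration.
import torch
from scipy.optimize import linear_sum_assignment

_demo_cost = torch.tensor([[0.2, 0.9],    # query 0 vs gt 0 / gt 1
                           [0.8, 0.1],    # query 1
                           [0.5, 0.6]])   # query 2
_rows, _cols = linear_sum_assignment(_demo_cost.numpy())
# _rows -> [0, 1], _cols -> [0, 1]: query 0 is matched to gt 0, query 1 to
# gt 1; query 2 stays unmatched and would be assigned to background (0).
# ---------------------------------------------------------------------------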
import torch
from ..builder import BBOX_ASSIGNERS
from ..iou_calculators import build_iou_calculator
from .assign_result import AssignResult
from .base_assigner import BaseAssigner
@BBOX_ASSIGNERS.register_module()
class MaxIoUAssigner(BaseAssigner):
"""Assign a corresponding gt bbox or background to each bbox.
Each proposal will be assigned `-1` or a semi-positive integer
indicating the ground truth index.
- -1: negative sample, no assigned gt
- semi-positive integer: positive sample, index (0-based) of assigned gt
Args:
pos_iou_thr (float): IoU threshold for positive bboxes.
neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
min_pos_iou (float): Minimum iou for a bbox to be considered as a
positive bbox. Positive samples can have smaller IoU than
pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
gt_max_assign_all (bool): Whether to assign all bboxes with the same
highest overlap with some gt to that gt.
ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
`gt_bboxes_ignore` is specified). Negative values mean not
ignoring any bboxes.
ignore_wrt_candidates (bool): Whether to compute the iof between
`bboxes` and `gt_bboxes_ignore`, or the contrary.
match_low_quality (bool): Whether to allow low quality matches. This is
usually allowed for RPN and single stage detectors, but not allowed
in the second stage. Details are demonstrated in Step 4.
gpu_assign_thr (int): The upper bound of the number of GT for GPU
assign. When the number of gt is above this threshold, will assign
on CPU device. Negative values mean not assign on CPU.
"""
def __init__(self,
pos_iou_thr,
neg_iou_thr,
min_pos_iou=.0,
gt_max_assign_all=True,
ignore_iof_thr=-1,
ignore_wrt_candidates=True,
match_low_quality=True,
gpu_assign_thr=-1,
iou_calculator=dict(type='BboxOverlaps2D')):
self.pos_iou_thr = pos_iou_thr
self.neg_iou_thr = neg_iou_thr
self.min_pos_iou = min_pos_iou
self.gt_max_assign_all = gt_max_assign_all
self.ignore_iof_thr = ignore_iof_thr
self.ignore_wrt_candidates = ignore_wrt_candidates
self.gpu_assign_thr = gpu_assign_thr
self.match_low_quality = match_low_quality
self.iou_calculator = build_iou_calculator(iou_calculator)
def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
"""Assign gt to bboxes.
This method assigns a gt bbox to every bbox (proposal/anchor); each bbox
will be assigned with -1, or a semi-positive number. -1 means negative
sample, semi-positive number is the index (0-based) of assigned gt.
The assignment is done in the following steps, the order matters.
1. assign every bbox to -1 by default
2. assign proposals whose iou with all gts < neg_iou_thr to 0
3. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
assign it to that gt
4. for each gt bbox, assign its nearest proposals (may be more than
one) to itself
Args:
bboxes (Tensor): Bounding boxes to be assigned, shape(n, 4).
gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
Returns:
:obj:`AssignResult`: The assign result.
Example:
>>> self = MaxIoUAssigner(0.5, 0.5)
>>> bboxes = torch.Tensor([[0, 0, 10, 10], [10, 10, 20, 20]])
>>> gt_bboxes = torch.Tensor([[0, 0, 10, 9]])
>>> assign_result = self.assign(bboxes, gt_bboxes)
>>> expected_gt_inds = torch.LongTensor([1, 0])
>>> assert torch.all(assign_result.gt_inds == expected_gt_inds)
"""
assign_on_cpu = True if (self.gpu_assign_thr > 0) and (
gt_bboxes.shape[0] > self.gpu_assign_thr) else False
# compute overlap and assign gt on CPU when number of GT is large
if assign_on_cpu:
device = bboxes.device
bboxes = bboxes.cpu()
gt_bboxes = gt_bboxes.cpu()
if gt_bboxes_ignore is not None:
gt_bboxes_ignore = gt_bboxes_ignore.cpu()
if gt_labels is not None:
gt_labels = gt_labels.cpu()
overlaps = self.iou_calculator(gt_bboxes, bboxes)
if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None
and gt_bboxes_ignore.numel() > 0 and bboxes.numel() > 0):
if self.ignore_wrt_candidates:
ignore_overlaps = self.iou_calculator(
bboxes, gt_bboxes_ignore, mode='iof')
ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
else:
ignore_overlaps = self.iou_calculator(
gt_bboxes_ignore, bboxes, mode='iof')
ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1
assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
if assign_on_cpu:
assign_result.gt_inds = assign_result.gt_inds.to(device)
assign_result.max_overlaps = assign_result.max_overlaps.to(device)
if assign_result.labels is not None:
assign_result.labels = assign_result.labels.to(device)
return assign_result
def assign_wrt_overlaps(self, overlaps, gt_labels=None):
"""Assign w.r.t. the overlaps of bboxes with gts.
Args:
overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes,
shape(k, n).
gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ).
Returns:
:obj:`AssignResult`: The assign result.
"""
num_gts, num_bboxes = overlaps.size(0), overlaps.size(1)
# 1. assign -1 by default
assigned_gt_inds = overlaps.new_full((num_bboxes, ),
-1,
dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
max_overlaps = overlaps.new_zeros((num_bboxes, ))
if num_gts == 0:
# No truth, assign everything to background
assigned_gt_inds[:] = 0
if gt_labels is None:
assigned_labels = None
else:
assigned_labels = overlaps.new_full((num_bboxes, ),
-1,
dtype=torch.long)
return AssignResult(
num_gts,
assigned_gt_inds,
max_overlaps,
labels=assigned_labels)
# for each anchor, which gt best overlaps with it
# for each anchor, the max iou of all gts
max_overlaps, argmax_overlaps = overlaps.max(dim=0)
# for each gt, which anchor best overlaps with it
# for each gt, the max iou of all proposals
gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1)
# 2. assign negative: below
# the negative inds are set to be 0
if isinstance(self.neg_iou_thr, float):
assigned_gt_inds[(max_overlaps >= 0)
& (max_overlaps < self.neg_iou_thr)] = 0
elif isinstance(self.neg_iou_thr, tuple):
assert len(self.neg_iou_thr) == 2
assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0])
& (max_overlaps < self.neg_iou_thr[1])] = 0
# 3. assign positive: above positive IoU threshold
pos_inds = max_overlaps >= self.pos_iou_thr
assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
if self.match_low_quality:
# Low-quality matching will overwrite the assigned_gt_inds assigned
# in Step 3. Thus, the assigned gt might not be the best one for
# prediction.
# For example, if bbox A has 0.9 and 0.8 iou with GT bbox 1 & 2,
# gt bbox 1 will be assigned as the best target for bbox A in step 3.
# However, if GT bbox 2's gt_argmax_overlaps = A, bbox A's
# assigned_gt_inds will be overwritten to be gt bbox 2.
# This might be the reason that it is not used in ROI Heads.
for i in range(num_gts):
if gt_max_overlaps[i] >= self.min_pos_iou:
if self.gt_max_assign_all:
max_iou_inds = overlaps[i, :] == gt_max_overlaps[i]
assigned_gt_inds[max_iou_inds] = i + 1
else:
assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1
if gt_labels is not None:
assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
pos_inds = torch.nonzero(
assigned_gt_inds > 0, as_tuple=False).squeeze()
if pos_inds.numel() > 0:
assigned_labels[pos_inds] = gt_labels[
assigned_gt_inds[pos_inds] - 1]
else:
assigned_labels = None
return AssignResult(
num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
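# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how steps 2-3 of
# `assign_wrt_overlaps` act on a toy (num_gt=2, num_bbox=3) overlap matrix.
# The thresholds (neg_iou_thr=0.3, pos_iou_thr=0.5) and IoU values are
# invented for demonstration.
import torch

_overlaps = torch.tensor([[0.70, 0.30, 0.10],    # gt 0 vs bboxes 0..2
                          [0.20, 0.45, 0.05]])   # gt 1 vs bboxes 0..2
_max_overlaps, _argmax_overlaps = _overlaps.max(dim=0)   # best gt per bbox
_assigned = _overlaps.new_full((3, ), -1, dtype=torch.long)
_assigned[(_max_overlaps >= 0) & (_max_overlaps < 0.3)] = 0   # step 2: negatives
_pos = _max_overlaps >= 0.5                                   # step 3: positives
_assigned[_pos] = _argmax_overlaps[_pos] + 1
# _assigned -> tensor([1, -1, 0]): bbox 0 is matched to gt index 1 (1-based),
# bbox 1 is ignored (0.45 sits between the thresholds), bbox 2 is background.
# ---------------------------------------------------------------------------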
import torch
from ..builder import BBOX_ASSIGNERS
from .assign_result import AssignResult
from .base_assigner import BaseAssigner
@BBOX_ASSIGNERS.register_module()
class PointAssigner(BaseAssigner):
"""Assign a corresponding gt bbox or background to each point.
Each point will be assigned `0` or a positive integer
indicating the ground truth index.
- 0: negative sample, no assigned gt
- positive integer: positive sample, index (1-based) of assigned gt
"""
def __init__(self, scale=4, pos_num=3):
self.scale = scale
self.pos_num = pos_num
def assign(self, points, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
"""Assign gt to points.
This method assigns a gt bbox to every point. Each point will be
assigned with 0 or a positive number: 0 means negative sample (no
assigned gt), and a positive number is the index (1-based) of the
assigned gt.
The assignment is done in the following steps, the order matters.
1. assign every point to 0 (background)
2. A point is assigned to some gt bbox if
(i) the point is within the k closest points to the gt bbox
(ii) the distance between this point and the gt is smaller than
other gt bboxes
Args:
points (Tensor): points to be assigned, shape (n, 3), where the last
dimension stands for (x, y, stride).
gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
NOTE: currently unused.
gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
Returns:
:obj:`AssignResult`: The assign result.
"""
num_points = points.shape[0]
num_gts = gt_bboxes.shape[0]
if num_gts == 0 or num_points == 0:
# If no truth assign everything to the background
assigned_gt_inds = points.new_full((num_points, ),
0,
dtype=torch.long)
if gt_labels is None:
assigned_labels = None
else:
assigned_labels = points.new_full((num_points, ),
-1,
dtype=torch.long)
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
points_xy = points[:, :2]
points_stride = points[:, 2]
points_lvl = torch.log2(
points_stride).int() # [3...,4...,5...,6...,7...]
lvl_min, lvl_max = points_lvl.min(), points_lvl.max()
# assign gt box
gt_bboxes_xy = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2
gt_bboxes_wh = (gt_bboxes[:, 2:] - gt_bboxes[:, :2]).clamp(min=1e-6)
scale = self.scale
gt_bboxes_lvl = ((torch.log2(gt_bboxes_wh[:, 0] / scale) +
torch.log2(gt_bboxes_wh[:, 1] / scale)) / 2).int()
gt_bboxes_lvl = torch.clamp(gt_bboxes_lvl, min=lvl_min, max=lvl_max)
# stores the assigned gt index of each point
assigned_gt_inds = points.new_zeros((num_points, ), dtype=torch.long)
# stores the assigned gt dist (to this point) of each point
assigned_gt_dist = points.new_full((num_points, ), float('inf'))
points_range = torch.arange(points.shape[0])
for idx in range(num_gts):
gt_lvl = gt_bboxes_lvl[idx]
# get the index of points in this level
lvl_idx = gt_lvl == points_lvl
points_index = points_range[lvl_idx]
# get the points in this level
lvl_points = points_xy[lvl_idx, :]
# get the center point of gt
gt_point = gt_bboxes_xy[[idx], :]
# get width and height of gt
gt_wh = gt_bboxes_wh[[idx], :]
# compute the distance between gt center and
# all points in this level
points_gt_dist = ((lvl_points - gt_point) / gt_wh).norm(dim=1)
# find the nearest k points to gt center in this level
min_dist, min_dist_index = torch.topk(
points_gt_dist, self.pos_num, largest=False)
# the index of nearest k points to gt center in this level
min_dist_points_index = points_index[min_dist_index]
# The less_than_recorded_index stores the index
# of min_dist that is less than the assigned_gt_dist, where
# assigned_gt_dist stores the dist from the previously assigned gt
# (if any) to each point.
less_than_recorded_index = min_dist < assigned_gt_dist[
min_dist_points_index]
# The min_dist_points_index stores the index of points satisfy:
# (1) it is k nearest to current gt center in this level.
# (2) it is closer to current gt center than other gt center.
min_dist_points_index = min_dist_points_index[
less_than_recorded_index]
# assign the result
assigned_gt_inds[min_dist_points_index] = idx + 1
assigned_gt_dist[min_dist_points_index] = min_dist[
less_than_recorded_index]
if gt_labels is not None:
assigned_labels = assigned_gt_inds.new_full((num_points, ), -1)
pos_inds = torch.nonzero(
assigned_gt_inds > 0, as_tuple=False).squeeze()
if pos_inds.numel() > 0:
assigned_labels[pos_inds] = gt_labels[
assigned_gt_inds[pos_inds] - 1]
else:
assigned_labels = None
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
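# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how `assign` above maps
# a gt box to a pyramid level before picking the `pos_num` closest points.
# The width/height and scale below are made up for demonstration.
import torch

_gt_wh = torch.tensor([[64., 32.]])   # made-up gt width/height
_scale = 4
_gt_lvl = ((torch.log2(_gt_wh[:, 0] / _scale) +
            torch.log2(_gt_wh[:, 1] / _scale)) / 2).int()
# log2(64 / 4) = 4, log2(32 / 4) = 3 -> (4 + 3) / 2 = 3.5 -> level 3 after
# truncation; the level is then clamped to [lvl_min, lvl_max] of the points.
# ---------------------------------------------------------------------------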
import torch
from mmdet.core import anchor_inside_flags
from ..builder import BBOX_ASSIGNERS
from .assign_result import AssignResult
from .base_assigner import BaseAssigner
def calc_region(bbox, ratio, stride, featmap_size=None):
"""Calculate region of the box defined by the ratio, the ratio is from the
center of the box to every edge."""
# project bbox on the feature
f_bbox = bbox / stride
x1 = torch.round((1 - ratio) * f_bbox[0] + ratio * f_bbox[2])
y1 = torch.round((1 - ratio) * f_bbox[1] + ratio * f_bbox[3])
x2 = torch.round(ratio * f_bbox[0] + (1 - ratio) * f_bbox[2])
y2 = torch.round(ratio * f_bbox[1] + (1 - ratio) * f_bbox[3])
if featmap_size is not None:
x1 = x1.clamp(min=0, max=featmap_size[1])
y1 = y1.clamp(min=0, max=featmap_size[0])
x2 = x2.clamp(min=0, max=featmap_size[1])
y2 = y2.clamp(min=0, max=featmap_size[0])
return (x1, y1, x2, y2)
def anchor_ctr_inside_region_flags(anchors, stride, region):
"""Get the flag indicate whether anchor centers are inside regions."""
x1, y1, x2, y2 = region
f_anchors = anchors / stride
x = (f_anchors[:, 0] + f_anchors[:, 2]) * 0.5
y = (f_anchors[:, 1] + f_anchors[:, 3]) * 0.5
flags = (x >= x1) & (x <= x2) & (y >= y1) & (y <= y2)
return flags
@BBOX_ASSIGNERS.register_module()
class RegionAssigner(BaseAssigner):
"""Assign a corresponding gt bbox or background to each bbox.
Each proposal will be assigned `-1`, `0`, or a positive integer
indicating the ground truth index.
- -1: don't care
- 0: negative sample, no assigned gt
- positive integer: positive sample, index (1-based) of assigned gt
Args:
center_ratio (float): Ratio of the region in the center of the bbox
used to define positive samples.
ignore_ratio (float): Ratio of the region used to define ignore samples.
"""
def __init__(self, center_ratio=0.2, ignore_ratio=0.5):
self.center_ratio = center_ratio
self.ignore_ratio = ignore_ratio
def assign(self,
mlvl_anchors,
mlvl_valid_flags,
gt_bboxes,
img_meta,
featmap_sizes,
anchor_scale,
anchor_strides,
gt_bboxes_ignore=None,
gt_labels=None,
allowed_border=0):
"""Assign gt to anchors.
This method assigns a gt bbox to every bbox (proposal/anchor); each bbox
will be assigned with -1, 0, or a positive number. -1 means don't care,
0 means negative sample, positive number is the index (1-based) of
assigned gt.
The assignment is done in the following steps, the order matters.
1. Assign every anchor to 0 (negative)
For each gt_bboxes:
2. Compute ignore flags based on ignore_region then
assign -1 to anchors w.r.t. ignore flags
3. Compute pos flags based on center_region then
assign gt_bboxes to anchors w.r.t. pos flags
4. Compute ignore flags based on adjacent anchor lvl then
assign -1 to anchors w.r.t. ignore flags
5. Assign anchor outside of image to -1
Args:
mlvl_anchors (list[Tensor]): Multi level anchors.
mlvl_valid_flags (list[Tensor]): Multi level valid flags.
gt_bboxes (Tensor): Ground truth bboxes of the image, shape (k, 4).
img_meta (dict): Meta info of image.
featmap_sizes (list[tuple]): Feature map size of each level.
anchor_scale (int): Scale of the anchor.
anchor_strides (list[int]): Stride of the anchor.
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
allowed_border (int, optional): The border to allow the valid
anchor. Defaults to 0.
Returns:
:obj:`AssignResult`: The assign result.
"""
# TODO support gt_bboxes_ignore
if gt_bboxes_ignore is not None:
raise NotImplementedError
if gt_bboxes.shape[0] == 0:
raise ValueError('No gt bboxes')
num_gts = gt_bboxes.shape[0]
num_lvls = len(mlvl_anchors)
r1 = (1 - self.center_ratio) / 2
r2 = (1 - self.ignore_ratio) / 2
scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
(gt_bboxes[:, 3] - gt_bboxes[:, 1]))
min_anchor_size = scale.new_full(
(1, ), float(anchor_scale * anchor_strides[0]))
target_lvls = torch.floor(
torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
# 1. assign 0 (negative) by default
mlvl_assigned_gt_inds = []
mlvl_ignore_flags = []
for lvl in range(num_lvls):
h, w = featmap_sizes[lvl]
assert h * w == mlvl_anchors[lvl].shape[0]
assigned_gt_inds = gt_bboxes.new_full((h * w, ),
0,
dtype=torch.long)
ignore_flags = torch.zeros_like(assigned_gt_inds)
mlvl_assigned_gt_inds.append(assigned_gt_inds)
mlvl_ignore_flags.append(ignore_flags)
for gt_id in range(num_gts):
lvl = target_lvls[gt_id].item()
featmap_size = featmap_sizes[lvl]
stride = anchor_strides[lvl]
anchors = mlvl_anchors[lvl]
gt_bbox = gt_bboxes[gt_id, :4]
# Compute regions
ignore_region = calc_region(gt_bbox, r2, stride, featmap_size)
ctr_region = calc_region(gt_bbox, r1, stride, featmap_size)
# 2. Assign -1 to ignore flags
ignore_flags = anchor_ctr_inside_region_flags(
anchors, stride, ignore_region)
mlvl_assigned_gt_inds[lvl][ignore_flags] = -1
# 3. Assign gt_bboxes to pos flags
pos_flags = anchor_ctr_inside_region_flags(anchors, stride,
ctr_region)
mlvl_assigned_gt_inds[lvl][pos_flags] = gt_id + 1
# 4. Assign -1 to ignore adjacent lvl
if lvl > 0:
d_lvl = lvl - 1
d_anchors = mlvl_anchors[d_lvl]
d_featmap_size = featmap_sizes[d_lvl]
d_stride = anchor_strides[d_lvl]
d_ignore_region = calc_region(gt_bbox, r2, d_stride,
d_featmap_size)
ignore_flags = anchor_ctr_inside_region_flags(
d_anchors, d_stride, d_ignore_region)
mlvl_ignore_flags[d_lvl][ignore_flags] = 1
if lvl < num_lvls - 1:
u_lvl = lvl + 1
u_anchors = mlvl_anchors[u_lvl]
u_featmap_size = featmap_sizes[u_lvl]
u_stride = anchor_strides[u_lvl]
u_ignore_region = calc_region(gt_bbox, r2, u_stride,
u_featmap_size)
ignore_flags = anchor_ctr_inside_region_flags(
u_anchors, u_stride, u_ignore_region)
mlvl_ignore_flags[u_lvl][ignore_flags] = 1
# 4. (cont.) Assign -1 to ignore adjacent lvl
for lvl in range(num_lvls):
ignore_flags = mlvl_ignore_flags[lvl]
mlvl_assigned_gt_inds[lvl][ignore_flags] = -1
# 5. Assign -1 to anchor outside of image
flat_assigned_gt_inds = torch.cat(mlvl_assigned_gt_inds)
flat_anchors = torch.cat(mlvl_anchors)
flat_valid_flags = torch.cat(mlvl_valid_flags)
assert (flat_assigned_gt_inds.shape[0] == flat_anchors.shape[0] ==
flat_valid_flags.shape[0])
inside_flags = anchor_inside_flags(flat_anchors, flat_valid_flags,
img_meta['img_shape'],
allowed_border)
outside_flags = ~inside_flags
flat_assigned_gt_inds[outside_flags] = -1
if gt_labels is not None:
assigned_labels = torch.zeros_like(flat_assigned_gt_inds)
pos_flags = flat_assigned_gt_inds > 0
assigned_labels[pos_flags] = gt_labels[
flat_assigned_gt_inds[pos_flags] - 1]
else:
assigned_labels = None
return AssignResult(
num_gts, flat_assigned_gt_inds, None, labels=assigned_labels)
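# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): what `calc_region`
# returns for a made-up gt box on a stride-8 feature map. With ratio=0.4 the
# kept region shrinks towards the box centre before being rounded to grid
# coordinates.
import torch

_demo_bbox = torch.tensor([8., 8., 40., 24.])
_region = calc_region(_demo_bbox, 0.4, 8)
# On the stride-8 grid the box spans [1, 5] x [1, 3]; the returned region is
# (round(2.6), round(1.8), round(3.4), round(2.2)) = (3, 2, 3, 2).
# ---------------------------------------------------------------------------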
from mmcv.utils import Registry, build_from_cfg
BBOX_ASSIGNERS = Registry('bbox_assigner')
BBOX_SAMPLERS = Registry('bbox_sampler')
BBOX_CODERS = Registry('bbox_coder')
def build_assigner(cfg, **default_args):
"""Builder of box assigner."""
return build_from_cfg(cfg, BBOX_ASSIGNERS, default_args)
def build_sampler(cfg, **default_args):
"""Builder of box sampler."""
return build_from_cfg(cfg, BBOX_SAMPLERS, default_args)
def build_bbox_coder(cfg, **default_args):
"""Builder of box coder."""
return build_from_cfg(cfg, BBOX_CODERS, default_args)
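# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how `build_from_cfg`
# turns a config dict into an object -- the `type` key selects the registered
# class and the remaining keys become constructor kwargs. A throwaway registry
# and class are used so the example does not depend on the modules above.
from mmcv.utils import Registry, build_from_cfg

_DEMO_REGISTRY = Registry('demo')


@_DEMO_REGISTRY.register_module()
class _DemoAssigner:

    def __init__(self, pos_iou_thr=0.5):
        self.pos_iou_thr = pos_iou_thr


_demo_obj = build_from_cfg(
    dict(type='_DemoAssigner', pos_iou_thr=0.7), _DEMO_REGISTRY)
# _demo_obj.pos_iou_thr -> 0.7
# ---------------------------------------------------------------------------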
from .base_bbox_coder import BaseBBoxCoder
from .bucketing_bbox_coder import BucketingBBoxCoder
from .delta_xywh_bbox_coder import DeltaXYWHBBoxCoder
from .legacy_delta_xywh_bbox_coder import LegacyDeltaXYWHBBoxCoder
from .pseudo_bbox_coder import PseudoBBoxCoder
from .tblr_bbox_coder import TBLRBBoxCoder
from .yolo_bbox_coder import YOLOBBoxCoder
__all__ = [
'BaseBBoxCoder', 'PseudoBBoxCoder', 'DeltaXYWHBBoxCoder',
'LegacyDeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'YOLOBBoxCoder',
'BucketingBBoxCoder'
]
from abc import ABCMeta, abstractmethod
class BaseBBoxCoder(metaclass=ABCMeta):
"""Base bounding box coder."""
def __init__(self, **kwargs):
pass
@abstractmethod
def encode(self, bboxes, gt_bboxes):
"""Encode deltas between bboxes and ground truth boxes."""
pass
@abstractmethod
def decode(self, bboxes, bboxes_pred):
"""Decode the predicted bboxes according to prediction and base
boxes."""
pass
import numpy as np
import torch
import torch.nn.functional as F
from ..builder import BBOX_CODERS
from ..transforms import bbox_rescale
from .base_bbox_coder import BaseBBoxCoder
@BBOX_CODERS.register_module()
class BucketingBBoxCoder(BaseBBoxCoder):
"""Bucketing BBox Coder for Side-Aware Bounday Localization (SABL).
Boundary Localization with Bucketing and Bucketing Guided Rescoring
are implemented here.
Please refer to https://arxiv.org/abs/1912.04260 for more details.
Args:
num_buckets (int): Number of buckets.
scale_factor (int): Scale factor of proposals to generate buckets.
offset_topk (int): Topk buckets are used to generate
bucket fine regression targets. Defaults to 2.
offset_upperbound (float): Offset upperbound to generate
bucket fine regression targets.
To avoid too large offset displacements. Defaults to 1.0.
cls_ignore_neighbor (bool): Ignore second nearest bucket or Not.
Defaults to True.
clip_border (bool, optional): Whether clip the objects outside the
border of the image. Defaults to True.
"""
def __init__(self,
num_buckets,
scale_factor,
offset_topk=2,
offset_upperbound=1.0,
cls_ignore_neighbor=True,
clip_border=True):
super(BucketingBBoxCoder, self).__init__()
self.num_buckets = num_buckets
self.scale_factor = scale_factor
self.offset_topk = offset_topk
self.offset_upperbound = offset_upperbound
self.cls_ignore_neighbor = cls_ignore_neighbor
self.clip_border = clip_border
def encode(self, bboxes, gt_bboxes):
"""Get bucketing estimation and fine regression targets during
training.
Args:
bboxes (torch.Tensor): source boxes, e.g., object proposals.
gt_bboxes (torch.Tensor): target of the transformation, e.g.,
ground truth boxes.
Returns:
encoded_bboxes(tuple[Tensor]): bucketing estimation
and fine regression targets and weights
"""
assert bboxes.size(0) == gt_bboxes.size(0)
assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
encoded_bboxes = bbox2bucket(bboxes, gt_bboxes, self.num_buckets,
self.scale_factor, self.offset_topk,
self.offset_upperbound,
self.cls_ignore_neighbor)
return encoded_bboxes
def decode(self, bboxes, pred_bboxes, max_shape=None):
"""Apply transformation `pred_bboxes` to `boxes`.
Args:
boxes (torch.Tensor): Basic boxes.
pred_bboxes (torch.Tensor): Predictions for bucketing estimation
and fine regression
max_shape (tuple[int], optional): Maximum shape of boxes.
Defaults to None.
Returns:
torch.Tensor: Decoded boxes.
"""
assert len(pred_bboxes) == 2
cls_preds, offset_preds = pred_bboxes
assert cls_preds.size(0) == bboxes.size(0) and offset_preds.size(
0) == bboxes.size(0)
decoded_bboxes = bucket2bbox(bboxes, cls_preds, offset_preds,
self.num_buckets, self.scale_factor,
max_shape, self.clip_border)
return decoded_bboxes
def generat_buckets(proposals, num_buckets, scale_factor=1.0):
"""Generate buckets w.r.t bucket number and scale factor of proposals.
Args:
proposals (Tensor): Shape (n, 4)
num_buckets (int): Number of buckets.
scale_factor (float): Scale factor to rescale proposals.
Returns:
tuple[Tensor]: (bucket_w, bucket_h, l_buckets, r_buckets,
t_buckets, d_buckets)
- bucket_w: Width of buckets on x-axis. Shape (n, ).
- bucket_h: Height of buckets on y-axis. Shape (n, ).
- l_buckets: Left buckets. Shape (n, ceil(side_num/2)).
- r_buckets: Right buckets. Shape (n, ceil(side_num/2)).
- t_buckets: Top buckets. Shape (n, ceil(side_num/2)).
- d_buckets: Down buckets. Shape (n, ceil(side_num/2)).
"""
proposals = bbox_rescale(proposals, scale_factor)
# number of buckets in each side
side_num = int(np.ceil(num_buckets / 2.0))
pw = proposals[..., 2] - proposals[..., 0]
ph = proposals[..., 3] - proposals[..., 1]
px1 = proposals[..., 0]
py1 = proposals[..., 1]
px2 = proposals[..., 2]
py2 = proposals[..., 3]
bucket_w = pw / num_buckets
bucket_h = ph / num_buckets
# left buckets
l_buckets = px1[:, None] + (0.5 + torch.arange(
0, side_num).to(proposals).float())[None, :] * bucket_w[:, None]
# right buckets
r_buckets = px2[:, None] - (0.5 + torch.arange(
0, side_num).to(proposals).float())[None, :] * bucket_w[:, None]
# top buckets
t_buckets = py1[:, None] + (0.5 + torch.arange(
0, side_num).to(proposals).float())[None, :] * bucket_h[:, None]
# down buckets
d_buckets = py2[:, None] - (0.5 + torch.arange(
0, side_num).to(proposals).float())[None, :] * bucket_h[:, None]
return bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, d_buckets
def bbox2bucket(proposals,
gt,
num_buckets,
scale_factor,
offset_topk=2,
offset_upperbound=1.0,
cls_ignore_neighbor=True):
"""Generate buckets estimation and fine regression targets.
Args:
proposals (Tensor): Shape (n, 4)
gt (Tensor): Shape (n, 4)
num_buckets (int): Number of buckets.
scale_factor (float): Scale factor to rescale proposals.
offset_topk (int): Topk buckets are used to generate
bucket fine regression targets. Defaults to 2.
offset_upperbound (float): Offset allowance to generate
bucket fine regression targets.
To avoid too large offset displacements. Defaults to 1.0.
cls_ignore_neighbor (bool): Ignore second nearest bucket or Not.
Defaults to True.
Returns:
tuple[Tensor]: (offsets, offsets_weights, bucket_labels, cls_weights).
- offsets: Fine regression targets. \
Shape (n, num_buckets*2).
- offsets_weights: Fine regression weights. \
Shape (n, num_buckets*2).
- bucket_labels: Bucketing estimation labels. \
Shape (n, num_buckets*2).
- cls_weights: Bucketing estimation weights. \
Shape (n, num_buckets*2).
"""
assert proposals.size() == gt.size()
# generate buckets
proposals = proposals.float()
gt = gt.float()
(bucket_w, bucket_h, l_buckets, r_buckets, t_buckets,
d_buckets) = generat_buckets(proposals, num_buckets, scale_factor)
gx1 = gt[..., 0]
gy1 = gt[..., 1]
gx2 = gt[..., 2]
gy2 = gt[..., 3]
# generate offset targets and weights
# offsets from buckets to gts
l_offsets = (l_buckets - gx1[:, None]) / bucket_w[:, None]
r_offsets = (r_buckets - gx2[:, None]) / bucket_w[:, None]
t_offsets = (t_buckets - gy1[:, None]) / bucket_h[:, None]
d_offsets = (d_buckets - gy2[:, None]) / bucket_h[:, None]
# select top-k nearest buckets
l_topk, l_label = l_offsets.abs().topk(
offset_topk, dim=1, largest=False, sorted=True)
r_topk, r_label = r_offsets.abs().topk(
offset_topk, dim=1, largest=False, sorted=True)
t_topk, t_label = t_offsets.abs().topk(
offset_topk, dim=1, largest=False, sorted=True)
d_topk, d_label = d_offsets.abs().topk(
offset_topk, dim=1, largest=False, sorted=True)
offset_l_weights = l_offsets.new_zeros(l_offsets.size())
offset_r_weights = r_offsets.new_zeros(r_offsets.size())
offset_t_weights = t_offsets.new_zeros(t_offsets.size())
offset_d_weights = d_offsets.new_zeros(d_offsets.size())
inds = torch.arange(0, proposals.size(0)).to(proposals).long()
# generate offset weights of top-k nearest buckets
for k in range(offset_topk):
if k >= 1:
offset_l_weights[inds, l_label[:,
k]] = (l_topk[:, k] <
offset_upperbound).float()
offset_r_weights[inds, r_label[:,
k]] = (r_topk[:, k] <
offset_upperbound).float()
offset_t_weights[inds, t_label[:,
k]] = (t_topk[:, k] <
offset_upperbound).float()
offset_d_weights[inds, d_label[:,
k]] = (d_topk[:, k] <
offset_upperbound).float()
else:
offset_l_weights[inds, l_label[:, k]] = 1.0
offset_r_weights[inds, r_label[:, k]] = 1.0
offset_t_weights[inds, t_label[:, k]] = 1.0
offset_d_weights[inds, d_label[:, k]] = 1.0
offsets = torch.cat([l_offsets, r_offsets, t_offsets, d_offsets], dim=-1)
offsets_weights = torch.cat([
offset_l_weights, offset_r_weights, offset_t_weights, offset_d_weights
],
dim=-1)
# generate bucket labels and weight
side_num = int(np.ceil(num_buckets / 2.0))
labels = torch.stack(
[l_label[:, 0], r_label[:, 0], t_label[:, 0], d_label[:, 0]], dim=-1)
batch_size = labels.size(0)
bucket_labels = F.one_hot(labels.view(-1), side_num).view(batch_size,
-1).float()
bucket_cls_l_weights = (l_offsets.abs() < 1).float()
bucket_cls_r_weights = (r_offsets.abs() < 1).float()
bucket_cls_t_weights = (t_offsets.abs() < 1).float()
bucket_cls_d_weights = (d_offsets.abs() < 1).float()
bucket_cls_weights = torch.cat([
bucket_cls_l_weights, bucket_cls_r_weights, bucket_cls_t_weights,
bucket_cls_d_weights
],
dim=-1)
# ignore second nearest buckets for cls if necessary
if cls_ignore_neighbor:
bucket_cls_weights = (~((bucket_cls_weights == 1) &
(bucket_labels == 0))).float()
else:
bucket_cls_weights[:] = 1.0
return offsets, offsets_weights, bucket_labels, bucket_cls_weights
def bucket2bbox(proposals,
cls_preds,
offset_preds,
num_buckets,
scale_factor=1.0,
max_shape=None,
clip_border=True):
"""Apply bucketing estimation (cls preds) and fine regression (offset
preds) to generate det bboxes.
Args:
proposals (Tensor): Boxes to be transformed. Shape (n, 4)
cls_preds (Tensor): bucketing estimation. Shape (n, num_buckets*2).
offset_preds (Tensor): fine regression. Shape (n, num_buckets*2).
num_buckets (int): Number of buckets.
scale_factor (float): Scale factor to rescale proposals.
max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W)
clip_border (bool, optional): Whether clip the objects outside the
border of the image. Defaults to True.
Returns:
tuple[Tensor]: (bboxes, loc_confidence).
- bboxes: predicted bboxes. Shape (n, 4)
- loc_confidence: localization confidence of predicted bboxes.
Shape (n,).
"""
side_num = int(np.ceil(num_buckets / 2.0))
cls_preds = cls_preds.view(-1, side_num)
offset_preds = offset_preds.view(-1, side_num)
scores = F.softmax(cls_preds, dim=1)
score_topk, score_label = scores.topk(2, dim=1, largest=True, sorted=True)
rescaled_proposals = bbox_rescale(proposals, scale_factor)
pw = rescaled_proposals[..., 2] - rescaled_proposals[..., 0]
ph = rescaled_proposals[..., 3] - rescaled_proposals[..., 1]
px1 = rescaled_proposals[..., 0]
py1 = rescaled_proposals[..., 1]
px2 = rescaled_proposals[..., 2]
py2 = rescaled_proposals[..., 3]
bucket_w = pw / num_buckets
bucket_h = ph / num_buckets
score_inds_l = score_label[0::4, 0]
score_inds_r = score_label[1::4, 0]
score_inds_t = score_label[2::4, 0]
score_inds_d = score_label[3::4, 0]
l_buckets = px1 + (0.5 + score_inds_l.float()) * bucket_w
r_buckets = px2 - (0.5 + score_inds_r.float()) * bucket_w
t_buckets = py1 + (0.5 + score_inds_t.float()) * bucket_h
d_buckets = py2 - (0.5 + score_inds_d.float()) * bucket_h
offsets = offset_preds.view(-1, 4, side_num)
inds = torch.arange(proposals.size(0)).to(proposals).long()
l_offsets = offsets[:, 0, :][inds, score_inds_l]
r_offsets = offsets[:, 1, :][inds, score_inds_r]
t_offsets = offsets[:, 2, :][inds, score_inds_t]
d_offsets = offsets[:, 3, :][inds, score_inds_d]
x1 = l_buckets - l_offsets * bucket_w
x2 = r_buckets - r_offsets * bucket_w
y1 = t_buckets - t_offsets * bucket_h
y2 = d_buckets - d_offsets * bucket_h
if clip_border and max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1] - 1)
y1 = y1.clamp(min=0, max=max_shape[0] - 1)
x2 = x2.clamp(min=0, max=max_shape[1] - 1)
y2 = y2.clamp(min=0, max=max_shape[0] - 1)
bboxes = torch.cat([x1[:, None], y1[:, None], x2[:, None], y2[:, None]],
dim=-1)
# bucketing guided rescoring
loc_confidence = score_topk[:, 0]
top2_neighbor_inds = (score_label[:, 0] - score_label[:, 1]).abs() == 1
loc_confidence += score_topk[:, 1] * top2_neighbor_inds.float()
loc_confidence = loc_confidence.view(-1, 4).mean(dim=1)
return bboxes, loc_confidence
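# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): bucket centres produced
# by `generat_buckets` for a single made-up proposal, with num_buckets=8
# (side_num=4 buckets per side) and scale_factor=1.0 (assumed to leave the
# proposal unchanged).
import torch

_demo_prop = torch.tensor([[0., 0., 80., 40.]])
_bw, _bh, _l, _r, _t, _d = generat_buckets(_demo_prop, 8, 1.0)
# _bw -> 10.0 and _bh -> 5.0 (bucket width/height)
# _l  -> [ 5., 15., 25., 35.]  left-side bucket centres, from the left edge in
# _r  -> [75., 65., 55., 45.]  right-side bucket centres, from the right edge in
# ---------------------------------------------------------------------------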
import numpy as np
import torch
from ..builder import BBOX_CODERS
from .base_bbox_coder import BaseBBoxCoder
@BBOX_CODERS.register_module()
class DeltaXYWHBBoxCoder(BaseBBoxCoder):
"""Delta XYWH BBox coder.
Following the practice in `R-CNN <https://arxiv.org/abs/1311.2524>`_,
this coder encodes bbox (x1, y1, x2, y2) into delta (dx, dy, dw, dh) and
decodes delta (dx, dy, dw, dh) back to original bbox (x1, y1, x2, y2).
Args:
target_means (Sequence[float]): Denormalizing means of target for
delta coordinates
target_stds (Sequence[float]): Denormalizing standard deviation of
target for delta coordinates
clip_border (bool, optional): Whether clip the objects outside the
border of the image. Defaults to True.
"""
def __init__(self,
target_means=(0., 0., 0., 0.),
target_stds=(1., 1., 1., 1.),
clip_border=True):
super(BaseBBoxCoder, self).__init__()
self.means = target_means
self.stds = target_stds
self.clip_border = clip_border
def encode(self, bboxes, gt_bboxes):
"""Get box regression transformation deltas that can be used to
transform the ``bboxes`` into the ``gt_bboxes``.
Args:
bboxes (torch.Tensor): Source boxes, e.g., object proposals.
gt_bboxes (torch.Tensor): Target of the transformation, e.g.,
ground-truth boxes.
Returns:
torch.Tensor: Box transformation deltas
"""
assert bboxes.size(0) == gt_bboxes.size(0)
assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds)
return encoded_bboxes
def decode(self,
bboxes,
pred_bboxes,
max_shape=None,
wh_ratio_clip=16 / 1000):
"""Apply transformation `pred_bboxes` to `boxes`.
Args:
boxes (torch.Tensor): Basic boxes.
pred_bboxes (torch.Tensor): Encoded boxes with shape
max_shape (tuple[int], optional): Maximum shape of boxes.
Defaults to None.
wh_ratio_clip (float, optional): The allowed ratio between
width and height.
Returns:
torch.Tensor: Decoded boxes.
"""
assert pred_bboxes.size(0) == bboxes.size(0)
decoded_bboxes = delta2bbox(bboxes, pred_bboxes, self.means, self.stds,
max_shape, wh_ratio_clip, self.clip_border)
return decoded_bboxes
def bbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)):
"""Compute deltas of proposals w.r.t. gt.
We usually compute the deltas of x, y, w, h of proposals w.r.t ground
truth bboxes to get regression target.
This is the inverse function of :func:`delta2bbox`.
Args:
proposals (Tensor): Boxes to be transformed, shape (N, ..., 4)
gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4)
means (Sequence[float]): Denormalizing means for delta coordinates
stds (Sequence[float]): Denormalizing standard deviation for delta
coordinates
Returns:
Tensor: deltas with shape (N, 4), where columns represent dx, dy,
dw, dh.
"""
assert proposals.size() == gt.size()
proposals = proposals.float()
gt = gt.float()
px = (proposals[..., 0] + proposals[..., 2]) * 0.5
py = (proposals[..., 1] + proposals[..., 3]) * 0.5
pw = proposals[..., 2] - proposals[..., 0]
ph = proposals[..., 3] - proposals[..., 1]
gx = (gt[..., 0] + gt[..., 2]) * 0.5
gy = (gt[..., 1] + gt[..., 3]) * 0.5
gw = gt[..., 2] - gt[..., 0]
gh = gt[..., 3] - gt[..., 1]
dx = (gx - px) / pw
dy = (gy - py) / ph
dw = torch.log(gw / pw)
dh = torch.log(gh / ph)
deltas = torch.stack([dx, dy, dw, dh], dim=-1)
means = deltas.new_tensor(means).unsqueeze(0)
stds = deltas.new_tensor(stds).unsqueeze(0)
deltas = deltas.sub_(means).div_(stds)
return deltas
def delta2bbox(rois,
deltas,
means=(0., 0., 0., 0.),
stds=(1., 1., 1., 1.),
max_shape=None,
wh_ratio_clip=16 / 1000,
clip_border=True):
"""Apply deltas to shift/scale base boxes.
Typically the rois are anchor or proposed bounding boxes and the deltas are
network outputs used to shift/scale those boxes.
This is the inverse function of :func:`bbox2delta`.
Args:
rois (Tensor): Boxes to be transformed. Has shape (N, 4)
deltas (Tensor): Encoded offsets with respect to each roi.
Has shape (N, 4 * num_classes). Note N = num_anchors * W * H when
rois is a grid of anchors. Offset encoding follows [1]_.
means (Sequence[float]): Denormalizing means for delta coordinates
stds (Sequence[float]): Denormalizing standard deviation for delta
coordinates
max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W)
wh_ratio_clip (float): Maximum aspect ratio for boxes.
clip_border (bool, optional): Whether clip the objects outside the
border of the image. Defaults to True.
Returns:
Tensor: Boxes with shape (N, 4), where columns represent
tl_x, tl_y, br_x, br_y.
References:
.. [1] https://arxiv.org/abs/1311.2524
Example:
>>> rois = torch.Tensor([[ 0., 0., 1., 1.],
>>> [ 0., 0., 1., 1.],
>>> [ 0., 0., 1., 1.],
>>> [ 5., 5., 5., 5.]])
>>> deltas = torch.Tensor([[ 0., 0., 0., 0.],
>>> [ 1., 1., 1., 1.],
>>> [ 0., 0., 2., -1.],
>>> [ 0.7, -1.9, -0.5, 0.3]])
>>> delta2bbox(rois, deltas, max_shape=(32, 32))
tensor([[0.0000, 0.0000, 1.0000, 1.0000],
[0.1409, 0.1409, 2.8591, 2.8591],
[0.0000, 0.3161, 4.1945, 0.6839],
[5.0000, 5.0000, 5.0000, 5.0000]])
"""
means = deltas.new_tensor(means).view(1, -1).repeat(1, deltas.size(1) // 4)
stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(1) // 4)
denorm_deltas = deltas * stds + means
dx = denorm_deltas[:, 0::4]
dy = denorm_deltas[:, 1::4]
dw = denorm_deltas[:, 2::4]
dh = denorm_deltas[:, 3::4]
max_ratio = np.abs(np.log(wh_ratio_clip))
dw = dw.clamp(min=-max_ratio, max=max_ratio)
dh = dh.clamp(min=-max_ratio, max=max_ratio)
# Compute center of each roi
px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
# Compute width/height of each roi
pw = (rois[:, 2] - rois[:, 0]).unsqueeze(1).expand_as(dw)
ph = (rois[:, 3] - rois[:, 1]).unsqueeze(1).expand_as(dh)
# Use exp(network energy) to enlarge/shrink each roi
gw = pw * dw.exp()
gh = ph * dh.exp()
# Use network energy to shift the center of each roi
gx = px + pw * dx
gy = py + ph * dy
# Convert center-xy/width/height to top-left, bottom-right
x1 = gx - gw * 0.5
y1 = gy - gh * 0.5
x2 = gx + gw * 0.5
y2 = gy + gh * 0.5
if clip_border and max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size())
return bboxes
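# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): `bbox2delta` and
# `delta2bbox` are inverses of each other; decoding the encoded deltas with
# default means/stds recovers the made-up gt box.
import torch

_demo_props = torch.tensor([[0., 0., 10., 10.]])
_demo_gts = torch.tensor([[2., 2., 12., 8.]])
_demo_deltas = bbox2delta(_demo_props, _demo_gts)
_recovered = delta2bbox(_demo_props, _demo_deltas)
# torch.allclose(_recovered, _demo_gts) -> True (up to floating point error)
# ---------------------------------------------------------------------------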
import numpy as np
import torch
from ..builder import BBOX_CODERS
from .base_bbox_coder import BaseBBoxCoder
@BBOX_CODERS.register_module()
class LegacyDeltaXYWHBBoxCoder(BaseBBoxCoder):
"""Legacy Delta XYWH BBox coder used in MMDet V1.x.
Following the practice in R-CNN [1]_, this coder encodes bbox (x1, y1, x2,
y2) into delta (dx, dy, dw, dh) and decodes delta (dx, dy, dw, dh)
back to original bbox (x1, y1, x2, y2).
Note:
The main difference between :class`LegacyDeltaXYWHBBoxCoder` and
:class:`DeltaXYWHBBoxCoder` is whether ``+ 1`` is used during width and
height calculation. We suggest using this coder only when testing with
MMDet V1.x models.
References:
.. [1] https://arxiv.org/abs/1311.2524
Args:
target_means (Sequence[float]): denormalizing means of target for
delta coordinates
target_stds (Sequence[float]): denormalizing standard deviation of
target for delta coordinates
"""
def __init__(self,
target_means=(0., 0., 0., 0.),
target_stds=(1., 1., 1., 1.)):
super(BaseBBoxCoder, self).__init__()
self.means = target_means
self.stds = target_stds
def encode(self, bboxes, gt_bboxes):
"""Get box regression transformation deltas that can be used to
transform the ``bboxes`` into the ``gt_bboxes``.
Args:
bboxes (torch.Tensor): source boxes, e.g., object proposals.
gt_bboxes (torch.Tensor): target of the transformation, e.g.,
ground-truth boxes.
Returns:
torch.Tensor: Box transformation deltas
"""
assert bboxes.size(0) == gt_bboxes.size(0)
assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
encoded_bboxes = legacy_bbox2delta(bboxes, gt_bboxes, self.means,
self.stds)
return encoded_bboxes
def decode(self,
bboxes,
pred_bboxes,
max_shape=None,
wh_ratio_clip=16 / 1000):
"""Apply transformation `pred_bboxes` to `boxes`.
Args:
boxes (torch.Tensor): Basic boxes.
pred_bboxes (torch.Tensor): Encoded boxes with shape
max_shape (tuple[int], optional): Maximum shape of boxes.
Defaults to None.
wh_ratio_clip (float, optional): The allowed ratio between
width and height.
Returns:
torch.Tensor: Decoded boxes.
"""
assert pred_bboxes.size(0) == bboxes.size(0)
decoded_bboxes = legacy_delta2bbox(bboxes, pred_bboxes, self.means,
self.stds, max_shape, wh_ratio_clip)
return decoded_bboxes
def legacy_bbox2delta(proposals,
gt,
means=(0., 0., 0., 0.),
stds=(1., 1., 1., 1.)):
"""Compute deltas of proposals w.r.t. gt in the MMDet V1.x manner.
We usually compute the deltas of x, y, w, h of proposals w.r.t ground
truth bboxes to get regression target.
This is the inverse function of `legacy_delta2bbox()`
Args:
proposals (Tensor): Boxes to be transformed, shape (N, ..., 4)
gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4)
means (Sequence[float]): Denormalizing means for delta coordinates
stds (Sequence[float]): Denormalizing standard deviation for delta
coordinates
Returns:
Tensor: deltas with shape (N, 4), where columns represent dx, dy,
dw, dh.
"""
assert proposals.size() == gt.size()
proposals = proposals.float()
gt = gt.float()
px = (proposals[..., 0] + proposals[..., 2]) * 0.5
py = (proposals[..., 1] + proposals[..., 3]) * 0.5
pw = proposals[..., 2] - proposals[..., 0] + 1.0
ph = proposals[..., 3] - proposals[..., 1] + 1.0
gx = (gt[..., 0] + gt[..., 2]) * 0.5
gy = (gt[..., 1] + gt[..., 3]) * 0.5
gw = gt[..., 2] - gt[..., 0] + 1.0
gh = gt[..., 3] - gt[..., 1] + 1.0
dx = (gx - px) / pw
dy = (gy - py) / ph
dw = torch.log(gw / pw)
dh = torch.log(gh / ph)
deltas = torch.stack([dx, dy, dw, dh], dim=-1)
means = deltas.new_tensor(means).unsqueeze(0)
stds = deltas.new_tensor(stds).unsqueeze(0)
deltas = deltas.sub_(means).div_(stds)
return deltas
def legacy_delta2bbox(rois,
deltas,
means=(0., 0., 0., 0.),
stds=(1., 1., 1., 1.),
max_shape=None,
wh_ratio_clip=16 / 1000):
"""Apply deltas to shift/scale base boxes in the MMDet V1.x manner.
Typically the rois are anchor or proposed bounding boxes and the deltas are
network outputs used to shift/scale those boxes.
This is the inverse function of `legacy_bbox2delta()`
Args:
rois (Tensor): Boxes to be transformed. Has shape (N, 4)
deltas (Tensor): Encoded offsets with respect to each roi.
Has shape (N, 4 * num_classes). Note N = num_anchors * W * H when
rois is a grid of anchors. Offset encoding follows [1]_.
means (Sequence[float]): Denormalizing means for delta coordinates
stds (Sequence[float]): Denormalizing standard deviation for delta
coordinates
max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W)
wh_ratio_clip (float): Maximum aspect ratio for boxes.
Returns:
Tensor: Boxes with shape (N, 4), where columns represent
tl_x, tl_y, br_x, br_y.
References:
.. [1] https://arxiv.org/abs/1311.2524
Example:
>>> rois = torch.Tensor([[ 0., 0., 1., 1.],
>>> [ 0., 0., 1., 1.],
>>> [ 0., 0., 1., 1.],
>>> [ 5., 5., 5., 5.]])
>>> deltas = torch.Tensor([[ 0., 0., 0., 0.],
>>> [ 1., 1., 1., 1.],
>>> [ 0., 0., 2., -1.],
>>> [ 0.7, -1.9, -0.5, 0.3]])
>>> legacy_delta2bbox(rois, deltas, max_shape=(32, 32))
tensor([[0.0000, 0.0000, 1.5000, 1.5000],
[0.0000, 0.0000, 5.2183, 5.2183],
[0.0000, 0.1321, 7.8891, 0.8679],
[5.3967, 2.4251, 6.0033, 3.7749]])
"""
means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
denorm_deltas = deltas * stds + means
dx = denorm_deltas[:, 0::4]
dy = denorm_deltas[:, 1::4]
dw = denorm_deltas[:, 2::4]
dh = denorm_deltas[:, 3::4]
max_ratio = np.abs(np.log(wh_ratio_clip))
dw = dw.clamp(min=-max_ratio, max=max_ratio)
dh = dh.clamp(min=-max_ratio, max=max_ratio)
# Compute center of each roi
px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
# Compute width/height of each roi
pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw)
ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh)
# Use exp(network energy) to enlarge/shrink each roi
gw = pw * dw.exp()
gh = ph * dh.exp()
# Use network energy to shift the center of each roi
gx = px + pw * dx
gy = py + ph * dy
# Convert center-xy/width/height to top-left, bottom-right
# The true legacy box coder should +- 0.5 here.
# However, the current implementation improves performance when testing
# models trained in MMDetection 1.x (~0.5 bbox AP, 0.2 mask AP).
x1 = gx - gw * 0.5
y1 = gy - gh * 0.5
x2 = gx + gw * 0.5
y2 = gy + gh * 0.5
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1] - 1)
y1 = y1.clamp(min=0, max=max_shape[0] - 1)
x2 = x2.clamp(min=0, max=max_shape[1] - 1)
y2 = y2.clamp(min=0, max=max_shape[0] - 1)
bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
return bboxes
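# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the `+ 1.0` in the
# legacy width/height computation changes the encoded deltas. With a made-up
# proposal [0, 0, 4, 4] and gt [0, 0, 9, 9]:
#   bbox2delta        -> dw = log(9 / 4)   (widths taken as x2 - x1)
#   legacy_bbox2delta -> dw = log(10 / 5)  (widths taken as x2 - x1 + 1)
import torch

_p = torch.tensor([[0., 0., 4., 4.]])
_g = torch.tensor([[0., 0., 9., 9.]])
_legacy_dw = legacy_bbox2delta(_p, _g)[0, 2]   # log(2) ~= 0.693
# ---------------------------------------------------------------------------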
from ..builder import BBOX_CODERS
from .base_bbox_coder import BaseBBoxCoder
@BBOX_CODERS.register_module()
class PseudoBBoxCoder(BaseBBoxCoder):
"""Pseudo bounding box coder."""
def __init__(self, **kwargs):
super(BaseBBoxCoder, self).__init__(**kwargs)
def encode(self, bboxes, gt_bboxes):
"""torch.Tensor: return the given ``bboxes``"""
return gt_bboxes
def decode(self, bboxes, pred_bboxes):
"""torch.Tensor: return the given ``pred_bboxes``"""
return pred_bboxes
import torch
from ..builder import BBOX_CODERS
from .base_bbox_coder import BaseBBoxCoder
@BBOX_CODERS.register_module()
class TBLRBBoxCoder(BaseBBoxCoder):
"""TBLR BBox coder.
Following the practice in `FSAF <https://arxiv.org/abs/1903.00621>`_,
this coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left,
right) and decodes them back to the original.
Args:
normalizer (list | float): Normalization factor to be
divided with when coding the coordinates. If it is a list, it should
have length of 4 indicating normalization factor in tblr dims.
Otherwise it is a unified float factor for all dims. Default: 4.0
clip_border (bool, optional): Whether clip the objects outside the
border of the image. Defaults to True.
"""
def __init__(self, normalizer=4.0, clip_border=True):
super(BaseBBoxCoder, self).__init__()
self.normalizer = normalizer
self.clip_border = clip_border
def encode(self, bboxes, gt_bboxes):
"""Get box regression transformation deltas that can be used to
transform the ``bboxes`` into the ``gt_bboxes`` in the (top, bottom,
left, right) order.
Args:
bboxes (torch.Tensor): source boxes, e.g., object proposals.
gt_bboxes (torch.Tensor): target of the transformation, e.g.,
ground truth boxes.
Returns:
torch.Tensor: Box transformation deltas
"""
assert bboxes.size(0) == gt_bboxes.size(0)
assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
encoded_bboxes = bboxes2tblr(
bboxes, gt_bboxes, normalizer=self.normalizer)
return encoded_bboxes
def decode(self, bboxes, pred_bboxes, max_shape=None):
"""Apply transformation `pred_bboxes` to `boxes`.
Args:
boxes (torch.Tensor): Basic boxes.
pred_bboxes (torch.Tensor): Encoded boxes with shape
max_shape (tuple[int], optional): Maximum shape of boxes.
Defaults to None.
Returns:
torch.Tensor: Decoded boxes.
"""
assert pred_bboxes.size(0) == bboxes.size(0)
decoded_bboxes = tblr2bboxes(
bboxes,
pred_bboxes,
normalizer=self.normalizer,
max_shape=max_shape,
clip_border=self.clip_border)
return decoded_bboxes
def bboxes2tblr(priors, gts, normalizer=4.0, normalize_by_wh=True):
"""Encode ground truth boxes to tblr coordinate.
It first converts the gt coordinates to the tblr format
(top, bottom, left, right), relative to prior box centers.
The tblr coordinate may be normalized by the side length of prior bboxes
if `normalize_by_wh` is specified as True, and it is then normalized by
the `normalizer` factor.
Args:
priors (Tensor): Prior boxes in point form
Shape: (num_proposals,4).
gts (Tensor): Coords of ground truth for each prior in point-form
Shape: (num_proposals, 4).
normalizer (Sequence[float] | float): normalization parameter of
encoded boxes. If it is a list, it has to have length = 4.
Default: 4.0
normalize_by_wh (bool): Whether to normalize tblr coordinate by the
side length (wh) of prior bboxes.
Return:
encoded boxes (Tensor), Shape: (num_proposals, 4)
"""
# dist b/t match center and prior's center
if not isinstance(normalizer, float):
normalizer = torch.tensor(normalizer, device=priors.device)
assert len(normalizer) == 4, 'Normalizer must have length = 4'
assert priors.size(0) == gts.size(0)
prior_centers = (priors[:, 0:2] + priors[:, 2:4]) / 2
xmin, ymin, xmax, ymax = gts.split(1, dim=1)
top = prior_centers[:, 1].unsqueeze(1) - ymin
bottom = ymax - prior_centers[:, 1].unsqueeze(1)
left = prior_centers[:, 0].unsqueeze(1) - xmin
right = xmax - prior_centers[:, 0].unsqueeze(1)
loc = torch.cat((top, bottom, left, right), dim=1)
if normalize_by_wh:
# Normalize tblr by anchor width and height
wh = priors[:, 2:4] - priors[:, 0:2]
w, h = torch.split(wh, 1, dim=1)
loc[:, :2] /= h # tb is normalized by h
loc[:, 2:] /= w # lr is normalized by w
# Normalize tblr by the given normalization factor
return loc / normalizer
def tblr2bboxes(priors,
tblr,
normalizer=4.0,
normalize_by_wh=True,
max_shape=None,
clip_border=True):
"""Decode tblr outputs to prediction boxes.
The process includes 3 steps: 1) De-normalize tblr coordinates by
multiplying it with `normalizer`; 2) De-normalize tblr coordinates by the
prior bbox width and height if `normalize_by_wh` is `True`; 3) Convert
tblr (top, bottom, left, right) pair relative to the center of priors back
to (xmin, ymin, xmax, ymax) coordinate.
Args:
priors (Tensor): Prior boxes in point form (x0, y0, x1, y1)
Shape: (n,4).
tblr (Tensor): Coords of network output in tblr form
Shape: (n, 4).
normalizer (Sequence[float] | float): Normalization parameter of
encoded boxes. By list, it represents the normalization factors at
tblr dims. By float, it is the unified normalization factor at all
dims. Default: 4.0
normalize_by_wh (bool): Whether the tblr coordinates have been
normalized by the side length (wh) of prior bboxes.
max_shape (tuple, optional): Shape of the image. Decoded bboxes
exceeding which will be clamped.
clip_border (bool, optional): Whether clip the objects outside the
border of the image. Defaults to True.
Return:
decoded boxes (Tensor), Shape: (n, 4)
"""
if not isinstance(normalizer, float):
normalizer = torch.tensor(normalizer, device=priors.device)
assert len(normalizer) == 4, 'Normalizer must have length = 4'
assert priors.size(0) == tblr.size(0)
loc_decode = tblr * normalizer
prior_centers = (priors[:, 0:2] + priors[:, 2:4]) / 2
if normalize_by_wh:
wh = priors[:, 2:4] - priors[:, 0:2]
w, h = torch.split(wh, 1, dim=1)
loc_decode[:, :2] *= h # tb
loc_decode[:, 2:] *= w # lr
top, bottom, left, right = loc_decode.split((1, 1, 1, 1), dim=1)
xmin = prior_centers[:, 0].unsqueeze(1) - left
xmax = prior_centers[:, 0].unsqueeze(1) + right
ymin = prior_centers[:, 1].unsqueeze(1) - top
ymax = prior_centers[:, 1].unsqueeze(1) + bottom
boxes = torch.cat((xmin, ymin, xmax, ymax), dim=1)
if clip_border and max_shape is not None:
boxes[:, 0].clamp_(min=0, max=max_shape[1])
boxes[:, 1].clamp_(min=0, max=max_shape[0])
boxes[:, 2].clamp_(min=0, max=max_shape[1])
boxes[:, 3].clamp_(min=0, max=max_shape[0])
return boxes
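# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): `bboxes2tblr` and
# `tblr2bboxes` round-trip a made-up prior/gt pair with the default
# normalizer.
import torch

_demo_priors = torch.tensor([[0., 0., 8., 8.]])
_demo_gts = torch.tensor([[1., 2., 7., 6.]])
_demo_loc = bboxes2tblr(_demo_priors, _demo_gts, normalizer=4.0)
_demo_back = tblr2bboxes(_demo_priors, _demo_loc, normalizer=4.0)
# torch.allclose(_demo_back, _demo_gts) -> True
# ---------------------------------------------------------------------------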
import torch
from ..builder import BBOX_CODERS
from .base_bbox_coder import BaseBBoxCoder
@BBOX_CODERS.register_module()
class YOLOBBoxCoder(BaseBBoxCoder):
"""YOLO BBox coder.
Following `YOLO <https://arxiv.org/abs/1506.02640>`_, this coder divides
the image into grids, and encodes bbox (x1, y1, x2, y2) into (cx, cy, dw, dh).
cx, cy in [0., 1.] denote the relative center position w.r.t. the center of
the bboxes. dw, dh are the same as :obj:`DeltaXYWHBBoxCoder`.
Args:
eps (float): Min value of cx, cy when encoding.
"""
def __init__(self, eps=1e-6):
super(BaseBBoxCoder, self).__init__()
self.eps = eps
def encode(self, bboxes, gt_bboxes, stride):
"""Get box regression transformation deltas that can be used to
transform the ``bboxes`` into the ``gt_bboxes``.
Args:
bboxes (torch.Tensor): Source boxes, e.g., anchors.
gt_bboxes (torch.Tensor): Target of the transformation, e.g.,
ground-truth boxes.
stride (torch.Tensor | int): Stride of bboxes.
Returns:
torch.Tensor: Box transformation deltas
"""
assert bboxes.size(0) == gt_bboxes.size(0)
assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
x_center_gt = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) * 0.5
y_center_gt = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) * 0.5
w_gt = gt_bboxes[..., 2] - gt_bboxes[..., 0]
h_gt = gt_bboxes[..., 3] - gt_bboxes[..., 1]
x_center = (bboxes[..., 0] + bboxes[..., 2]) * 0.5
y_center = (bboxes[..., 1] + bboxes[..., 3]) * 0.5
w = bboxes[..., 2] - bboxes[..., 0]
h = bboxes[..., 3] - bboxes[..., 1]
w_target = torch.log((w_gt / w).clamp(min=self.eps))
h_target = torch.log((h_gt / h).clamp(min=self.eps))
x_center_target = ((x_center_gt - x_center) / stride + 0.5).clamp(
self.eps, 1 - self.eps)
y_center_target = ((y_center_gt - y_center) / stride + 0.5).clamp(
self.eps, 1 - self.eps)
encoded_bboxes = torch.stack(
[x_center_target, y_center_target, w_target, h_target], dim=-1)
return encoded_bboxes
def decode(self, bboxes, pred_bboxes, stride):
"""Apply transformation `pred_bboxes` to `boxes`.
Args:
boxes (torch.Tensor): Basic boxes, e.g. anchors.
pred_bboxes (torch.Tensor): Encoded boxes with shape
stride (torch.Tensor | int): Strides of bboxes.
Returns:
torch.Tensor: Decoded boxes.
"""
assert pred_bboxes.size(0) == bboxes.size(0)
assert pred_bboxes.size(-1) == bboxes.size(-1) == 4
x_center = (bboxes[..., 0] + bboxes[..., 2]) * 0.5
y_center = (bboxes[..., 1] + bboxes[..., 3]) * 0.5
w = bboxes[..., 2] - bboxes[..., 0]
h = bboxes[..., 3] - bboxes[..., 1]
# Get outputs x, y
x_center_pred = (pred_bboxes[..., 0] - 0.5) * stride + x_center
y_center_pred = (pred_bboxes[..., 1] - 0.5) * stride + y_center
w_pred = torch.exp(pred_bboxes[..., 2]) * w
h_pred = torch.exp(pred_bboxes[..., 3]) * h
decoded_bboxes = torch.stack(
(x_center_pred - w_pred / 2, y_center_pred - h_pred / 2,
x_center_pred + w_pred / 2, y_center_pred + h_pred / 2),
dim=-1)
return decoded_bboxes
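# Illustrative round-trip sketch (not part of the original module): encode a
# ground-truth box against an anchor with the YOLO coder, then decode it back.
# The anchor, box and stride values below are made up for demonstration only.
if __name__ == '__main__':
    coder = YOLOBBoxCoder()
    anchors = torch.tensor([[0., 0., 32., 32.]])
    gt = torch.tensor([[4., 4., 28., 36.]])
    stride = 32
    deltas = coder.encode(anchors, gt, stride)
    restored = coder.decode(anchors, deltas, stride)
    print(deltas)    # (cx, cy, dw, dh) targets: [0.5000, 0.6250, -0.2877, 0.0000]
    print(restored)  # matches gt up to the eps clamp: [4., 4., 28., 36.]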
import numpy as np
import torch
def ensure_rng(rng=None):
"""Simple version of the ``kwarray.ensure_rng``
Args:
rng (int | numpy.random.RandomState | None):
if None, then defaults to the global rng. Otherwise this can be an
integer seed or an existing ``numpy.random.RandomState`` instance.
Returns:
(numpy.random.RandomState) : rng -
a numpy random number generator
References:
https://gitlab.kitware.com/computer-vision/kwarray/blob/master/kwarray/util_random.py#L270
"""
if rng is None:
rng = np.random.mtrand._rand
elif isinstance(rng, int):
rng = np.random.RandomState(rng)
else:
rng = rng
return rng
def random_boxes(num=1, scale=1, rng=None):
"""Simple version of ``kwimage.Boxes.random``
Returns:
Tensor: shape (n, 4) in x1, y1, x2, y2 format.
References:
https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390
Example:
>>> num = 3
>>> scale = 512
>>> rng = 0
>>> boxes = random_boxes(num, scale, rng)
>>> print(boxes)
tensor([[280.9925, 278.9802, 308.6148, 366.1769],
[216.9113, 330.6978, 224.0446, 456.5878],
[405.3632, 196.3221, 493.3953, 270.7942]])
"""
rng = ensure_rng(rng)
tlbr = rng.rand(num, 4).astype(np.float32)
tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2])
tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3])
br_x = np.maximum(tlbr[:, 0], tlbr[:, 2])
br_y = np.maximum(tlbr[:, 1], tlbr[:, 3])
tlbr[:, 0] = tl_x * scale
tlbr[:, 1] = tl_y * scale
tlbr[:, 2] = br_x * scale
tlbr[:, 3] = br_y * scale
boxes = torch.from_numpy(tlbr)
return boxes
from .builder import build_iou_calculator
from .iou2d_calculator import BboxOverlaps2D, bbox_overlaps
__all__ = ['build_iou_calculator', 'BboxOverlaps2D', 'bbox_overlaps']
from mmcv.utils import Registry, build_from_cfg
IOU_CALCULATORS = Registry('IoU calculator')
def build_iou_calculator(cfg, default_args=None):
"""Builder of IoU calculator."""
return build_from_cfg(cfg, IOU_CALCULATORS, default_args)
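# Illustrative builder sketch (not part of the original module). It assumes
# BboxOverlaps2D has already been registered, which happens when the package
# __init__ above imports .iou2d_calculator. Box values are made up for
# demonstration only.
if __name__ == '__main__':
    import torch
    from mmdet.core.bbox.iou_calculators import build_iou_calculator
    iou_calculator = build_iou_calculator(dict(type='BboxOverlaps2D'))
    boxes_a = torch.tensor([[0., 0., 10., 10.]])
    boxes_b = torch.tensor([[5., 5., 15., 15.]])
    print(iou_calculator(boxes_a, boxes_b))  # IoU of the single pair, ~0.1429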
import torch
from .builder import IOU_CALCULATORS
@IOU_CALCULATORS.register_module()
class BboxOverlaps2D(object):
"""2D Overlaps (e.g. IoUs, GIoUs) Calculator."""
def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
"""Calculate IoU between 2D bboxes.
Args:
bboxes1 (Tensor): bboxes have shape (m, 4) in <x1, y1, x2, y2>
format, or shape (m, 5) in <x1, y1, x2, y2, score> format.
bboxes2 (Tensor): bboxes have shape (n, 4) in <x1, y1, x2, y2>
format, shape (n, 5) in <x1, y1, x2, y2, score> format, or be
empty. If ``is_aligned`` is ``True``, then m and n must be
equal.
mode (str): "iou" (intersection over union), "iof" (intersection
over foreground), or "giou" (generalized intersection over
union).
is_aligned (bool, optional): If True, then m and n must be equal.
Default False.
Returns:
Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
"""
assert bboxes1.size(-1) in [0, 4, 5]
assert bboxes2.size(-1) in [0, 4, 5]
if bboxes2.size(-1) == 5:
bboxes2 = bboxes2[..., :4]
if bboxes1.size(-1) == 5:
bboxes1 = bboxes1[..., :4]
return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
def __repr__(self):
"""str: a string describing the module"""
repr_str = self.__class__.__name__ + '()'
return repr_str
def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
"""Calculate overlap between two set of bboxes.
If ``is_aligned`` is ``False``, then calculate the overlaps between each
bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
pair of bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
B indicates the batch dim, in shape (B1, B2, ..., Bn).
If ``is_aligned`` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union), "iof" (intersection over
foreground) or "giou" (generalized intersection over union).
Default "iou".
is_aligned (bool, optional): If True, then m and n must be equal.
Default False.
eps (float, optional): A value added to the denominator for numerical
stability. Default 1e-6.
Returns:
Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
Example:
>>> bboxes1 = torch.FloatTensor([
>>> [0, 0, 10, 10],
>>> [10, 10, 20, 20],
>>> [32, 32, 38, 42],
>>> ])
>>> bboxes2 = torch.FloatTensor([
>>> [0, 0, 10, 20],
>>> [0, 10, 10, 19],
>>> [10, 10, 20, 20],
>>> ])
>>> overlaps = bbox_overlaps(bboxes1, bboxes2)
>>> assert overlaps.shape == (3, 3)
>>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
>>> assert overlaps.shape == (3, )
Example:
>>> empty = torch.empty(0, 4)
>>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
>>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
>>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
>>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
"""
assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}'
# Either the boxes are empty or the length of the boxes' last dimension is 4
assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
# Batch dim must be the same
# Batch dim: (B1, B2, ... Bn)
assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
batch_shape = bboxes1.shape[:-2]
rows = bboxes1.size(-2)
cols = bboxes2.size(-2)
if is_aligned:
assert rows == cols
if rows * cols == 0:
if is_aligned:
return bboxes1.new(batch_shape + (rows, ))
else:
return bboxes1.new(batch_shape + (rows, cols))
area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
bboxes1[..., 3] - bboxes1[..., 1])
area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
bboxes2[..., 3] - bboxes2[..., 1])
if is_aligned:
lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2]
rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2]
wh = (rb - lt).clamp(min=0) # [B, rows, 2]
overlap = wh[..., 0] * wh[..., 1]
if mode in ['iou', 'giou']:
union = area1 + area2 - overlap
else:
union = area1
if mode == 'giou':
enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
else:
lt = torch.max(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2]) # [B, rows, cols, 2]
rb = torch.min(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:]) # [B, rows, cols, 2]
wh = (rb - lt).clamp(min=0) # [B, rows, cols, 2]
overlap = wh[..., 0] * wh[..., 1]
if mode in ['iou', 'giou']:
union = area1[..., None] + area2[..., None, :] - overlap
else:
union = area1[..., None]
if mode == 'giou':
enclosed_lt = torch.min(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2])
enclosed_rb = torch.max(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:])
eps = union.new_tensor([eps])
union = torch.max(union, eps)
ious = overlap / union
if mode in ['iou', 'iof']:
return ious
# calculate gious
enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0)
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
enclose_area = torch.max(enclose_area, eps)
gious = ious - (enclose_area - union) / enclose_area
return gious
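# Illustrative GIoU sketch (not part of the original module): one aligned pair
# worked by hand. For boxes [0, 0, 2, 2] and [1, 1, 3, 3]:
#   overlap = 1, union = 4 + 4 - 1 = 7, iou = 1/7
#   enclosing box = [0, 0, 3, 3] with area 9
#   giou = 1/7 - (9 - 7)/9 ≈ -0.0794
if __name__ == '__main__':
    b1 = torch.tensor([[0., 0., 2., 2.]])
    b2 = torch.tensor([[1., 1., 3., 3.]])
    print(bbox_overlaps(b1, b2, mode='giou', is_aligned=True))  # tensor([-0.0794])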
from .builder import build_match_cost
from .match_cost import BBoxL1Cost, ClassificationCost, FocalLossCost, IoUCost
__all__ = [
'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost',
'FocalLossCost'
]
from mmcv.utils import Registry, build_from_cfg
MATCH_COST = Registry('Match Cost')
def build_match_cost(cfg, default_args=None):
"""Builder of IoU calculator."""
return build_from_cfg(cfg, MATCH_COST, default_args)
import torch
from mmdet.core.bbox.iou_calculators import bbox_overlaps
from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh
from .builder import MATCH_COST
@MATCH_COST.register_module()
class BBoxL1Cost(object):
"""BBoxL1Cost.
Args:
weight (int | float, optional): loss_weight
box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN
Examples:
>>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost
>>> import torch
>>> self = BBoxL1Cost()
>>> bbox_pred = torch.rand(1, 4)
>>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
>>> factor = torch.tensor([10, 8, 10, 8])
>>> self(bbox_pred, gt_bboxes, factor)
tensor([[1.6172, 1.6422]])
"""
def __init__(self, weight=1., box_format='xyxy'):
self.weight = weight
assert box_format in ['xyxy', 'xywh']
self.box_format = box_format
def __call__(self, bbox_pred, gt_bboxes):
"""
Args:
bbox_pred (Tensor): Predicted boxes with normalized coordinates
(cx, cy, w, h), which are all in range [0, 1]. Shape
[num_query, 4].
gt_bboxes (Tensor): Ground truth boxes with normalized
coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
Returns:
torch.Tensor: bbox_cost value with weight
"""
if self.box_format == 'xywh':
gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes)
elif self.box_format == 'xyxy':
bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
return bbox_cost * self.weight
@MATCH_COST.register_module()
class FocalLossCost(object):
"""FocalLossCost.
Args:
weight (int | float, optional): loss_weight
alpha (int | float, optional): focal_loss alpha
gamma (int | float, optional): focal_loss gamma
eps (float, optional): default 1e-12
Examples:
>>> from mmdet.core.bbox.match_costs.match_cost import FocalLossCost
>>> import torch
>>> self = FocalLossCost()
>>> cls_pred = torch.rand(4, 3)
>>> gt_labels = torch.tensor([0, 1, 2])
>>> factor = torch.tensor([10, 8, 10, 8])
>>> self(cls_pred, gt_labels)
tensor([[-0.3236, -0.3364, -0.2699],
[-0.3439, -0.3209, -0.4807],
[-0.4099, -0.3795, -0.2929],
[-0.1950, -0.1207, -0.2626]])
"""
def __init__(self, weight=1., alpha=0.25, gamma=2, eps=1e-12):
self.weight = weight
self.alpha = alpha
self.gamma = gamma
self.eps = eps
def __call__(self, cls_pred, gt_labels):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
torch.Tensor: cls_cost value with weight
"""
cls_pred = cls_pred.sigmoid()
neg_cost = -(1 - cls_pred + self.eps).log() * (
1 - self.alpha) * cls_pred.pow(self.gamma)
pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
1 - cls_pred).pow(self.gamma)
cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
return cls_cost * self.weight
@MATCH_COST.register_module()
class ClassificationCost(object):
"""ClsSoftmaxCost.
Args:
weight (int | float, optional): loss_weight
Examples:
>>> from mmdet.core.bbox.match_costs.match_cost import \
... ClassificationCost
>>> import torch
>>> self = ClassificationCost()
>>> cls_pred = torch.rand(4, 3)
>>> gt_labels = torch.tensor([0, 1, 2])
>>> factor = torch.tensor([10, 8, 10, 8])
>>> self(cls_pred, gt_labels)
tensor([[-0.3430, -0.3525, -0.3045],
[-0.3077, -0.2931, -0.3992],
[-0.3664, -0.3455, -0.2881],
[-0.3343, -0.2701, -0.3956]])
"""
def __init__(self, weight=1.):
self.weight = weight
def __call__(self, cls_pred, gt_labels):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
torch.Tensor: cls_cost value with weight
"""
# Following the official DETR repo, instead of the NLL used in the
# loss, the cost is approximated by 1 - cls_score[gt_label].
# The constant 1 does not change the matching,
# so it is omitted.
cls_score = cls_pred.softmax(-1)
cls_cost = -cls_score[:, gt_labels]
return cls_cost * self.weight
@MATCH_COST.register_module()
class IoUCost(object):
"""IoUCost.
Args:
iou_mode (str, optional): iou mode such as 'iou' | 'giou'
weight (int | float, optional): loss weight
Examples:
>>> from mmdet.core.bbox.match_costs.match_cost import IoUCost
>>> import torch
>>> self = IoUCost()
>>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]])
>>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
>>> self(bboxes, gt_bboxes)
tensor([[-0.1250, 0.1667],
[ 0.1667, -0.5000]])
"""
def __init__(self, iou_mode='giou', weight=1.):
self.weight = weight
self.iou_mode = iou_mode
def __call__(self, bboxes, gt_bboxes):
"""
Args:
bboxes (Tensor): Predicted boxes with unnormalized coordinates
(x1, y1, x2, y2). Shape [num_query, 4].
gt_bboxes (Tensor): Ground truth boxes with unnormalized
coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
Returns:
torch.Tensor: iou_cost value with weight
"""
# overlaps: [num_bboxes, num_gt]
overlaps = bbox_overlaps(
bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False)
# The constant 1 does not change the matching, so it is omitted.
iou_cost = -overlaps
return iou_cost * self.weight
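# Illustrative matching sketch (not part of the original module): sum the three
# cost terms defined above and run a Hungarian match with SciPy's
# linear_sum_assignment. Shapes and box values are made up for demonstration,
# and the boxes are kept in normalized [0, 1] coordinates for simplicity.
if __name__ == '__main__':
    from scipy.optimize import linear_sum_assignment
    num_query, num_class = 5, 3
    cls_pred = torch.rand(num_query, num_class)   # classification logits
    bbox_pred = torch.rand(num_query, 4)          # normalized (cx, cy, w, h)
    gt_bboxes = torch.tensor([[0.1, 0.1, 0.4, 0.5],
                              [0.5, 0.5, 0.9, 0.8]])  # normalized (x1, y1, x2, y2)
    gt_labels = torch.tensor([0, 2])
    cost = (ClassificationCost(weight=1.)(cls_pred, gt_labels)
            + BBoxL1Cost(weight=1.)(bbox_pred, gt_bboxes)
            + IoUCost(iou_mode='giou', weight=1.)(
                bbox_cxcywh_to_xyxy(bbox_pred), gt_bboxes))
    row_ind, col_ind = linear_sum_assignment(cost.numpy())
    print(row_ind, col_ind)  # one matched prediction index per ground-truth box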