Commit 3144257c authored by mashun1's avatar mashun1
Browse files

catvton

parents
# Copyright (c) Facebook, Inc. and its affiliates.
import math
from typing import List, Tuple, Union
import torch
from fvcore.nn import giou_loss, smooth_l1_loss
from torch.nn import functional as F
from detectron2.layers import cat, ciou_loss, diou_loss
from detectron2.structures import Boxes
# Value for clamping large dw and dh predictions. The heuristic is that we clamp
# such that dw and dh are no larger than what would transform a 16px box into a
# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
# Names exported by a star-import of this module.
__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated", "Box2BoxTransformLinear"]
@torch.jit.script
class Box2BoxTransform:
    """
    Box-to-box transform from R-CNN, parameterized by 4 deltas (dx, dy, dw, dh).

    Applying the deltas multiplies a box's width and height by exp(dw) and exp(dh)
    and moves its center by (dx * width, dy * height).
    """

    def __init__(
        self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
    ):
        """
        Args:
            weights (4-element tuple): multipliers applied to the (dx, dy, dw, dh)
                deltas when encoding (and divided out when decoding). Treated as
                hyperparameters.
            scale_clamp (float): upper bound applied to the decoded dw and dh before
                they are passed to exp().
        """
        self.weights = weights
        self.scale_clamp = scale_clamp

    def get_deltas(self, src_boxes, target_boxes):
        """
        Encode the deltas (dx, dy, dw, dh) that map `src_boxes` onto `target_boxes`,
        so that ``self.apply_deltas(deltas, src_boxes)`` recovers `target_boxes`
        (unless a delta is large enough to be clamped on decode).

        Args:
            src_boxes (Tensor): source boxes (x1, y1, x2, y2), e.g. object proposals.
            target_boxes (Tensor): boxes to regress towards, e.g. ground truth.

        Returns:
            Tensor: deltas stacked along dim 1 as (dx, dy, dw, dh).
        """
        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)

        # Convert both box sets from corner form to (center, size) form.
        src_w = src_boxes[:, 2] - src_boxes[:, 0]
        src_h = src_boxes[:, 3] - src_boxes[:, 1]
        src_cx = src_boxes[:, 0] + 0.5 * src_w
        src_cy = src_boxes[:, 1] + 0.5 * src_h

        tgt_w = target_boxes[:, 2] - target_boxes[:, 0]
        tgt_h = target_boxes[:, 3] - target_boxes[:, 1]
        tgt_cx = target_boxes[:, 0] + 0.5 * tgt_w
        tgt_cy = target_boxes[:, 1] + 0.5 * tgt_h

        wx, wy, ww, wh = self.weights
        dx = wx * (tgt_cx - src_cx) / src_w
        dy = wy * (tgt_cy - src_cy) / src_h
        dw = ww * torch.log(tgt_w / src_w)
        dh = wh * torch.log(tgt_h / src_h)

        encoded = torch.stack((dx, dy, dw, dh), dim=1)
        # Only the source widths are validated here (after the deltas are computed).
        assert (src_w > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
        return encoded

    def apply_deltas(self, deltas, boxes):
        """
        Decode `deltas` (dx, dy, dw, dh) against `boxes`.

        Args:
            deltas (Tensor): shape (N, k*4) with k >= 1; row i holds k (possibly
                class-specific) transformations for boxes[i].
            boxes (Tensor): boxes to transform, shape (N, 4).

        Returns:
            Tensor: decoded boxes (x1, y1, x2, y2), same shape as `deltas`.
        """
        deltas = deltas.float()  # decode in fp32 for precision
        boxes = boxes.to(deltas.dtype)

        # Per-box size/center as column vectors so they broadcast over the k groups.
        box_w = (boxes[:, 2] - boxes[:, 0]).unsqueeze(1)
        box_h = (boxes[:, 3] - boxes[:, 1]).unsqueeze(1)
        box_cx = boxes[:, 0].unsqueeze(1) + 0.5 * box_w
        box_cy = boxes[:, 1].unsqueeze(1) + 0.5 * box_h

        wx, wy, ww, wh = self.weights
        dx = deltas[:, 0::4] / wx
        dy = deltas[:, 1::4] / wy
        # Clamp the log-scale terms so exp() cannot blow up.
        dw = torch.clamp(deltas[:, 2::4] / ww, max=self.scale_clamp)
        dh = torch.clamp(deltas[:, 3::4] / wh, max=self.scale_clamp)

        new_cx = dx * box_w + box_cx
        new_cy = dy * box_h + box_cy
        half_w = 0.5 * (torch.exp(dw) * box_w)
        half_h = 0.5 * (torch.exp(dh) * box_h)

        corners = torch.stack(
            (new_cx - half_w, new_cy - half_h, new_cx + half_w, new_cy + half_h), dim=-1
        )
        return corners.reshape(deltas.shape)
@torch.jit.script
class Box2BoxTransformRotated:
    """
    Box-to-box transform from Rotated R-CNN, parameterized by 5 deltas
    (dx, dy, dw, dh, da).

    Applying the deltas multiplies width/height by exp(dw)/exp(dh), moves the
    center by (dx * width, dy * height) and rotates the box by da.

    Note: delta angles are in radians, box angles are in degrees.
    """

    def __init__(
        self,
        weights: Tuple[float, float, float, float, float],
        scale_clamp: float = _DEFAULT_SCALE_CLAMP,
    ):
        """
        Args:
            weights (5-element tuple): multipliers applied to the
                (dx, dy, dw, dh, da) deltas. Treated as hyperparameters.
            scale_clamp (float): upper bound applied to the decoded dw and dh before
                they are passed to exp().
        """
        self.weights = weights
        self.scale_clamp = scale_clamp

    def get_deltas(self, src_boxes, target_boxes):
        """
        Encode the deltas (dx, dy, dw, dh, da) that map `src_boxes` onto
        `target_boxes`, so that ``self.apply_deltas(deltas, src_boxes)`` recovers
        `target_boxes` (unless a delta is large enough to be clamped on decode).

        Args:
            src_boxes (Tensor): Nx5 source boxes, e.g. object proposals.
            target_boxes (Tensor): Nx5 boxes to regress towards, e.g. ground truth.

        Returns:
            Tensor: deltas stacked along dim 1 as (dx, dy, dw, dh, da).
        """
        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)

        src_cx, src_cy, src_w, src_h, src_a = torch.unbind(src_boxes, dim=1)
        tgt_cx, tgt_cy, tgt_w, tgt_h, tgt_a = torch.unbind(target_boxes, dim=1)

        wx, wy, ww, wh, wa = self.weights
        dx = wx * (tgt_cx - src_cx) / src_w
        dy = wy * (tgt_cy - src_cy) / src_h
        dw = ww * torch.log(tgt_w / src_w)
        dh = wh * torch.log(tgt_h / src_h)

        # Wrap the angle difference (degrees) into [-180, 180), then express it in
        # radians; the conversion also serves to normalize the magnitude.
        da = (tgt_a - src_a + 180.0) % 360.0 - 180.0
        da = da * (wa * math.pi / 180.0)

        encoded = torch.stack((dx, dy, dw, dh, da), dim=1)
        # Only the source widths are validated here (after the deltas are computed).
        assert (
            (src_w > 0).all().item()
        ), "Input boxes to Box2BoxTransformRotated are not valid!"
        return encoded

    def apply_deltas(self, deltas, boxes):
        """
        Decode `deltas` (dx, dy, dw, dh, da) against `boxes`.

        Args:
            deltas (Tensor): shape (N, k*5); row i holds the transformation(s)
                for boxes[i].
            boxes (Tensor): boxes to transform, shape (N, 5).

        Returns:
            Tensor: decoded boxes, same shape as `deltas`, laid out as
            (x_ctr, y_ctr, width, height, angle) per group of 5.
        """
        assert deltas.shape[1] % 5 == 0 and boxes.shape[1] == 5

        # Trailing singleton dim lets each box field broadcast over the k groups.
        boxes = boxes.to(deltas.dtype).unsqueeze(2)
        box_cx = boxes[:, 0]
        box_cy = boxes[:, 1]
        box_w = boxes[:, 2]
        box_h = boxes[:, 3]
        box_a = boxes[:, 4]

        wx, wy, ww, wh, wa = self.weights
        dx = deltas[:, 0::5] / wx
        dy = deltas[:, 1::5] / wy
        # Clamp the log-scale terms so exp() cannot blow up.
        dw = torch.clamp(deltas[:, 2::5] / ww, max=self.scale_clamp)
        dh = torch.clamp(deltas[:, 3::5] / wh, max=self.scale_clamp)
        da = deltas[:, 4::5] / wa

        decoded = torch.zeros_like(deltas)
        decoded[:, 0::5] = dx * box_w + box_cx  # x_ctr
        decoded[:, 1::5] = dy * box_h + box_cy  # y_ctr
        decoded[:, 2::5] = torch.exp(dw) * box_w  # width
        decoded[:, 3::5] = torch.exp(dh) * box_h  # height

        # Following the original RRPN implementation, delta angles are radians
        # while box angles are degrees; wrap the result into [-180, 180).
        angle_deg = da * 180.0 / math.pi + box_a
        decoded[:, 4::5] = (angle_deg + 180.0) % 360.0 - 180.0
        return decoded
class Box2BoxTransformLinear:
    """
    The linear box-to-box transform defined in FCOS. The transformation is parameterized
    by the distance from the center of (square) src box to 4 edges of the target box.
    """

    def __init__(self, normalize_by_size=True):
        """
        Args:
            normalize_by_size: normalize deltas by the size of src (anchor) boxes.
        """
        self.normalize_by_size = normalize_by_size

    def get_deltas(self, src_boxes, target_boxes):
        """
        Get box regression transformation deltas (dx1, dy1, dx2, dy2) that can be used
        to transform the `src_boxes` into the `target_boxes`. That is, the relation
        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true.
        The center of src must be inside target boxes (otherwise a delta is negative
        and is zeroed by the ReLU in :meth:`apply_deltas`).

        Args:
            src_boxes (Tensor): square source boxes, e.g., anchors, shape (N, 4)
            target_boxes (Tensor): target of the transformation, e.g., ground-truth
                boxes, shape (N, 4)

        Returns:
            Tensor: deltas of shape (N, 4), ordered (left, top, right, bottom).
        """
        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)

        src_ctr_x = 0.5 * (src_boxes[:, 0] + src_boxes[:, 2])
        src_ctr_y = 0.5 * (src_boxes[:, 1] + src_boxes[:, 3])

        # Distances from the source center to each edge of the target box.
        target_l = src_ctr_x - target_boxes[:, 0]
        target_t = src_ctr_y - target_boxes[:, 1]
        target_r = target_boxes[:, 2] - src_ctr_x
        target_b = target_boxes[:, 3] - src_ctr_y

        deltas = torch.stack((target_l, target_t, target_r, target_b), dim=1)
        if self.normalize_by_size:
            stride_w = src_boxes[:, 2] - src_boxes[:, 0]
            stride_h = src_boxes[:, 3] - src_boxes[:, 1]
            strides = torch.stack([stride_w, stride_h, stride_w, stride_h], dim=1)
            deltas = deltas / strides
        return deltas

    def apply_deltas(self, deltas, boxes):
        """
        Apply transformation `deltas` (dx1, dy1, dx2, dy2) to `boxes`.

        Args:
            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
                deltas[i] represents k potentially different class-specific
                box transformations for the single box boxes[i].
            boxes (Tensor): boxes to transform, of shape (N, 4)

        Returns:
            Tensor: decoded boxes (x1, y1, x2, y2) with the same shape as `deltas`.
        """
        # Ensure the output is a valid box. See Sec 2.1 of https://arxiv.org/abs/2006.09214
        deltas = F.relu(deltas)
        boxes = boxes.to(deltas.dtype)

        ctr_x = 0.5 * (boxes[:, 0] + boxes[:, 2])
        ctr_y = 0.5 * (boxes[:, 1] + boxes[:, 3])
        if self.normalize_by_size:
            stride_w = boxes[:, 2] - boxes[:, 0]
            stride_h = boxes[:, 3] - boxes[:, 1]
            strides = torch.stack([stride_w, stride_h, stride_w, stride_h], dim=1)
            # `strides` is (N, 4) but `deltas` is (N, k*4): tile the per-box strides
            # across the k delta groups so the product is well-defined for k > 1.
            deltas = deltas * strides.repeat(1, deltas.shape[1] // 4)

        l = deltas[:, 0::4]
        t = deltas[:, 1::4]
        r = deltas[:, 2::4]
        b = deltas[:, 3::4]

        pred_boxes = torch.zeros_like(deltas)
        pred_boxes[:, 0::4] = ctr_x[:, None] - l  # x1
        pred_boxes[:, 1::4] = ctr_y[:, None] - t  # y1
        pred_boxes[:, 2::4] = ctr_x[:, None] + r  # x2
        pred_boxes[:, 3::4] = ctr_y[:, None] + b  # y2
        return pred_boxes
def _dense_box_regression_loss(
    anchors: List[Union[Boxes, torch.Tensor]],
    box2box_transform: Box2BoxTransform,
    pred_anchor_deltas: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
    fg_mask: torch.Tensor,
    box_reg_loss_type="smooth_l1",
    smooth_l1_beta=0.0,
):
    """
    Box regression loss for dense, multi-level predictions, summed over the
    entries selected by ``fg_mask``.

    Args:
        anchors: #lvl anchor boxes, each is (HixWixA, 4)
        box2box_transform: transform used to encode targets ("smooth_l1") or to
            decode predictions (IoU-based losses).
        pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
        gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
        fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
        box_reg_loss_type (str): one of "smooth_l1", "giou", "diou", "ciou".
        smooth_l1_beta (float): beta of the smooth L1 loss; the default 0.0 gives
            plain L1. Only used when `box_reg_loss_type` is "smooth_l1".

    Returns:
        The summed (``reduction="sum"``) regression loss.

    Raises:
        ValueError: if `box_reg_loss_type` is not one of the supported names.
    """
    # Flatten the per-level anchors into a single (R, 4) tensor.
    if isinstance(anchors[0], Boxes):
        anchors = type(anchors[0]).cat(anchors).tensor
    else:
        anchors = cat(anchors)

    if box_reg_loss_type == "smooth_l1":
        # Regress in delta space: encode every image's GT boxes against the anchors.
        target_deltas = torch.stack(
            [box2box_transform.get_deltas(anchors, boxes_i) for boxes_i in gt_boxes]
        )  # (N, R, 4)
        return smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[fg_mask],
            target_deltas[fg_mask],
            beta=smooth_l1_beta,
            reduction="sum",
        )

    # The IoU-family losses share one code path and differ only in the loss function.
    if box_reg_loss_type == "giou":
        iou_loss_fn = giou_loss
    elif box_reg_loss_type == "diou":
        iou_loss_fn = diou_loss
    elif box_reg_loss_type == "ciou":
        iou_loss_fn = ciou_loss
    else:
        raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")

    # Regress in box space: decode each image's predicted deltas into boxes.
    decoded = [
        box2box_transform.apply_deltas(deltas_i, anchors)
        for deltas_i in cat(pred_anchor_deltas, dim=1)
    ]
    return iou_loss_fn(
        torch.stack(decoded)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
    )
# Copyright (c) Facebook, Inc. and its affiliates.
from typing import List
import torch
from detectron2.layers import nonzero_tuple
# TODO: the name is too general
class Matcher:
    """
    Assigns a ground-truth element to every predicted "element" (e.g. a box).
    A prediction gets zero or one match; a ground-truth element may be matched to
    any number of predictions.

    Matching is driven by an MxN match_quality_matrix describing how well each
    (ground-truth, prediction) pair agrees, e.g. box IoU.

    Calling the matcher yields (a) a length-N vector with, for each prediction n,
    the index m in [0, M) of its matched ground-truth element, and (b) a length-N
    vector of labels for the predictions.
    """

    def __init__(
        self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
    ):
        """
        Args:
            thresholds (list): thresholds that split match quality into levels.
            labels (list): label for each level, one more entry than `thresholds`.
                Each label is one of {-1, 0, 1}, meaning
                {ignore, negative class, positive class}.
            allow_low_quality_matches (bool): if True, additionally mark as positive
                the predictions selected by :meth:`set_low_quality_matches_`.

        For example, with thresholds = [0.3, 0.5] and labels = [0, -1, 1]:
            iou < 0.3 is labeled 0 (treated as false positive in training),
            0.3 <= iou < 0.5 is labeled -1 (ignored),
            0.5 <= iou is labeled 1 (treated as true positive).
        """
        # Work on a copy and pad it with -inf / +inf so every level has two bounds.
        bounds = thresholds[:]
        assert bounds[0] > 0
        bounds.insert(0, -float("inf"))
        bounds.append(float("inf"))
        # Currently torchscript does not support all + generator
        assert all([lo <= hi for (lo, hi) in zip(bounds[:-1], bounds[1:])])
        assert all([lbl in [-1, 0, 1] for lbl in labels])
        assert len(labels) == len(bounds) - 1
        self.thresholds = bounds
        self.labels = labels
        self.allow_low_quality_matches = allow_low_quality_matches

    def __call__(self, match_quality_matrix):
        """
        Args:
            match_quality_matrix (Tensor[float]): MxN pairwise quality between M
                ground-truth elements and N predictions. All entries must be >= 0.

        Returns:
            matches (Tensor[int64]): length N; matches[i] is the matched
                ground-truth index in [0, M)
            match_labels (Tensor[int8]): length N; match_labels[i] tells whether
                prediction i is a true positive, false positive or ignored
        """
        assert match_quality_matrix.dim() == 2
        if match_quality_matrix.numel() == 0:
            num_preds = match_quality_matrix.size(1)
            # With no ground truth we define IOU = 0, so every prediction gets
            # `self.labels[0]` (usually background, 0). To ignore instead, use
            # labels=[-1,0,-1,1] with matching thresholds.
            empty_matches = match_quality_matrix.new_full((num_preds,), 0, dtype=torch.int64)
            empty_labels = match_quality_matrix.new_full(
                (num_preds,), self.labels[0], dtype=torch.int8
            )
            return empty_matches, empty_labels

        assert torch.all(match_quality_matrix >= 0)

        # Rows are ground truth, columns are predictions: reduce over dim 0 to get
        # the best ground-truth candidate of every prediction.
        best_quality, matches = match_quality_matrix.max(dim=0)

        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
        # Stamp each level's label onto the predictions whose best quality falls
        # in that level's half-open interval [lo, hi).
        for level_label, lo, hi in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
            in_level = (best_quality >= lo) & (best_quality < hi)
            match_labels[in_level] = level_label

        if self.allow_low_quality_matches:
            self.set_low_quality_matches_(match_labels, match_quality_matrix)

        return matches, match_labels

    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
        """
        Mark extra predictions as positive, in place: for every ground-truth
        element, the prediction(s) with maximum quality for it (ties included) get
        label 1 in `match_labels`.

        This function implements the RPN assignment case (i) in Sec. 3.1.2 of
        :paper:`Faster R-CNN`.
        """
        # Best quality achieved by any prediction, per ground-truth row.
        best_per_gt, _ = match_quality_matrix.max(dim=1)
        # Column indices of every entry that attains its row's maximum (ties kept).
        _, best_pred_inds = nonzero_tuple(match_quality_matrix == best_per_gt[:, None])
        # Only the label changes; the matched index is left alone. So a prediction
        # promoted because of gt_A while overlapping gt_B more keeps gt_B as its
        # match. This follows Detectron and is found to have no significant impact.
        match_labels[best_pred_inds] = 1
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
from .build import META_ARCH_REGISTRY, build_model # isort:skip
from .panoptic_fpn import PanopticFPN
# import all the meta_arch, so they will be registered
from .rcnn import GeneralizedRCNN, ProposalNetwork
from .dense_detector import DenseDetector
from .retinanet import RetinaNet
from .fcos import FCOS
from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head
__all__ = list(globals().keys())
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
from detectron2.utils.logger import _log_api_usage
from detectron2.utils.registry import Registry
# Registry of meta-architectures; `build_model` looks entries up by name.
META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip
META_ARCH_REGISTRY.__doc__ = """
Registry for meta-architectures, i.e. the whole model.
The registered object will be called with `obj(cfg)`
and expected to return a `nn.Module` object.
"""
def build_model(cfg):
    """
    Instantiate the meta-architecture named by ``cfg.MODEL.META_ARCHITECTURE``
    and move it to ``cfg.MODEL.DEVICE``.

    Note that it does not load any weights from ``cfg``.

    Returns:
        The object produced by calling the registered meta-architecture with ``cfg``.
    """
    arch_name = cfg.MODEL.META_ARCHITECTURE
    arch_factory = META_ARCH_REGISTRY.get(arch_name)
    model = arch_factory(cfg)
    target_device = torch.device(cfg.MODEL.DEVICE)
    model.to(target_device)
    _log_api_usage("modeling.meta_arch." + arch_name)
    return model
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import Tensor, nn
from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.layers import move_device_like
from detectron2.modeling import Backbone
from detectron2.structures import Boxes, ImageList, Instances
from detectron2.utils.events import get_event_storage
from ..postprocessing import detector_postprocess
def permute_to_N_HWA_K(tensor, K: int):
    """
    Rearrange a 4-D tensor of shape (N, (Ai x K), H, W) into (N, (HxWxAi), K).

    Raises:
        AssertionError: if `tensor` is not 4-dimensional.
    """
    assert tensor.dim() == 4, tensor.shape
    batch, _, height, width = tensor.shape
    # Split the channel dim into (Ai, K), move H and W in front of it, then
    # merge (H, W, Ai) into a single dim.
    per_anchor = tensor.view(batch, -1, K, height, width)
    return per_anchor.permute(0, 3, 4, 1, 2).reshape(batch, -1, K)  # Size=(N,HWA,K)
class DenseDetector(nn.Module):
    """
    Base class for dense detectors, i.e. fully-convolutional models that make
    per-pixel (dense) predictions.
    """

    def __init__(
        self,
        backbone: Backbone,
        head: nn.Module,
        head_in_features: Optional[List[str]] = None,
        *,
        pixel_mean,
        pixel_std,
    ):
        """
        Args:
            backbone: backbone module
            head: head module
            head_in_features: backbone features to use in head. When None, all
                backbone features are used, ordered by increasing stride.
            pixel_mean (Tuple[float]):
                Values to be used for image normalization (BGR order).
                To train on images of different number of channels, set different mean & std.
                Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
            pixel_std (Tuple[float]):
                When using pre-trained models in Detectron1 or any MSRA models,
                std has been absorbed into its conv1 weights, so the std needs to be set 1.
                Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
        """
        super().__init__()

        self.backbone = backbone
        self.head = head
        if head_in_features is not None:
            self.head_in_features = head_in_features
        else:
            out_shapes = self.backbone.output_shape()
            self.head_in_features = sorted(
                out_shapes.keys(), key=lambda name: out_shapes[name].stride
            )
        # Non-persistent buffers, shaped (C, 1, 1) to broadcast over (C, H, W) images.
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)

    @property
    def device(self):
        # The normalization buffer doubles as the reference for the model's device.
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        return move_device_like(x, self.pixel_mean)

    def forward(self, batched_inputs: List[Dict[str, Tensor]]):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
            loss. Used during training only. In inference, the standard output format, described
            in :doc:`/tutorials/models`.
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        # Keep only the feature maps consumed by the head, in head order.
        features = [features[name] for name in self.head_in_features]
        predictions = self.head(features)

        if self.training:
            assert not torch.jit.is_scripting(), "Not supported"
            assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
            gt_instances = [item["instances"].to(self.device) for item in batched_inputs]
            return self.forward_training(images, features, predictions, gt_instances)
        else:
            results = self.forward_inference(images, features, predictions)
            if torch.jit.is_scripting():
                return results

            # Rescale each image's detections to its requested output resolution,
            # falling back to the (pre-padding) image size.
            processed_results = []
            for per_image_result, per_image_input, size in zip(
                results, batched_inputs, images.image_sizes
            ):
                out_h = per_image_input.get("height", size[0])
                out_w = per_image_input.get("width", size[1])
                rescaled = detector_postprocess(per_image_result, out_h, out_w)
                processed_results.append({"instances": rescaled})
            return processed_results

    def forward_training(self, images, features, predictions, gt_instances):
        raise NotImplementedError()

    def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        normalized = [
            (self._move_to_current_device(item["image"]) - self.pixel_mean) / self.pixel_std
            for item in batched_inputs
        ]
        return ImageList.from_tensors(
            normalized,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )

    def _transpose_dense_predictions(
        self, predictions: List[List[Tensor]], dims_per_anchor: List[int]
    ) -> List[List[Tensor]]:
        """
        Transpose the dense per-level predictions.

        Args:
            predictions: a list of outputs, each is a list of per-level
                predictions with shape (N, Ai x K, Hi, Wi), where N is the
                number of images, Ai is the number of anchors per location on
                level i, K is the dimension of predictions per anchor.
            dims_per_anchor: the value of K for each predictions. e.g. 4 for
                box prediction, #classes for classification prediction.

        Returns:
            List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K).
        """
        assert len(predictions) == len(dims_per_anchor)
        transposed: List[List[Tensor]] = []
        for per_level, k_dim in zip(predictions, dims_per_anchor):
            transposed.append([permute_to_N_HWA_K(level, k_dim) for level in per_level])
        return transposed

    def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9):
        """
        Apply EMA update to `self.name` using `value`.

        This is mainly used for loss normalizer. In Detectron1, loss is normalized by number
        of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a
        large variance and using it lead to lower performance. Therefore we maintain an EMA of
        #foreground to stabilize the normalizer.

        Args:
            name: name of the normalizer
            value: the new value to update
            initial_value: the initial value to start with
            momentum: momentum of EMA

        Returns:
            float: the updated EMA value
        """
        # The first call has no stored attribute yet, so start from `initial_value`.
        previous = getattr(self, name, initial_value)
        updated = previous * momentum + value * (1 - momentum)
        setattr(self, name, updated)
        return updated

    def _decode_per_level_predictions(
        self,
        anchors: Boxes,
        pred_scores: Tensor,
        pred_deltas: Tensor,
        score_thresh: float,
        topk_candidates: int,
        image_size: Tuple[int, int],
    ) -> Instances:
        """
        Decode boxes and classification predictions of one feature level, by
        the following steps:
        1. filter the predictions based on score threshold and top K scores.
        2. transform the box regression outputs
        3. return the predicted scores, classes and boxes

        Args:
            anchors: Boxes, anchor for this feature level
            pred_scores: HxWxA,K
            pred_deltas: HxWxA,4

        Returns:
            Instances: with field "scores", "pred_boxes", "pred_classes".
        """
        # Two filters keep the later NMS cheap.
        # 1. Drop everything at or below the score threshold.
        above_thresh = pred_scores > score_thresh
        pred_scores = pred_scores[above_thresh]
        candidate_idxs = torch.nonzero(above_thresh)  # Kx2: (anchor index, class index)

        # 2. Of the survivors, keep at most `topk_candidates` best-scoring ones.
        num_candidates = candidate_idxs.shape[0]
        if isinstance(num_candidates, Tensor):
            # It's a tensor in tracing
            num_topk = torch.clamp(num_candidates, max=topk_candidates)
        else:
            num_topk = min(num_candidates, topk_candidates)
        pred_scores, order = pred_scores.topk(num_topk)
        candidate_idxs = candidate_idxs[order]

        anchor_idxs, classes_idxs = candidate_idxs.unbind(dim=1)

        pred_boxes = self.box2box_transform.apply_deltas(
            pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs]
        )
        return Instances(
            image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs
        )

    def _decode_multi_level_predictions(
        self,
        anchors: List[Boxes],
        pred_scores: List[Tensor],
        pred_deltas: List[Tensor],
        score_thresh: float,
        topk_candidates: int,
        image_size: Tuple[int, int],
    ) -> Instances:
        """
        Run `_decode_per_level_predictions` for all feature levels and concat the results.
        """
        per_level = [
            self._decode_per_level_predictions(
                level_anchors,
                level_scores,
                level_deltas,
                score_thresh,
                topk_candidates,
                image_size,
            )
            # One entry per feature level
            for level_scores, level_deltas, level_anchors in zip(pred_scores, pred_deltas, anchors)
        ]
        # 'Instances.cat' is not scriptable but this is
        return per_level[0].cat(per_level)

    def visualize_training(self, batched_inputs, results):
        """
        A function used to visualize ground truth images and final network predictions.
        It shows ground truth bounding boxes on the original image and up to 20
        predicted object bounding boxes on the original image.

        Args:
            batched_inputs (list): a list that contains input to the model.
            results (List[Instances]): a list of #images elements returned by forward_inference().
        """
        from detectron2.utils.visualizer import Visualizer

        assert len(batched_inputs) == len(
            results
        ), "Cannot visualize inputs and results of different sizes"
        storage = get_event_storage()
        max_boxes = 20

        image_index = 0  # only visualize a single image
        sample = batched_inputs[image_index]
        img = convert_image_to_rgb(sample["image"].permute(1, 2, 0), self.input_format)

        # Top panel: ground-truth boxes.
        gt_vis = Visualizer(img, None).overlay_instances(boxes=sample["instances"].gt_boxes)
        anno_img = gt_vis.get_image()

        # Bottom panel: the first `max_boxes` predicted boxes.
        processed = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
        predicted_boxes = processed.pred_boxes.tensor.detach().cpu().numpy()
        pred_vis = Visualizer(img, None).overlay_instances(boxes=predicted_boxes[0:max_boxes])
        prop_img = pred_vis.get_image()

        # Stack the panels vertically and convert HWC -> CHW for the event storage.
        vis_img = np.vstack((anno_img, prop_img)).transpose(2, 0, 1)
        vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
        storage.put_image(vis_name, vis_img)
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
from typing import List, Optional, Tuple
import torch
from fvcore.nn import sigmoid_focal_loss_jit
from torch import nn
from torch.nn import functional as F
from detectron2.layers import ShapeSpec, batched_nms
from detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance
from detectron2.utils.events import get_event_storage
from ..anchor_generator import DefaultAnchorGenerator
from ..backbone import Backbone
from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss
from .dense_detector import DenseDetector
from .retinanet import RetinaNetHead
__all__ = ["FCOS"]
logger = logging.getLogger(__name__)
class FCOS(DenseDetector):
"""
Implement FCOS in :paper:`fcos`.
"""
def __init__(
    self,
    *,
    backbone: Backbone,
    head: nn.Module,
    head_in_features: Optional[List[str]] = None,
    box2box_transform=None,
    num_classes,
    center_sampling_radius: float = 1.5,
    focal_loss_alpha=0.25,
    focal_loss_gamma=2.0,
    test_score_thresh=0.2,
    test_topk_candidates=1000,
    test_nms_thresh=0.6,
    max_detections_per_image=100,
    pixel_mean,
    pixel_std,
):
    """
    Args:
        center_sampling_radius: radius of the "center" of a groundtruth box,
            within which all anchor points are labeled positive.
        box2box_transform: box regression transform; when None, a
            ``Box2BoxTransformLinear(normalize_by_size=True)`` is used.
        Other arguments mean the same as in :class:`RetinaNet`.
    """
    super().__init__(
        backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
    )

    self.num_classes = num_classes

    # FCOS uses one anchor point per location. Each point is represented by a
    # square box whose size equals the stride of its feature level.
    out_shapes = backbone.output_shape()
    level_strides = [out_shapes[name].stride for name in self.head_in_features]
    self.anchor_generator = DefaultAnchorGenerator(
        sizes=[[stride] for stride in level_strides],
        aspect_ratios=[1.0],
        strides=level_strides,
    )

    # FCOS parameterizes box regression by a linear transform,
    # where predictions are normalized by anchor stride (equal to anchor size).
    if box2box_transform is None:
        box2box_transform = Box2BoxTransformLinear(normalize_by_size=True)
    self.box2box_transform = box2box_transform

    self.center_sampling_radius = float(center_sampling_radius)

    # Loss parameters:
    self.focal_loss_alpha = focal_loss_alpha
    self.focal_loss_gamma = focal_loss_gamma

    # Inference parameters:
    self.test_score_thresh = test_score_thresh
    self.test_topk_candidates = test_topk_candidates
    self.test_nms_thresh = test_nms_thresh
    self.max_detections_per_image = max_detections_per_image
def forward_training(self, images, features, predictions, gt_instances):
    """
    Compute the training losses from the head's raw `predictions`.

    The head outputs are unpacked as (class logits, box deltas, centerness),
    with per-anchor dimensions (num_classes, 4, 1) respectively.
    """
    # Transpose the Hi*Wi*A dimension to the middle:
    transposed = self._transpose_dense_predictions(predictions, [self.num_classes, 4, 1])
    pred_logits, pred_anchor_deltas, pred_centerness = transposed

    anchors = self.anchor_generator(features)
    gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
    return self.losses(
        anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
    )
@torch.no_grad()
def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
    """
    Match ground-truth boxes to a set of multi-level anchors.

    Three boolean conditions are AND-ed together (center sampling, point
    inside the box, per-level scale range); the surviving matches are then
    weighted so that smaller ground-truth boxes get a higher quality value.

    Args:
        gt_boxes: Ground-truth boxes from instances of an image.
        anchors: List of anchors for each feature map (of different scales).

    Returns:
        torch.Tensor
            A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
            `R` anchor points from all feature levels, indicating the quality
            of match between m-th box and r-th anchor. Higher value indicates
            better match. Non-matching pairs have value 0.
    """
    # Naming convention: (M = ground-truth boxes, R = anchor points)
    # Anchor points are represented as square boxes of size = stride.
    num_anchors_per_level = [len(x) for x in anchors]
    anchors = Boxes.cat(anchors)  # (R, 4)
    anchor_centers = anchors.get_centers()  # (R, 2)
    anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0]  # (R, )

    # Per-anchor scale range (4 * size, 8 * size). The first level has no
    # lower bound and the last level has no upper bound.
    lower_bound = anchor_sizes * 4
    lower_bound[: num_anchors_per_level[0]] = 0
    upper_bound = anchor_sizes * 8
    upper_bound[-num_anchors_per_level[-1] :] = float("inf")

    gt_centers = gt_boxes.get_centers()

    # FCOS with center sampling: anchor point must be close enough to
    # ground-truth box center.
    # Broadcasting (1, R, 2) against (M, 1, 2) gives per-pair |dx|, |dy|: (M, R, 2)
    center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_()
    sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]
    # The larger of |dx|, |dy| must be below the sampling radius.
    match_quality_matrix = center_dists.max(dim=2).values < sampling_regions

    # NOTE(review): presumably the last dim holds the distances from the point
    # to the four sides of the box -- confirm against pairwise_point_box_distance.
    pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
    pairwise_dist = pairwise_dist.permute(1, 0, 2)  # (M, R, 4)

    # The original FCOS anchor matching rule: anchor point must be inside GT.
    match_quality_matrix &= pairwise_dist.min(dim=2).values > 0

    # Multilevel anchor matching in FCOS: each anchor is only responsible
    # for certain scale range.
    pairwise_dist = pairwise_dist.max(dim=2).values
    match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
        pairwise_dist < upper_bound[None, :]
    )
    # Match the GT box with minimum area, if there are multiple GT matches.
    gt_areas = gt_boxes.area()  # (M, )

    # Turn the boolean mask into a score: matched pairs get (1e8 - area), so a
    # later max over the M dimension prefers the smallest matching box.
    match_quality_matrix = match_quality_matrix.to(torch.float32)
    match_quality_matrix *= 1e8 - gt_areas[:, None]
    return match_quality_matrix  # (M, R)
@torch.no_grad()
def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
    """
    Assign a class label and a ground-truth box to every anchor of every image.

    Same interface as :meth:`RetinaNet.label_anchors`, but implemented with the
    FCOS anchor matching rule. Unlike RetinaNet, there are no ignored anchors:
    an anchor is either matched to a ground-truth box or labeled background
    (``self.num_classes``).

    Returns:
        (list[Tensor], list[Tensor]): per-image labels and per-image matched
        ground-truth boxes, one entry per element of ``gt_instances``.
    """
    labels_per_image, boxes_per_image = [], []
    for instances_i in gt_instances:
        if len(instances_i) == 0:
            # No ground truth in this image: every anchor is background and
            # the matched boxes are all-zero placeholders.
            boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor)
            labels_i = torch.full(
                (len(boxes_i),),
                fill_value=self.num_classes,
                dtype=torch.long,
                device=boxes_i.device,
            )
        else:
            quality_matrix = self._match_anchors(instances_i.gt_boxes, anchors)
            # Best ground-truth box per anchor. Anchors whose best quality is
            # (almost) zero are un-matched and get index -1. This is equivalent
            # to an R-CNN/RetinaNet style matcher:
            # `Matcher(thresholds=[1e-5], labels=[0, 1])`
            best_quality, matched_idxs = quality_matrix.max(dim=0)
            matched_idxs[best_quality < 1e-5] = -1

            # Clip so that un-matched anchors still index a valid row; their
            # label is overwritten with background right below.
            gather_idxs = matched_idxs.clip(min=0)
            boxes_i = instances_i.gt_boxes.tensor[gather_idxs]
            labels_i = instances_i.gt_classes[gather_idxs]
            labels_i[matched_idxs < 0] = self.num_classes

        labels_per_image.append(labels_i)
        boxes_per_image.append(boxes_i)
    return labels_per_image, boxes_per_image
def losses(
    self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
):
    """
    Compute the classification, box regression and centerness losses.

    This method is almost identical to :meth:`RetinaNet.losses`, with an extra
    centerness term in the returned dict.

    Returns:
        dict[str, Tensor]: keys "loss_fcos_cls", "loss_fcos_loc" and
        "loss_fcos_ctr", each a summed loss divided by an EMA-smoothed count
        of positive anchors.
    """
    num_images = len(gt_labels)
    gt_labels = torch.stack(gt_labels)  # one row of R labels per image

    # Positive anchors: a valid label that is not the background index.
    pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
    num_pos_anchors = pos_mask.sum().item()
    get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
    normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300)

    # Classification: one-hot targets with the last (background) column
    # dropped, so background anchors have an all-zero target.
    one_hot = F.one_hot(gt_labels, num_classes=self.num_classes + 1)
    gt_labels_target = one_hot[:, :, :-1]
    all_logits = torch.cat(pred_logits, dim=1)
    loss_cls = sigmoid_focal_loss_jit(
        all_logits,
        gt_labels_target.to(pred_logits[0].dtype),
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )

    # Box regression, restricted to positive anchors via pos_mask.
    loss_box_reg = _dense_box_regression_loss(
        anchors,
        self.box2box_transform,
        pred_anchor_deltas,
        gt_boxes,
        pos_mask,
        box_reg_loss_type="giou",
    )

    # Centerness: only entries selected by pos_mask contribute to the loss.
    ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes)
    ctrness_logits = torch.cat(pred_centerness, dim=1).squeeze(dim=2)
    ctrness_loss = F.binary_cross_entropy_with_logits(
        ctrness_logits[pos_mask], ctrness_targets[pos_mask], reduction="sum"
    )

    summed_losses = {
        "loss_fcos_cls": loss_cls,
        "loss_fcos_loc": loss_box_reg,
        "loss_fcos_ctr": ctrness_loss,
    }
    return {name: total / normalizer for name, total in summed_losses.items()}
def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
    """
    Compute the centerness regression target for every (image, anchor) pair.

    The target is ``sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))``
    where ``l, t, r, b`` are components 0, 1, 2, 3 of the deltas returned by
    ``self.box2box_transform.get_deltas``.

    Args:
        anchors: per-level anchors; they are concatenated into one Rx4 tensor.
        gt_boxes: one tensor of matched ground-truth boxes per image.

    Returns:
        Tensor of shape (N, R), with N = len(gt_boxes). For an empty
        ``gt_boxes`` list an empty (0, R) tensor is returned.

    NOTE: the result is computed for every anchor, including ones that are
    not positive; such entries can be NaN (negative ratio under the square
    root, or 0 / 0). :meth:`losses` only reads the entries selected by its
    ``pos_mask``.
    """
    anchors = Boxes.cat(anchors).tensor  # Rx4
    if len(gt_boxes) == 0:
        # torch.stack raises on an empty list, so the empty batch has to be
        # handled before stacking (the original guard came after the stack and
        # could never be reached).
        return anchors.new_zeros((0, anchors.shape[0]))

    reg_targets = torch.stack(
        [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes], dim=0
    )  # NxRx4
    left_right = reg_targets[:, :, [0, 2]]
    top_bottom = reg_targets[:, :, [1, 3]]
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
        top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
    )
    return torch.sqrt(ctrness)
def forward_inference(
    self,
    images: ImageList,
    features: List[torch.Tensor],
    predictions: List[List[torch.Tensor]],
):
    """
    Run inference on a batch and return one :class:`Instances` per image.

    Args:
        images: the batched input images; only ``image_sizes`` is read here.
        features: per-level feature maps, passed to the anchor generator.
        predictions: raw head outputs (class logits, box deltas, centerness).

    NOTE: the sigmoid is applied in place on the per-image slices of the
    transposed logits and centerness tensors.
    """
    pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
        predictions, [self.num_classes, 4, 1]
    )
    anchors = self.anchor_generator(features)

    results: List[Instances] = []
    for img_idx, image_size in enumerate(images.image_sizes):
        # Final score per level = sqrt(class probability * centerness)
        # (See eqn. 4 in https://arxiv.org/abs/2006.09214)
        scores_per_image = []
        for logits_lvl, ctrness_lvl in zip(pred_logits, pred_centerness):
            cls_prob = logits_lvl[img_idx].sigmoid_()
            ctr_prob = ctrness_lvl[img_idx].sigmoid_()
            scores_per_image.append(torch.sqrt(cls_prob * ctr_prob))

        deltas_per_image = [deltas_lvl[img_idx] for deltas_lvl in pred_anchor_deltas]
        results.append(
            self.inference_single_image(
                anchors, scores_per_image, deltas_per_image, image_size
            )
        )
    return results
def inference_single_image(
    self,
    anchors: List[Boxes],
    box_cls: List[torch.Tensor],
    box_delta: List[torch.Tensor],
    image_size: Tuple[int, int],
):
    """
    Decode the multi-level predictions of one image and apply class-wise NMS.

    Identical to :meth:`RetinaNet.inference_single_image`.

    Args:
        anchors: per-level anchors.
        box_cls: per-level scores for this image.
        box_delta: per-level box deltas for this image.
        image_size: size of this image, passed to the decoding step.

    Returns:
        The decoded predictions kept by NMS, truncated to at most
        ``self.max_detections_per_image`` entries.
    """
    pred = self._decode_multi_level_predictions(
        anchors,
        box_cls,
        box_delta,
        self.test_score_thresh,
        self.test_topk_candidates,
        image_size,
    )
    kept = batched_nms(
        pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
    )
    top_kept = kept[: self.max_detections_per_image]
    return pred[top_kept]
class FCOSHead(RetinaNetHead):
    """
    The head used in :paper:`fcos`. It extends :class:`RetinaNetHead` with one
    extra centerness prediction branch that shares the box regression tower.
    """

    def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs):
        """
        Args:
            input_shape: one entry per input feature level.
            conv_dims: passed to the base head; its last entry is the input
                channel count of the centerness conv.
            **kwargs: forwarded to :class:`RetinaNetHead`.
        """
        # FCOS predicts from a single anchor point per location.
        super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs)
        # Unlike original FCOS, we do not add an additional learnable scale layer
        # because it's found to have no benefits after normalizing regression targets by stride.
        self._num_features = len(input_shape)

        # Single-channel 3x3 conv for the centerness logit.
        self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1)
        nn.init.normal_(self.ctrness.weight, std=0.01)
        nn.init.constant_(self.ctrness.bias, 0)

    def forward(self, features):
        """
        Args:
            features: one feature map per level; the count must equal the
                number of input shapes given at construction.

        Returns:
            (logits, bbox_reg, ctrness): three lists with one tensor per level.
        """
        assert len(features) == self._num_features
        logits, bbox_reg, ctrness = [], [], []
        for level_feature in features:
            cls_tower = self.cls_subnet(level_feature)
            logits.append(self.cls_score(cls_tower))
            # Box deltas and centerness are both predicted from the box tower.
            box_tower = self.bbox_subnet(level_feature)
            bbox_reg.append(self.bbox_pred(box_tower))
            ctrness.append(self.ctrness(box_tower))
        return logits, bbox_reg, ctrness
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
from typing import Dict, List
import torch
from torch import nn
from detectron2.config import configurable
from detectron2.structures import ImageList
from ..postprocessing import detector_postprocess, sem_seg_postprocess
from .build import META_ARCH_REGISTRY
from .rcnn import GeneralizedRCNN
from .semantic_seg import build_sem_seg_head
__all__ = ["PanopticFPN"]
@META_ARCH_REGISTRY.register()
class PanopticFPN(GeneralizedRCNN):
    """
    Implement the paper :paper:`PanopticFPN`.

    A :class:`GeneralizedRCNN` with an additional semantic segmentation head.
    At inference the instance and semantic outputs are merged into a panoptic
    segmentation by :func:`combine_semantic_and_instance_outputs`.
    """

    @configurable
    def __init__(
        self,
        *,
        sem_seg_head: nn.Module,
        combine_overlap_thresh: float = 0.5,
        combine_stuff_area_thresh: float = 4096,
        combine_instances_score_thresh: float = 0.5,
        **kwargs,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            sem_seg_head: a module for the semantic segmentation head.
            combine_overlap_thresh: combine masks into one instances if
                they have enough overlap
            combine_stuff_area_thresh: ignore stuff areas smaller than this threshold
            combine_instances_score_thresh: ignore instances whose score is
                smaller than this threshold

        Other arguments are the same as :class:`GeneralizedRCNN`.
        """
        super().__init__(**kwargs)
        self.sem_seg_head = sem_seg_head
        # options when combining instance & semantic outputs
        self.combine_overlap_thresh = combine_overlap_thresh
        self.combine_stuff_area_thresh = combine_stuff_area_thresh
        self.combine_instances_score_thresh = combine_instances_score_thresh

    @classmethod
    def from_config(cls, cfg):
        """
        Build the constructor arguments from a config.

        Extends the arguments of :meth:`GeneralizedRCNN.from_config` with the
        combine thresholds and the semantic segmentation head, and applies the
        deprecated ``INSTANCE_LOSS_WEIGHT`` option to the ROI heads.
        """
        ret = super().from_config(cfg)
        ret.update(
            {
                "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH,
                "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT,
                "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH,  # noqa
            }
        )
        ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape())
        logger = logging.getLogger(__name__)
        if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED:
            # The message names the key exactly as it is spelled in the config.
            logger.warning(
                "PANOPTIC_FPN.COMBINE.ENABLED is no longer used. "
                "model.inference(do_postprocess=) should be used to toggle postprocessing."
            )
        if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0:
            w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
            logger.warning(
                "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head."
            )

            def update_weight(x):
                # A loss weight is either a dict of per-loss weights or a scalar.
                if isinstance(x, dict):
                    return {k: v * w for k, v in x.items()}
                else:
                    return x * w

            roi_heads = ret["roi_heads"]
            roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight)
            roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight)
        return ret

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                * "image": Tensor, image in (C, H, W) format.
                * "instances": Instances
                * "sem_seg": semantic segmentation ground truth.
                * Other information that's included in the original dicts, such as:
                  "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            In training, a dict of losses (semantic segmentation, proposal and
            detector losses merged into one dict).

            In inference, list[dict]:
                each dict has the results for one image. The dict contains the following keys:

                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
                * "panoptic_seg": See the return value of
                  :func:`combine_semantic_and_instance_outputs` for its format.
        """
        if not self.training:
            return self.inference(batched_inputs)
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        assert "sem_seg" in batched_inputs[0]
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        # Batch the ground truth with the head's ignore value as padding.
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg,
            self.backbone.size_divisibility,
            self.sem_seg_head.ignore_value,
            self.backbone.padding_constraints,
        ).tensor
        # Only the losses are needed in training; predictions are discarded.
        _, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)

        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)

        losses = sem_seg_losses
        losses.update(proposal_losses)
        losses.update(detector_losses)
        return losses

    def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True):
        """
        Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`
            do_postprocess (bool): whether to apply post-processing on the outputs.

        Returns:
            When do_postprocess=True, see docs in :meth:`forward`.
            Otherwise, returns a (list[Instances], list[Tensor]) that contains
            the raw detector outputs, and raw semantic segmentation outputs.
        """
        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)
        # No targets are given, so the loss output of each head is ignored.
        sem_seg_results, _ = self.sem_seg_head(features, None)
        proposals, _ = self.proposal_generator(images, features, None)
        detector_results, _ = self.roi_heads(images, features, proposals, None)

        if do_postprocess:
            processed_results = []
            for sem_seg_result, detector_result, input_per_image, image_size in zip(
                sem_seg_results, detector_results, batched_inputs, images.image_sizes
            ):
                # Fall back to the network input size when no output size is given.
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
                detector_r = detector_postprocess(detector_result, height, width)

                processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})

                panoptic_r = combine_semantic_and_instance_outputs(
                    detector_r,
                    sem_seg_r.argmax(dim=0),
                    self.combine_overlap_thresh,
                    self.combine_stuff_area_thresh,
                    self.combine_instances_score_thresh,
                )
                processed_results[-1]["panoptic_seg"] = panoptic_r
            return processed_results
        else:
            return detector_results, sem_seg_results
def combine_semantic_and_instance_outputs(
    instance_results,
    semantic_results,
    overlap_threshold,
    stuff_area_thresh,
    instances_score_thresh,
):
    """
    Merge instance and semantic predictions into one panoptic segmentation,
    following the simple combining logic of
    "combine_semantic_and_instance_predictions.py" in panopticapi.

    Instances are pasted first, in decreasing score order; semantic ("stuff")
    regions then fill whatever pixels are still unassigned.

    Args:
        instance_results: output of :func:`detector_postprocess`. The fields
            `scores`, `pred_masks` and `pred_classes` are read.
        semantic_results: an (H, W) tensor, each element is the contiguous semantic
            category id
        overlap_threshold: an instance is dropped when the fraction of its mask
            already covered by earlier segments is larger than this value.
        stuff_area_thresh: a stuff region is dropped when its remaining area is
            smaller than this value.
        instances_score_thresh: instances scoring below this value are ignored.

    Returns:
        panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
            Pixels that belong to no segment are 0.
        segments_info (list[dict]): Describe each segment in `panoptic_seg`.
            Each dict contains keys "id", "category_id", "isthing". Thing segments
            also carry "score" and "instance_id"; stuff segments carry "area".
    """
    panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)
    segments_info = []
    current_segment_id = 0

    scores = instance_results.scores
    # visit instances from the highest score to the lowest
    visit_order = torch.argsort(-scores)
    all_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)

    # Paste instances one at a time, resolving overlaps in favor of earlier ones
    for inst_id in visit_order:
        score = scores[inst_id].item()
        if score < instances_score_thresh:
            # Sorted by score: every remaining instance is below the threshold too.
            break

        inst_mask = all_masks[inst_id]  # H,W
        inst_area = inst_mask.sum().item()
        if inst_area == 0:
            continue

        overlap_area = ((inst_mask > 0) & (panoptic_seg > 0)).sum().item()
        if overlap_area * 1.0 / inst_area > overlap_threshold:
            continue
        if overlap_area > 0:
            # Keep only the pixels that are not claimed yet.
            inst_mask = inst_mask & (panoptic_seg == 0)

        current_segment_id += 1
        panoptic_seg[inst_mask] = current_segment_id
        thing_info = {
            "id": current_segment_id,
            "isthing": True,
            "score": score,
            "category_id": instance_results.pred_classes[inst_id].item(),
            "instance_id": inst_id.item(),
        }
        segments_info.append(thing_info)

    # Fill the still-empty pixels with semantic (stuff) regions
    for semantic_label in torch.unique(semantic_results).cpu().tolist():
        if semantic_label == 0:  # 0 is a special "thing" class
            continue
        stuff_mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
        stuff_area = stuff_mask.sum().item()
        if stuff_area < stuff_area_thresh:
            continue

        current_segment_id += 1
        panoptic_seg[stuff_mask] = current_segment_id
        stuff_info = {
            "id": current_segment_id,
            "isthing": False,
            "category_id": semantic_label,
            "area": stuff_area,
        }
        segments_info.append(stuff_info)

    return panoptic_seg, segments_info
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn
from detectron2.config import configurable
from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.layers import move_device_like
from detectron2.structures import ImageList, Instances
from detectron2.utils.events import get_event_storage
from detectron2.utils.logger import log_first_n
from ..backbone import Backbone, build_backbone
from ..postprocessing import detector_postprocess
from ..proposal_generator import build_proposal_generator
from ..roi_heads import build_roi_heads
from .build import META_ARCH_REGISTRY
__all__ = ["GeneralizedRCNN", "ProposalNetwork"]
@META_ARCH_REGISTRY.register()
class GeneralizedRCNN(nn.Module):
    """
    Generalized R-CNN. Any model made of the following three components:

    1. Per-image feature extraction (aka backbone)
    2. Region proposal generation
    3. Per-region feature extraction and prediction
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        roi_heads: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
        input_format: Optional[str] = None,
        vis_period: int = 0,
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            roi_heads: a ROI head that performs per-region computation
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
            input_format: describe the meaning of channels of input. Needed by visualization
            vis_period: the period to run visualization. Set to 0 to disable.
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads

        self.input_format = input_format
        self.vis_period = vis_period
        if vis_period > 0:
            assert input_format is not None, "input_format is required for visualization!"

        # Normalization constants are reshaped to (C, 1, 1) so they broadcast
        # over an image, and registered as non-persistent buffers.
        mean = torch.tensor(pixel_mean).view(-1, 1, 1)
        std = torch.tensor(pixel_std).view(-1, 1, 1)
        self.register_buffer("pixel_mean", mean, persistent=False)
        self.register_buffer("pixel_std", std, persistent=False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    @classmethod
    def from_config(cls, cfg):
        """Translate a config into the keyword arguments of ``__init__``."""
        backbone = build_backbone(cfg)
        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
        roi_heads = build_roi_heads(cfg, backbone.output_shape())
        return {
            "backbone": backbone,
            "proposal_generator": proposal_generator,
            "roi_heads": roi_heads,
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        """Device of the model, taken from the ``pixel_mean`` buffer."""
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        """Move ``x`` to wherever the ``pixel_mean`` buffer lives."""
        return move_device_like(x, self.pixel_mean)

    def visualize_training(self, batched_inputs, proposals):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image and up to 20 top-scoring predicted
        object proposals on the original image. Users can implement different
        visualization functions for different models.

        Only the first image of the batch is visualized.

        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list that contains predicted proposals. Both
                batched_inputs and proposals should have the same length.
        """
        from detectron2.utils.visualizer import Visualizer

        storage = get_event_storage()
        max_vis_prop = 20

        for sample, sample_proposals in zip(batched_inputs, proposals):
            # (C, H, W) -> (H, W, C), then convert to RGB for drawing.
            rgb_img = convert_image_to_rgb(sample["image"].permute(1, 2, 0), self.input_format)

            gt_vis = Visualizer(rgb_img, None)
            gt_vis = gt_vis.overlay_instances(boxes=sample["instances"].gt_boxes)
            anno_img = gt_vis.get_image()

            num_shown = min(len(sample_proposals.proposal_boxes), max_vis_prop)
            shown_boxes = sample_proposals.proposal_boxes[0:num_shown].tensor.cpu().numpy()
            pred_vis = Visualizer(rgb_img, None)
            pred_vis = pred_vis.overlay_instances(boxes=shown_boxes)
            prop_img = pred_vis.get_image()

            # Ground truth on the left, proposals on the right, as (C, H, W).
            side_by_side = np.concatenate((anno_img, prop_img), axis=1)
            side_by_side = side_by_side.transpose(2, 0, 1)
            vis_name = "Left: GT bounding boxes; Right: Predicted proposals"
            storage.put_image(vis_name, side_by_side)
            break  # only visualize one image in a batch

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances (optional): groundtruth :class:`Instances`
                * proposals (optional): :class:`Instances`, precomputed proposals.

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            In training, a dict of detector and proposal losses.

            In inference, list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "instances" whose value is a :class:`Instances`.
                The :class:`Instances` object has the following keys:
                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        gt_instances = (
            [x["instances"].to(self.device) for x in batched_inputs]
            if "instances" in batched_inputs[0]
            else None
        )

        features = self.backbone(images.tensor)

        if self.proposal_generator is None:
            # Without a proposal generator, proposals must come with the inputs.
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            proposal_losses = {}
        else:
            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)

        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        # Detector losses first, then proposal losses.
        return {**detector_losses, **proposal_losses}

    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        """
        Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`
            detected_instances (None or list[Instances]): if not None, it
                contains an `Instances` object per image. The `Instances`
                object contains "pred_boxes" and "pred_classes" which are
                known boxes in the image.
                The inference will then skip the detection of bounding boxes,
                and only predict other per-ROI outputs.
            do_postprocess (bool): whether to apply post-processing on the outputs.

        Returns:
            When do_postprocess=True, same as in :meth:`forward`.
            Otherwise, a list[Instances] containing raw network outputs.
        """
        assert not self.training

        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        if detected_instances is not None:
            # Boxes are given: only run the remaining per-ROI predictions.
            detected_instances = [x.to(self.device) for x in detected_instances]
            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
        else:
            if self.proposal_generator is None:
                assert "proposals" in batched_inputs[0]
                proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            else:
                proposals, _ = self.proposal_generator(images, features, None)
            results, _ = self.roi_heads(images, features, proposals, None)

        if not do_postprocess:
            return results
        assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
        return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)

    def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        normalized = [
            (self._move_to_current_device(x["image"]) - self.pixel_mean) / self.pixel_std
            for x in batched_inputs
        ]
        return ImageList.from_tensors(
            normalized,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )

    @staticmethod
    def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes):
        """
        Rescale the output instances to the target size.

        The target size is the "height"/"width" of each input dict, falling
        back to the corresponding entry of ``image_sizes`` when absent.
        """
        # note: private function; subject to changes
        processed_results = []
        for per_image, sample, size in zip(instances, batched_inputs, image_sizes):
            out_height = sample.get("height", size[0])
            out_width = sample.get("width", size[1])
            rescaled = detector_postprocess(per_image, out_height, out_width)
            processed_results.append({"instances": rescaled})
        return processed_results
@META_ARCH_REGISTRY.register()
class ProposalNetwork(nn.Module):
    """
    A meta architecture that only predicts object proposals.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        # (C, 1, 1) non-persistent buffers that broadcast over an image.
        mean = torch.tensor(pixel_mean).view(-1, 1, 1)
        std = torch.tensor(pixel_std).view(-1, 1, 1)
        self.register_buffer("pixel_mean", mean, persistent=False)
        self.register_buffer("pixel_std", std, persistent=False)

    @classmethod
    def from_config(cls, cfg):
        """Translate a config into the keyword arguments of ``__init__``."""
        backbone = build_backbone(cfg)
        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
        return {
            "backbone": backbone,
            "proposal_generator": proposal_generator,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        """Device of the model, taken from the ``pixel_mean`` buffer."""
        return self.pixel_mean.device

    def _move_to_current_device(self, x):
        """Move ``x`` to wherever the ``pixel_mean`` buffer lives."""
        return move_device_like(x, self.pixel_mean)

    def forward(self, batched_inputs):
        """
        Args:
            Same as in :class:`GeneralizedRCNN.forward`

        Returns:
            In training, the dict of proposal losses.

            Otherwise, list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "proposals" whose value is a
                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
        """
        # Normalize, pad and batch the input images.
        normalized = [
            (self._move_to_current_device(x["image"]) - self.pixel_mean) / self.pixel_std
            for x in batched_inputs
        ]
        images = ImageList.from_tensors(
            normalized,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        features = self.backbone(images.tensor)

        # Ground truth is looked up under "instances", or the legacy "targets" key.
        first_input = batched_inputs[0]
        if "instances" in first_input:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        elif "targets" in first_input:
            log_first_n(
                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
            )
            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        # In training, the proposals are not useful at all but we generate them anyway.
        # This makes RPN-only models about 5% slower.
        if self.training:
            return proposal_losses

        # Rescale each image's proposals to the requested output resolution,
        # falling back to the network input size.
        processed_results = []
        for per_image, sample, size in zip(proposals, batched_inputs, images.image_sizes):
            out_height = sample.get("height", size[0])
            out_width = sample.get("width", size[1])
            rescaled = detector_postprocess(per_image, out_height, out_width)
            processed_results.append({"proposals": rescaled})
        return processed_results
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import math
from typing import List, Tuple
import torch
from fvcore.nn import sigmoid_focal_loss_jit
from torch import Tensor, nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm
from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
from detectron2.utils.events import get_event_storage
from ..anchor_generator import build_anchor_generator
from ..backbone import Backbone, build_backbone
from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
from ..matcher import Matcher
from .build import META_ARCH_REGISTRY
from .dense_detector import DenseDetector, permute_to_N_HWA_K # noqa
__all__ = ["RetinaNet"]
logger = logging.getLogger(__name__)
@META_ARCH_REGISTRY.register()
class RetinaNet(DenseDetector):
"""
Implement RetinaNet in :paper:`RetinaNet`.
"""
@configurable
def __init__(
    self,
    *,
    backbone: Backbone,
    head: nn.Module,
    head_in_features,
    anchor_generator,
    box2box_transform,
    anchor_matcher,
    num_classes,
    focal_loss_alpha=0.25,
    focal_loss_gamma=2.0,
    smooth_l1_beta=0.0,
    box_reg_loss_type="smooth_l1",
    test_score_thresh=0.05,
    test_topk_candidates=1000,
    test_nms_thresh=0.5,
    max_detections_per_image=100,
    pixel_mean,
    pixel_std,
    vis_period=0,
    input_format="BGR",
):
    """
    NOTE: this interface is experimental.

    Args:
        backbone: a backbone module, must follow detectron2's backbone interface
        head (nn.Module): a module that predicts logits and regression deltas
            for each level from a list of per-level features
        head_in_features (Tuple[str]): Names of the input feature maps to be used in head
        anchor_generator (nn.Module): a module that creates anchors from a
            list of features. Usually an instance of :class:`AnchorGenerator`
        box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
            instance boxes
        anchor_matcher (Matcher): label the anchors by matching them with ground truth.
        num_classes (int): number of classes. Used to label background proposals.

        # Loss parameters:
        focal_loss_alpha (float): focal_loss_alpha
        focal_loss_gamma (float): focal_loss_gamma
        smooth_l1_beta (float): smooth_l1_beta
        box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou"

        # Inference parameters:
        test_score_thresh (float): Inference cls score threshold, only anchors with
            score > INFERENCE_TH are considered for inference (to improve speed)
        test_topk_candidates (int): Select topk candidates before NMS
        test_nms_thresh (float): Overlap threshold used for non-maximum suppression
            (suppress boxes with IoU >= this threshold)
        max_detections_per_image (int):
            Maximum number of detections to return per image during inference
            (100 is based on the limit established for the COCO dataset).

        pixel_mean, pixel_std: see :class:`DenseDetector`.
        vis_period, input_format: stored on the instance as visualization
            parameters.
    """
    super().__init__(
        backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
    )
    self.num_classes = num_classes

    # Anchor generation, box encoding and anchor/ground-truth matching
    self.anchor_generator = anchor_generator
    self.box2box_transform = box2box_transform
    self.anchor_matcher = anchor_matcher

    # Loss configuration
    self.focal_loss_alpha = focal_loss_alpha
    self.focal_loss_gamma = focal_loss_gamma
    self.smooth_l1_beta = smooth_l1_beta
    self.box_reg_loss_type = box_reg_loss_type

    # Inference configuration
    self.test_score_thresh = test_score_thresh
    self.test_topk_candidates = test_topk_candidates
    self.test_nms_thresh = test_nms_thresh
    self.max_detections_per_image = max_detections_per_image

    # Visualization configuration
    self.vis_period = vis_period
    self.input_format = input_format
@classmethod
def from_config(cls, cfg):
    """
    Translate a config into the keyword arguments of ``__init__``.

    The backbone, the head and the anchor generator are built here; the head
    and the anchor generator only see the backbone outputs listed in
    ``cfg.MODEL.RETINANET.IN_FEATURES``.
    """
    retina_cfg = cfg.MODEL.RETINANET

    backbone = build_backbone(cfg)
    backbone_shape = backbone.output_shape()
    feature_shapes = [backbone_shape[name] for name in retina_cfg.IN_FEATURES]
    head = RetinaNetHead(cfg, feature_shapes)
    anchor_generator = build_anchor_generator(cfg, feature_shapes)

    box2box_transform = Box2BoxTransform(weights=retina_cfg.BBOX_REG_WEIGHTS)
    anchor_matcher = Matcher(
        retina_cfg.IOU_THRESHOLDS,
        retina_cfg.IOU_LABELS,
        allow_low_quality_matches=True,
    )
    return {
        "backbone": backbone,
        "head": head,
        "anchor_generator": anchor_generator,
        "box2box_transform": box2box_transform,
        "anchor_matcher": anchor_matcher,
        "pixel_mean": cfg.MODEL.PIXEL_MEAN,
        "pixel_std": cfg.MODEL.PIXEL_STD,
        "num_classes": retina_cfg.NUM_CLASSES,
        "head_in_features": retina_cfg.IN_FEATURES,
        # Loss parameters:
        "focal_loss_alpha": retina_cfg.FOCAL_LOSS_ALPHA,
        "focal_loss_gamma": retina_cfg.FOCAL_LOSS_GAMMA,
        "smooth_l1_beta": retina_cfg.SMOOTH_L1_LOSS_BETA,
        "box_reg_loss_type": retina_cfg.BBOX_REG_LOSS_TYPE,
        # Inference parameters:
        "test_score_thresh": retina_cfg.SCORE_THRESH_TEST,
        "test_topk_candidates": retina_cfg.TOPK_CANDIDATES_TEST,
        "test_nms_thresh": retina_cfg.NMS_THRESH_TEST,
        "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
        # Vis parameters
        "vis_period": cfg.VIS_PERIOD,
        "input_format": cfg.INPUT.FORMAT,
    }
def forward_training(self, images, features, predictions, gt_instances):
    """
    Compute the training losses for one batch.

    Args:
        images: unused here; kept for the interface.
        features: feature maps handed to the anchor generator.
        predictions: raw head outputs, re-laid-out by
            ``_transpose_dense_predictions`` into per-level class logits
            (``num_classes`` channels) and box deltas (4 channels).
        gt_instances: per-image ground truth passed to ``label_anchors``.

    Returns:
        Whatever ``self.losses`` returns for the labeled anchors.
    """
    # Transpose the Hi*Wi*A dimension to the middle:
    channels_per_output = [self.num_classes, 4]
    transposed = self._transpose_dense_predictions(predictions, channels_per_output)
    pred_logits, pred_anchor_deltas = transposed

    anchors = self.anchor_generator(features)
    labeled = self.label_anchors(anchors, gt_instances)
    gt_labels, gt_boxes = labeled
    return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)
def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
    """
    Compute the focal classification loss and the box regression loss.

    Args:
        anchors (list[Boxes]): a list of #feature level Boxes
        gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
        pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
            list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
            Where K is the number of classes used in `pred_logits`.

    Returns:
        dict[str, Tensor]:
            mapping from a named loss to a scalar tensor storing the loss.
            Used during training only. The dict keys are: "loss_cls" and "loss_box_reg"
    """
    num_images = len(gt_labels)
    labels = torch.stack(gt_labels)  # (N, R)

    # Label -1 is "ignore"; label == num_classes is background.
    valid_mask = labels >= 0
    pos_mask = valid_mask & (labels != self.num_classes)

    num_pos_anchors = pos_mask.sum().item()
    storage = get_event_storage()
    storage.put_scalar("num_pos_anchors", num_pos_anchors / num_images)
    # Both losses are divided by an EMA-updated count of positive anchors (at least 1).
    normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100)

    # Classification loss: one-hot targets over K + 1 classes, then drop the
    # last column so the background class contributes no positive target.
    one_hot = F.one_hot(labels[valid_mask], num_classes=self.num_classes + 1)
    gt_labels_target = one_hot[:, :-1]
    valid_logits = cat(pred_logits, dim=1)[valid_mask]
    loss_cls = sigmoid_focal_loss_jit(
        valid_logits,
        gt_labels_target.to(pred_logits[0].dtype),
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )

    # Regression loss, restricted to foreground anchors through ``pos_mask``.
    loss_box_reg = _dense_box_regression_loss(
        anchors,
        self.box2box_transform,
        pred_anchor_deltas,
        gt_boxes,
        pos_mask,
        box_reg_loss_type=self.box_reg_loss_type,
        smooth_l1_beta=self.smooth_l1_beta,
    )

    result = {}
    result["loss_cls"] = loss_cls / normalizer
    result["loss_box_reg"] = loss_box_reg / normalizer
    return result
@torch.no_grad()
def label_anchors(self, anchors, gt_instances):
    """
    Assign a class label and a matched ground-truth box to every anchor.

    Args:
        anchors (list[Boxes]): A list of #feature level Boxes.
            The Boxes contains anchors of this image on the specific feature level.
        gt_instances (list[Instances]): a list of N `Instances`s. The i-th
            `Instances` contains the ground-truth per-instance annotations
            for the i-th input image.

    Returns:
        list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is
        the total number of anchors across all feature maps (sum(Hi * Wi * A)).
        Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background.

        list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors
        across feature maps. The values are the matched gt boxes for each anchor.
        Values are undefined for those anchors not labeled as foreground.
    """
    all_anchors = Boxes.cat(anchors)  # Rx4
    background = self.num_classes

    labels_per_image = []
    boxes_per_image = []
    for targets in gt_instances:
        iou_matrix = pairwise_iou(targets.gt_boxes, all_anchors)
        matched_idxs, anchor_labels = self.anchor_matcher(iou_matrix)
        del iou_matrix  # release the IoU matrix before building the outputs

        if len(targets) == 0:
            # No ground truth in this image: every anchor becomes background.
            boxes_i = torch.zeros_like(all_anchors.tensor)
            labels_i = torch.zeros_like(matched_idxs) + background
        else:
            boxes_i = targets.gt_boxes.tensor[matched_idxs]
            labels_i = targets.gt_classes[matched_idxs]
            # Anchors with label 0 are treated as background.
            labels_i[anchor_labels == 0] = background
            # Anchors with label -1 are ignored.
            labels_i[anchor_labels == -1] = -1

        labels_per_image.append(labels_i)
        boxes_per_image.append(boxes_i)
    return labels_per_image, boxes_per_image
def forward_inference(
    self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]]
):
    """
    Run per-image inference on the head predictions of a batch.

    Args:
        images: batch container; only ``image_sizes`` is read here.
        features: feature maps handed to the anchor generator.
        predictions: raw head outputs, re-laid-out by
            ``_transpose_dense_predictions`` into per-level class logits
            and box deltas.

    Returns:
        list[Instances]: one result per image, in the order of ``images.image_sizes``.
    """
    pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
        predictions, [self.num_classes, 4]
    )
    anchors = self.anchor_generator(features)

    detections: List[Instances] = []
    for idx, size in enumerate(images.image_sizes):
        level_scores = []
        for level_logits in pred_logits:
            # In-place sigmoid: the logits are overwritten with scores.
            level_scores.append(level_logits[idx].sigmoid_())
        level_deltas = [level[idx] for level in pred_anchor_deltas]
        detections.append(
            self.inference_single_image(anchors, level_scores, level_deltas, size)
        )
    return detections
def inference_single_image(
    self,
    anchors: List[Boxes],
    box_cls: List[Tensor],
    box_delta: List[Tensor],
    image_size: Tuple[int, int],
):
    """
    Single-image inference. Return bounding-box detection results by thresholding
    on scores and applying non-maximum suppression (NMS).

    Arguments:
        anchors (list[Boxes]): list of #feature levels. Each entry contains
            a Boxes object, which contains all the anchors in that feature level.
        box_cls (list[Tensor]): list of #feature levels. Each entry contains
            tensor of size (H x W x A, K)
        box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
        image_size (tuple(H, W)): a tuple of the image height and width.

    Returns:
        Same as `inference`, but for only one image.
    """
    decoded = self._decode_multi_level_predictions(
        anchors,
        box_cls,
        box_delta,
        self.test_score_thresh,
        self.test_topk_candidates,
        image_size,
    )
    # per-class NMS
    boxes = decoded.pred_boxes.tensor
    kept = batched_nms(boxes, decoded.scores, decoded.pred_classes, self.test_nms_thresh)
    # Cap the number of detections returned for the image.
    top_kept = kept[: self.max_detections_per_image]
    return decoded[top_kept]
class RetinaNetHead(nn.Module):
    """
    Classification and box-regression head of RetinaNet.

    Two sub-networks with the same layout but separate parameters are applied
    to every feature level: one produces per-anchor class logits, the other
    per-anchor box deltas.
    """

    @configurable
    def __init__(
        self,
        *,
        input_shape: List[ShapeSpec],
        num_classes,
        num_anchors,
        conv_dims: List[int],
        norm="",
        prior_prob=0.01,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (List[ShapeSpec]): input shape
            num_classes (int): number of classes. Used to label background proposals.
            num_anchors (int): number of generated anchors
            conv_dims (List[int]): dimensions for each convolution layer
            norm (str or callable):
                Normalization for conv layers except for the two output layers.
                See :func:`detectron2.layers.get_norm` for supported types.
            prior_prob (float): Prior weight for computing bias
        """
        super().__init__()
        self._num_features = len(input_shape)

        if norm in ("BN", "SyncBN"):
            logger.info(
                f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}."
            )
            bn_class = nn.SyncBatchNorm if norm == "SyncBN" else nn.BatchNorm2d

            # One batch-norm per feature level, wrapped in a CycleBatchNormList.
            def norm_layer(c):
                return CycleBatchNormList(
                    length=self._num_features, bn_class=bn_class, num_features=c
                )

        else:
            norm_layer = norm
            norm_name = str(type(get_norm(norm, 32)))
            if "BN" in norm_name:
                logger.warning(
                    f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead."
                )

        # Build both towers layer by layer: conv -> (norm) -> ReLU, the
        # classification layer always being created before its box counterpart.
        cls_layers = []
        bbox_layers = []
        tower_inputs = [input_shape[0].channels, *conv_dims]
        for in_ch, out_ch in zip(tower_inputs, conv_dims):
            for tower in (cls_layers, bbox_layers):
                tower.append(nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1))
                if norm_layer:
                    tower.append(get_norm(norm_layer, out_ch))
                tower.append(nn.ReLU())

        self.cls_subnet = nn.Sequential(*cls_layers)
        self.bbox_subnet = nn.Sequential(*bbox_layers)

        last_dim = conv_dims[-1]
        self.cls_score = nn.Conv2d(
            last_dim, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
        )
        self.bbox_pred = nn.Conv2d(last_dim, num_anchors * 4, kernel_size=3, stride=1, padding=1)

        # Initialization
        conv_layers = (
            layer
            for module in (self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred)
            for layer in module.modules()
            if isinstance(layer, nn.Conv2d)
        )
        for conv in conv_layers:
            torch.nn.init.normal_(conv.weight, mean=0, std=0.01)
            torch.nn.init.constant_(conv.bias, 0)

        # Use prior in model initialization to improve stability
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        torch.nn.init.constant_(self.cls_score.bias, bias_value)

    @classmethod
    def from_config(cls, cfg, input_shape: List[ShapeSpec]):
        """
        Derive the constructor arguments from ``cfg`` and the input shapes.

        The number of anchors per cell is read from a freshly built anchor
        generator and must be identical on every level.
        """
        cell_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
        assert (
            len(set(cell_anchors)) == 1
        ), "Using different number of anchors between levels is not currently supported!"

        retina_cfg = cfg.MODEL.RETINANET
        return {
            "input_shape": input_shape,
            "num_classes": retina_cfg.NUM_CLASSES,
            "conv_dims": [input_shape[0].channels] * retina_cfg.NUM_CONVS,
            "prior_prob": retina_cfg.PRIOR_PROB,
            "norm": retina_cfg.NORM,
            "num_anchors": cell_anchors[0],
        }

    def forward(self, features: List[Tensor]):
        """
        Arguments:
            features (list[Tensor]): FPN feature map tensors in high to low resolution.
                Each tensor in the list correspond to different feature levels.

        Returns:
            logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
                The tensor predicts the classification probability
                at each spatial position for each of the A anchors and K object
                classes.
            bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
                The tensor predicts 4-vector (dx,dy,dw,dh) box
                regression values for every anchor. These values are the
                relative offset between the anchor and the ground truth box.
        """
        assert len(features) == self._num_features
        logits = []
        bbox_reg = []
        for level_feature in features:
            cls_tower_out = self.cls_subnet(level_feature)
            logits.append(self.cls_score(cls_tower_out))
            bbox_tower_out = self.bbox_subnet(level_feature)
            bbox_reg.append(self.bbox_pred(bbox_tower_out))
        return logits, bbox_reg
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
from typing import Callable, Dict, Optional, Tuple, Union
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.structures import ImageList
from detectron2.utils.registry import Registry
from ..backbone import Backbone, build_backbone
from ..postprocessing import sem_seg_postprocess
from .build import META_ARCH_REGISTRY
# Names exported by a star-import of this module.
__all__ = [
    "SemanticSegmentor",
    "SEM_SEG_HEADS_REGISTRY",
    "SemSegFPNHead",
    "build_sem_seg_head",
]
# Heads add themselves with ``@SEM_SEG_HEADS_REGISTRY.register()`` and are
# looked up by name in ``build_sem_seg_head``.
SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
SEM_SEG_HEADS_REGISTRY.__doc__ = """
Registry for semantic segmentation heads, which make semantic segmentation predictions
from feature maps.
"""
@META_ARCH_REGISTRY.register()
class SemanticSegmentor(nn.Module):
    """
    Main class for semantic segmentation architectures.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.sem_seg_head = sem_seg_head
        # Normalization statistics are stored as non-persistent (C, 1, 1) buffers.
        for buffer_name, values in (("pixel_mean", pixel_mean), ("pixel_std", pixel_std)):
            self.register_buffer(
                buffer_name, torch.tensor(values).view(-1, 1, 1), persistent=False
            )

    @classmethod
    def from_config(cls, cfg):
        """Build the backbone and the head from ``cfg`` and return the constructor arguments."""
        backbone = build_backbone(cfg)
        head = build_sem_seg_head(cfg, backbone.output_shape())
        return {
            "backbone": backbone,
            "sem_seg_head": head,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        """Device on which the ``pixel_mean`` buffer lives."""
        return self.pixel_mean.device

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.

                For now, each item in the list is a dict that contains:

                   * "image": Tensor, image in (C, H, W) format.
                   * "sem_seg": semantic segmentation ground truth
                   * Other information that's included in the original dicts, such as:
                     "height", "width" (int): the output resolution of the model (may be different
                     from input resolution), used in inference.

        Returns:
            list[dict]:
              Each dict is the output for one input image.
              The dict contains one key "sem_seg" whose value is a
              Tensor that represents the
              per-pixel segmentation predicted by the head.
              The prediction has shape KxHxW that represents the logits of
              each class for each pixel.

            In training mode the dict of losses produced by the head is
            returned instead.
        """
        device = self.device
        normalized = [
            (item["image"].to(device) - self.pixel_mean) / self.pixel_std
            for item in batched_inputs
        ]
        images = ImageList.from_tensors(
            normalized,
            self.backbone.size_divisibility,
            padding_constraints=self.backbone.padding_constraints,
        )
        features = self.backbone(images.tensor)

        targets = None
        if "sem_seg" in batched_inputs[0]:
            # Ground truth is padded like the images, filling the padding with
            # the head's ignore value.
            gt_maps = [item["sem_seg"].to(device) for item in batched_inputs]
            targets = ImageList.from_tensors(
                gt_maps,
                self.backbone.size_divisibility,
                self.sem_seg_head.ignore_value,
                self.backbone.padding_constraints,
            ).tensor

        results, losses = self.sem_seg_head(features, targets)
        if self.training:
            return losses

        # Post-process each prediction to the requested output resolution,
        # defaulting to the image size recorded in ``images.image_sizes``.
        return [
            {
                "sem_seg": sem_seg_postprocess(
                    result,
                    image_size,
                    per_image.get("height", image_size[0]),
                    per_image.get("width", image_size[1]),
                )
            }
            for result, per_image, image_size in zip(
                results, batched_inputs, images.image_sizes
            )
        ]
def build_sem_seg_head(cfg, input_shape):
    """
    Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.

    The name is looked up in ``SEM_SEG_HEADS_REGISTRY`` and the registered
    object is called with ``(cfg, input_shape)``.
    """
    head_factory = SEM_SEG_HEADS_REGISTRY.get(cfg.MODEL.SEM_SEG_HEAD.NAME)
    return head_factory(cfg, input_shape)
@SEM_SEG_HEADS_REGISTRY.register()
class SemSegFPNHead(nn.Module):
    """
    A semantic segmentation head described in :paper:`PanopticFPN`.
    It takes a list of FPN features as input, and applies a sequence of
    3x3 convs and upsampling to scale all of them to the stride defined by
    ``common_stride``. Then these features are added and used to make final
    predictions by another 1x1 conv layer.
    """

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        num_classes: int,
        conv_dims: int,
        common_stride: int,
        loss_weight: float = 1.0,
        norm: Optional[Union[str, Callable]] = None,
        ignore_value: int = -1,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape: shapes (channels and stride) of the input features
            num_classes: number of classes to predict
            conv_dims: number of output channels for the intermediate conv layers.
            common_stride: the common stride that all features will be upscaled to
            loss_weight: loss weight
            norm (str or callable): normalization for all conv layers
            ignore_value: category id to be ignored during training.
        """
        super().__init__()
        # Process the inputs from the smallest stride to the largest.
        ordered_inputs = sorted(input_shape.items(), key=lambda item: item[1].stride)
        if not ordered_inputs:
            raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!")
        self.in_features = [name for name, _ in ordered_inputs]

        self.ignore_value = ignore_value
        self.common_stride = common_stride
        self.loss_weight = loss_weight

        self.scale_heads = []
        for name, spec in ordered_inputs:
            stride = spec.stride
            needs_upsample = stride != self.common_stride
            # One conv per factor-of-two between this stride and the common
            # stride, and never fewer than one.
            head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride)))

            head_ops = []
            in_channels = spec.channels
            for _ in range(head_length):
                norm_module = get_norm(norm, conv_dims)
                conv = Conv2d(
                    in_channels,
                    conv_dims,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=not norm,
                    norm=norm_module,
                    activation=F.relu,
                )
                weight_init.c2_msra_fill(conv)
                head_ops.append(conv)
                if needs_upsample:
                    head_ops.append(
                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
                    )
                # Every conv after the first consumes ``conv_dims`` channels.
                in_channels = conv_dims

            scale_head = nn.Sequential(*head_ops)
            self.scale_heads.append(scale_head)
            # ``scale_heads`` is a plain list, so each head is also registered
            # as a submodule under its feature name.
            self.add_module(name, scale_head)

        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
        weight_init.c2_msra_fill(self.predictor)

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        """Read the head settings from ``cfg.MODEL.SEM_SEG_HEAD`` and keep only its IN_FEATURES."""
        head_cfg = cfg.MODEL.SEM_SEG_HEAD
        selected = {}
        for name, spec in input_shape.items():
            if name in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES:
                selected[name] = spec
        return {
            "input_shape": selected,
            "ignore_value": head_cfg.IGNORE_VALUE,
            "num_classes": head_cfg.NUM_CLASSES,
            "conv_dims": head_cfg.CONVS_DIM,
            "common_stride": head_cfg.COMMON_STRIDE,
            "norm": head_cfg.NORM,
            "loss_weight": head_cfg.LOSS_WEIGHT,
        }

    def forward(self, features, targets=None):
        """
        Returns:
            In training, returns (None, dict of losses)
            In inference, returns (CxHxW logits, {})
        """
        logits = self.layers(features)
        if self.training:
            return None, self.losses(logits, targets)

        upsampled = F.interpolate(
            logits, scale_factor=self.common_stride, mode="bilinear", align_corners=False
        )
        return upsampled, {}

    def layers(self, features):
        """Run every input feature through its scale head, sum the results and apply the predictor."""
        for level, name in enumerate(self.in_features):
            scaled = self.scale_heads[level](features[name])
            x = scaled if level == 0 else x + scaled
        return self.predictor(x)

    def losses(self, predictions, targets):
        """Upsample the predictions by ``common_stride`` and return the weighted cross-entropy loss."""
        # Cast to float first: https://github.com/pytorch/pytorch/issues/48163
        upsampled = F.interpolate(
            predictions.float(),
            scale_factor=self.common_stride,
            mode="bilinear",
            align_corners=False,
        )
        loss = F.cross_entropy(
            upsampled, targets, reduction="mean", ignore_index=self.ignore_value
        )
        return {"loss_sem_seg": loss * self.loss_weight}
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import numpy as np
from collections import OrderedDict
from collections.abc import Mapping
from typing import Dict, List, Optional, Tuple, Union
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor, nn
from detectron2.layers import ShapeSpec
from detectron2.structures import BitMasks, Boxes, ImageList, Instances
from detectron2.utils.events import get_event_storage
from .backbone import Backbone
# Module-level logger; used below to report mmdet weight initialization.
logger = logging.getLogger(__name__)
def _to_container(cfg):
    """
    Wrap ``cfg`` in an mmcv ``ConfigDict``.

    mmdet will assert the type of dict/list, so an omegaconf ``DictConfig``
    is first converted (with interpolations resolved) to plain containers.
    Any other input is passed to ``ConfigDict`` unchanged.
    """
    plain = OmegaConf.to_container(cfg, resolve=True) if isinstance(cfg, DictConfig) else cfg
    # Imported here so that mmcv is only needed when this function is called.
    from mmcv.utils import ConfigDict

    return ConfigDict(plain)
class MMDetBackbone(Backbone):
    """
    Wrapper of mmdetection backbones to use in detectron2.

    mmdet backbones produce list/tuple of tensors, while detectron2 backbones
    produce a dict of tensors. This class wraps the given backbone to produce
    output in detectron2's convention, so it can be used in place of detectron2
    backbones.
    """

    def __init__(
        self,
        backbone: Union[nn.Module, Mapping],
        neck: Union[nn.Module, Mapping, None] = None,
        *,
        output_shapes: List[ShapeSpec],
        output_names: Optional[List[str]] = None,
    ):
        """
        Args:
            backbone: either a backbone module or a mmdet config dict that defines a
                backbone. The backbone takes a 4D image tensor and returns a
                sequence of tensors.
            neck: either a backbone module or a mmdet config dict that defines a
                neck. The neck takes outputs of backbone and returns a
                sequence of tensors. If None, no neck is used.
            output_shapes: shape for every output of the backbone (or neck, if given).
                stride and channels are often needed.
            output_names: names for every output of the backbone (or neck, if given).
                By default, will use "out0", "out1", ...
        """
        super().__init__()

        # Config mappings are turned into modules through mmdet's builders,
        # which are imported only when needed.
        if isinstance(backbone, Mapping):
            from mmdet.models import build_backbone

            backbone = build_backbone(_to_container(backbone))
        self.backbone = backbone

        if isinstance(neck, Mapping):
            from mmdet.models import build_neck

            neck = build_neck(_to_container(neck))
        self.neck = neck

        # "Neck" weights, if any, are part of neck itself. This is the interface
        # of mmdet so we follow it. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
        logger.info("Initializing mmdet backbone weights...")
        self.backbone.init_weights()
        # train() in mmdet modules is non-trivial, and has to be explicitly
        # called. Reference:
        # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
        self.backbone.train()

        if self.neck is not None:
            logger.info("Initializing mmdet neck weights ...")
            # A sequential neck is initialized one member at a time.
            neck_parts = self.neck if isinstance(self.neck, nn.Sequential) else [self.neck]
            for part in neck_parts:
                part.init_weights()
            self.neck.train()

        self._output_shapes = output_shapes
        self._output_names = output_names or [f"out{i}" for i in range(len(output_shapes))]

    def forward(self, x) -> Dict[str, Tensor]:
        """
        Run the backbone (and the neck, if any) and name the outputs.

        Raises:
            ValueError: if the number of outputs differs from the number of
                ``output_shapes`` given at construction.
        """
        outs = self.backbone(x)
        if self.neck is not None:
            outs = self.neck(outs)
        assert isinstance(
            outs, (list, tuple)
        ), "mmdet backbone should return a list/tuple of tensors!"

        num_outs = len(outs)
        num_shapes = len(self._output_shapes)
        if num_outs != num_shapes:
            raise ValueError(
                "Length of output_shapes does not match outputs from the mmdet backbone: "
                f"{num_outs} != {num_shapes}"
            )
        return dict(zip(self._output_names, outs))

    def output_shape(self) -> Dict[str, ShapeSpec]:
        """Return the configured output shapes keyed by output name."""
        return dict(zip(self._output_names, self._output_shapes))
class MMDetDetector(nn.Module):
    """
    Wrapper of a mmdetection detector model, for detection and instance segmentation.
    Input/output formats of this class follow detectron2's convention, so a
    mmdetection model can be trained and evaluated in detectron2.
    """

    def __init__(
        self,
        detector: Union[nn.Module, Mapping],
        *,
        # Default is 32 regardless of model:
        # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
        size_divisibility=32,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            detector: a mmdet detector, or a mmdet config dict that defines a detector.
            size_divisibility: pad input images to multiple of this number
            pixel_mean: per-channel mean to normalize input image
            pixel_std: per-channel stddev to normalize input image
        """
        super().__init__()
        if isinstance(detector, Mapping):
            from mmdet.models import build_detector

            detector = build_detector(_to_container(detector))
        self.detector = detector
        self.detector.init_weights()
        self.size_divisibility = size_divisibility

        # Normalization statistics are stored as non-persistent (C, 1, 1) buffers.
        for buffer_name, values in (("pixel_mean", pixel_mean), ("pixel_std", pixel_std)):
            self.register_buffer(
                buffer_name, torch.tensor(values).view(-1, 1, 1), persistent=False
            )
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """
        Run the wrapped detector on a batch of per-image dicts.

        Each dict must hold an "image" tensor in (C, H, W) layout. Either all
        or none of the dicts may carry "height"/"width"; when present,
        predictions are rescaled to that size. In training mode each dict also
        needs "instances" with ``gt_boxes`` and ``gt_classes`` (``gt_masks``
        is forwarded when the first image has it).

        Returns:
            In training, the parsed loss dict. Otherwise a list with one
            ``{"instances": Instances}`` dict per image.

        Raises:
            ValueError: if only some of the inputs provide "height".
        """
        device = self.device
        normalized = [
            (item["image"].to(device) - self.pixel_mean) / self.pixel_std
            for item in batched_inputs
        ]
        images = ImageList.from_tensors(
            normalized, size_divisibility=self.size_divisibility
        ).tensor

        has_height = {"height" in item for item in batched_inputs}
        if len(has_height) != 1:
            raise ValueError("Some inputs have original height/width, but some don't!")
        (rescale,) = has_height

        # Every image shares the padded batch size.
        padh, padw = images.shape[-2:]
        metas = []
        output_shapes = []
        for item in batched_inputs:
            c, h, w = item["image"].shape
            if rescale:
                # (w, h, w, h) ratios between the network input and the requested output.
                scale_factor = np.array(
                    [w / item["width"], h / item["height"]] * 2, dtype="float32"
                )
                out_hw = (item["height"], item["width"])
                ori_shape = out_hw + (c,)
            else:
                scale_factor = 1.0
                out_hw = (h, w)
                ori_shape = (h, w, c)
            output_shapes.append(out_hw)
            metas.append(
                {
                    "img_shape": (h, w, c),
                    "ori_shape": ori_shape,
                    "scale_factor": scale_factor,
                    "flip": False,
                    "pad_shape": (padh, padw, c),
                }
            )

        if not self.training:
            raw_results = self.detector.simple_test(images, metas, rescale=rescale)
            return [
                {"instances": _convert_mmdet_result(raw, shape)}
                for raw, shape in zip(raw_results, output_shapes)
            ]

        gt_instances = [item["instances"].to(device) for item in batched_inputs]
        mask_kwargs = {}
        if gt_instances[0].has("gt_masks"):
            from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks

            def convert_mask(m, shape):
                # mmdet mask format
                if isinstance(m, BitMasks):
                    return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
                return mm_PolygonMasks(m.polygons, shape[0], shape[1])

            mask_kwargs["gt_masks"] = [
                convert_mask(inst.gt_masks, inst.image_size) for inst in gt_instances
            ]

        losses_and_metrics = self.detector.forward_train(
            images,
            metas,
            [inst.gt_boxes.tensor for inst in gt_instances],
            [inst.gt_classes for inst in gt_instances],
            **mask_kwargs,
        )
        return _parse_losses(losses_and_metrics)

    @property
    def device(self):
        """Device on which the ``pixel_mean`` buffer lives."""
        return self.pixel_mean.device
# Reference: show_result() in
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
    """
    Convert one image's mmdet test result into an ``Instances`` object.

    Args:
        result: either the per-class box results alone, or a tuple
            ``(bbox_result, segm_result)``. ``bbox_result`` is a sequence
            indexed by class whose entries are stacked into an Nx5 array
            (4 box coordinates followed by the score). If ``segm_result``
            is itself a tuple, only its first element is used.
        shape: image size passed to the ``Instances`` constructor.

    Returns:
        Instances with ``pred_boxes``, ``scores``, ``pred_classes`` and, when
        masks are given and there is at least one detection, ``pred_masks``.
    """
    segm_result = None
    bbox_result = result
    if isinstance(result, tuple):
        bbox_result, segm_result = result
        if isinstance(segm_result, tuple):
            segm_result = segm_result[0]

    stacked = torch.from_numpy(np.vstack(bbox_result))  # Nx5
    # The class id of a detection is the index of the per-class array it came from.
    per_class_labels = []
    for class_id, class_boxes in enumerate(bbox_result):
        per_class_labels.append(
            torch.full((class_boxes.shape[0],), class_id, dtype=torch.int32)
        )
    labels = torch.cat(per_class_labels)

    inst = Instances(shape)
    inst.pred_boxes = Boxes(stacked[:, :4])
    inst.scores = stacked[:, -1]
    inst.pred_classes = labels

    if segm_result is not None and len(labels) > 0:
        # Flatten the per-class mask lists, converting numpy masks to tensors.
        masks = [
            torch.from_numpy(mask) if isinstance(mask, np.ndarray) else mask
            for mask in itertools.chain(*segm_result)
        ]
        inst.pred_masks = torch.stack(masks, dim=0)
    return inst
# reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
log_vars = OrderedDict()
for loss_name, loss_value in losses.items():
if isinstance(loss_value, torch.Tensor):
log_vars[loss_name] = loss_value.mean()
elif isinstance(loss_value, list):
log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
else:
raise TypeError(f"{loss_name} is not a tensor or list of tensors")
if "loss" not in loss_name:
# put metrics to storage; don't return them
storage = get_event_storage()
value = log_vars.pop(loss_name).cpu().item()
storage.put_scalar(loss_name, value)
return log_vars
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment