Commit 54a066bf authored by mashun1's avatar mashun1
Browse files

ootdiffusion

parents
Pipeline #1004 canceled with stages
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Model Zoo API for Detectron2: a collection of functions to create common model architectures and
optionally load pre-trained weights as released in
`MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md>`_.
"""
from .model_zoo import get, get_config_file, get_checkpoint_url
__all__ = ["get_checkpoint_url", "get", "get_config_file"]
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import os
import pkg_resources
import torch
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.modeling import build_model
class _ModelZooUrls(object):
"""
Mapping from names to officially released Detectron2 pre-trained models.
"""
S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
# format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
CONFIG_PATH_TO_URL_SUFFIX = {
# COCO Detection with Faster R-CNN
"COCO-Detection/faster_rcnn_R_50_C4_1x.yaml": "137257644/model_final_721ade.pkl",
"COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml": "137847829/model_final_51d356.pkl",
"COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml": "137257794/model_final_b275ba.pkl",
"COCO-Detection/faster_rcnn_R_50_C4_3x.yaml": "137849393/model_final_f97cb7.pkl",
"COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml": "137849425/model_final_68d202.pkl",
"COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml": "137849458/model_final_280758.pkl",
"COCO-Detection/faster_rcnn_R_101_C4_3x.yaml": "138204752/model_final_298dad.pkl",
"COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml": "138204841/model_final_3e0943.pkl",
"COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml": "137851257/model_final_f6e8b1.pkl",
"COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml": "139173657/model_final_68b088.pkl",
# COCO Detection with RetinaNet
"COCO-Detection/retinanet_R_50_FPN_1x.yaml": "137593951/model_final_b796dc.pkl",
"COCO-Detection/retinanet_R_50_FPN_3x.yaml": "137849486/model_final_4cafe0.pkl",
"COCO-Detection/retinanet_R_101_FPN_3x.yaml": "138363263/model_final_59f53c.pkl",
# COCO Detection with RPN and Fast R-CNN
"COCO-Detection/rpn_R_50_C4_1x.yaml": "137258005/model_final_450694.pkl",
"COCO-Detection/rpn_R_50_FPN_1x.yaml": "137258492/model_final_02ce48.pkl",
"COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml": "137635226/model_final_e5f7ce.pkl",
# COCO Instance Segmentation Baselines with Mask R-CNN
"COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml": "137259246/model_final_9243eb.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml": "137260150/model_final_4f86c3.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "137260431/model_final_a54504.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml": "137849525/model_final_4ce675.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml": "137849551/model_final_84107b.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml": "137849600/model_final_f10217.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml": "138363239/model_final_a2914c.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml": "138363294/model_final_0464b7.pkl",
"COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml": "138205316/model_final_a3ec72.pkl",
"COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml": "139653917/model_final_2d9806.pkl", # noqa
# COCO Person Keypoint Detection Baselines with Keypoint R-CNN
"COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml": "137261548/model_final_04e291.pkl",
"COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml": "137849621/model_final_a6e10b.pkl",
"COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml": "138363331/model_final_997cc7.pkl",
"COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml": "139686956/model_final_5ad38f.pkl",
# COCO Panoptic Segmentation Baselines with Panoptic FPN
"COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml": "139514544/model_final_dbfeb4.pkl",
"COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml": "139514569/model_final_c10459.pkl",
"COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml": "139514519/model_final_cafdb1.pkl",
# LVIS Instance Segmentation Baselines with Mask R-CNN
"LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "144219072/model_final_571f7c.pkl",
"LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml": "144219035/model_final_824ab5.pkl",
"LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml": "144219108/model_final_5e3439.pkl", # noqa
# Cityscapes & Pascal VOC Baselines
"Cityscapes/mask_rcnn_R_50_FPN.yaml": "142423278/model_final_af9cf5.pkl",
"PascalVOC-Detection/faster_rcnn_R_50_C4.yaml": "142202221/model_final_b1acc2.pkl",
# Other Settings
"Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml": "138602867/model_final_65c703.pkl",
"Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml": "144998336/model_final_821d0b.pkl",
"Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml": "138602847/model_final_e9d89b.pkl",
"Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml": "144998488/model_final_480dd8.pkl",
"Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml": "169527823/model_final_3b3c51.pkl",
"Misc/mask_rcnn_R_50_FPN_3x_gn.yaml": "138602888/model_final_dc5d9e.pkl",
"Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml": "138602908/model_final_01ca85.pkl",
"Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml": "139797668/model_final_be35db.pkl",
"Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml": "18131413/model_0039999_e76410.pkl", # noqa
# D1 Comparisons
"Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml": "137781054/model_final_7ab50c.pkl", # noqa
"Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml": "137781281/model_final_62ca52.pkl", # noqa
"Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml": "137781195/model_final_cce136.pkl",
}
def get_checkpoint_url(config_path):
"""
Returns the URL to the model trained using the given config
Args:
config_path (str): config file name relative to detectron2's "configs/"
directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
Returns:
str: a URL to the model
"""
name = config_path.replace(".yaml", "")
if config_path in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX:
suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[config_path]
return _ModelZooUrls.S3_PREFIX + name + "/" + suffix
raise RuntimeError("{} not available in Model Zoo!".format(name))
def get_config_file(config_path):
"""
Returns path to a builtin config file.
Args:
config_path (str): config file name relative to detectron2's "configs/"
directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
Returns:
str: the real path to the config file.
"""
cfg_file = pkg_resources.resource_filename(
"detectron2.model_zoo", os.path.join("configs", config_path)
)
if not os.path.exists(cfg_file):
raise RuntimeError("{} not available in Model Zoo!".format(config_path))
return cfg_file
def get(config_path, trained: bool = False):
"""
Get a model specified by relative path under Detectron2's official ``configs/`` directory.
Args:
config_path (str): config file name relative to detectron2's "configs/"
directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
trained (bool): If True, will initialize the model with the trained model zoo weights.
If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
instead; this will typically (though not always) initialize a subset of weights using
an ImageNet pre-trained model, while randomly initializing the other weights.
Example:
.. code-block:: python
from detectron2 import model_zoo
model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
"""
cfg_file = get_config_file(config_path)
cfg = get_cfg()
cfg.merge_from_file(cfg_file)
if trained:
cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
if not torch.cuda.is_available():
cfg.MODEL.DEVICE = "cpu"
model = build_model(cfg)
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
return model
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from detectron2.layers import ShapeSpec
from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY
from .backbone import (
BACKBONE_REGISTRY,
FPN,
Backbone,
ResNet,
ResNetBlockBase,
build_backbone,
build_resnet_backbone,
make_stage,
)
from .meta_arch import (
META_ARCH_REGISTRY,
SEM_SEG_HEADS_REGISTRY,
GeneralizedRCNN,
PanopticFPN,
ProposalNetwork,
RetinaNet,
SemanticSegmentor,
build_model,
build_sem_seg_head,
)
from .postprocessing import detector_postprocess
from .proposal_generator import (
PROPOSAL_GENERATOR_REGISTRY,
build_proposal_generator,
RPN_HEAD_REGISTRY,
build_rpn_head,
)
from .roi_heads import (
ROI_BOX_HEAD_REGISTRY,
ROI_HEADS_REGISTRY,
ROI_KEYPOINT_HEAD_REGISTRY,
ROI_MASK_HEAD_REGISTRY,
ROIHeads,
StandardROIHeads,
BaseMaskRCNNHead,
BaseKeypointRCNNHead,
build_box_head,
build_keypoint_head,
build_mask_head,
build_roi_heads,
)
from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
_EXCLUDE = {"torch", "ShapeSpec"}
__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
assert (
torch.Tensor([1]) == torch.Tensor([2])
).dtype == torch.bool, "Your Pytorch is too old. Please update to contain https://github.com/pytorch/pytorch/pull/21113"
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from typing import List
import torch
from torch import nn
from detectron2.config import configurable
from detectron2.layers import ShapeSpec
from detectron2.structures import Boxes, RotatedBoxes
from detectron2.utils.registry import Registry
ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR")
ANCHOR_GENERATOR_REGISTRY.__doc__ = """
Registry for modules that creates object detection anchors for feature maps.
The registered object will be called with `obj(cfg, input_shape)`.
"""
class BufferList(nn.Module):
"""
Similar to nn.ParameterList, but for buffers
"""
def __init__(self, buffers=None):
super(BufferList, self).__init__()
if buffers is not None:
self.extend(buffers)
def extend(self, buffers):
offset = len(self)
for i, buffer in enumerate(buffers):
self.register_buffer(str(offset + i), buffer)
return self
def __len__(self):
return len(self._buffers)
def __iter__(self):
return iter(self._buffers.values())
def _create_grid_offsets(size: List[int], stride: int, offset: float, device: torch.device):
grid_height, grid_width = size
shifts_x = torch.arange(
offset * stride, grid_width * stride, step=stride, dtype=torch.float32, device=device
)
shifts_y = torch.arange(
offset * stride, grid_height * stride, step=stride, dtype=torch.float32, device=device
)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shift_x = shift_x.reshape(-1)
shift_y = shift_y.reshape(-1)
return shift_x, shift_y
def _broadcast_params(params, num_features, name):
"""
If one size (or aspect ratio) is specified and there are multiple feature
maps, we "broadcast" anchors of that single size (or aspect ratio)
over all feature maps.
If params is list[float], or list[list[float]] with len(params) == 1, repeat
it num_features time.
Returns:
list[list[float]]: param for each feature
"""
assert isinstance(
params, (list, tuple)
), f"{name} in anchor generator has to be a list! Got {params}."
assert len(params), f"{name} in anchor generator cannot be empty!"
if not isinstance(params[0], (list, tuple)): # list[float]
return [params] * num_features
if len(params) == 1:
return list(params) * num_features
assert len(params) == num_features, (
f"Got {name} of length {len(params)} in anchor generator, "
f"but the number of input features is {num_features}!"
)
return params
@ANCHOR_GENERATOR_REGISTRY.register()
class DefaultAnchorGenerator(nn.Module):
"""
Compute anchors in the standard ways described in
"Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks".
"""
box_dim: int = 4
"""
the dimension of each anchor box.
"""
@configurable
def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5):
"""
This interface is experimental.
Args:
sizes (list[list[float]] or list[float]):
If sizes is list[list[float]], sizes[i] is the list of anchor sizes
(i.e. sqrt of anchor area) to use for the i-th feature map.
If sizes is list[float], the sizes are used for all feature maps.
Anchor sizes are given in absolute lengths in units of
the input image; they do not dynamically scale if the input image size changes.
aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
(i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
strides (list[int]): stride of each input feature.
offset (float): Relative offset between the center of the first anchor and the top-left
corner of the image. Value has to be in [0, 1).
Recommend to use 0.5, which means half stride.
"""
super().__init__()
self.strides = strides
self.num_features = len(self.strides)
sizes = _broadcast_params(sizes, self.num_features, "sizes")
aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios)
self.offset = offset
assert 0.0 <= self.offset < 1.0, self.offset
@classmethod
def from_config(cls, cfg, input_shape: List[ShapeSpec]):
return {
"sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
"aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
"strides": [x.stride for x in input_shape],
"offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
}
def _calculate_anchors(self, sizes, aspect_ratios):
cell_anchors = [
self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios)
]
return BufferList(cell_anchors)
@property
def num_cell_anchors(self):
"""
Alias of `num_anchors`.
"""
return self.num_anchors
@property
def num_anchors(self):
"""
Returns:
list[int]: Each int is the number of anchors at every pixel
location, on that feature map.
For example, if at every pixel we use anchors of 3 aspect
ratios and 5 sizes, the number of anchors is 15.
(See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config)
In standard RPN models, `num_anchors` on every feature map is the same.
"""
return [len(cell_anchors) for cell_anchors in self.cell_anchors]
def _grid_anchors(self, grid_sizes: List[List[int]]):
"""
Returns:
list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4
"""
anchors = []
for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors.device)
shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))
return anchors
def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
"""
Generate a tensor storing canonical anchor boxes, which are all anchor
boxes of different sizes and aspect_ratios centered at (0, 0).
We can later build the set of anchors for a full feature map by
shifting and tiling these tensors (see `meth:_grid_anchors`).
Args:
sizes (tuple[float]):
aspect_ratios (tuple[float]]):
Returns:
Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes
in XYXY format.
"""
# This is different from the anchor generator defined in the original Faster R-CNN
# code or Detectron. They yield the same AP, however the old version defines cell
# anchors in a less natural way with a shift relative to the feature grid and
# quantization that results in slightly different sizes for different aspect ratios.
# See also https://github.com/facebookresearch/Detectron/issues/227
anchors = []
for size in sizes:
area = size ** 2.0
for aspect_ratio in aspect_ratios:
# s * s = w * h
# a = h / w
# ... some algebra ...
# w = sqrt(s * s / a)
# h = a * w
w = math.sqrt(area / aspect_ratio)
h = aspect_ratio * w
x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
anchors.append([x0, y0, x1, y1])
return torch.tensor(anchors)
def forward(self, features):
"""
Args:
features (list[Tensor]): list of backbone feature maps on which to generate anchors.
Returns:
list[Boxes]: a list of Boxes containing all the anchors for each feature map
(i.e. the cell anchors repeated over all locations in the feature map).
The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
where Hi, Wi are resolution of the feature map divided by anchor stride.
"""
grid_sizes = [feature_map.shape[-2:] for feature_map in features]
anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
return [Boxes(x) for x in anchors_over_all_feature_maps]
@ANCHOR_GENERATOR_REGISTRY.register()
class RotatedAnchorGenerator(nn.Module):
"""
Compute rotated anchors used by Rotated RPN (RRPN), described in
"Arbitrary-Oriented Scene Text Detection via Rotation Proposals".
"""
box_dim: int = 5
"""
the dimension of each anchor box.
"""
@configurable
def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5):
"""
This interface is experimental.
Args:
sizes (list[list[float]] or list[float]):
If sizes is list[list[float]], sizes[i] is the list of anchor sizes
(i.e. sqrt of anchor area) to use for the i-th feature map.
If sizes is list[float], the sizes are used for all feature maps.
Anchor sizes are given in absolute lengths in units of
the input image; they do not dynamically scale if the input image size changes.
aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
(i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
strides (list[int]): stride of each input feature.
angles (list[list[float]] or list[float]): list of angles (in degrees CCW)
to use for anchors. Same "broadcast" rule for `sizes` applies.
offset (float): Relative offset between the center of the first anchor and the top-left
corner of the image. Value has to be in [0, 1).
Recommend to use 0.5, which means half stride.
"""
super().__init__()
self.strides = strides
self.num_features = len(self.strides)
sizes = _broadcast_params(sizes, self.num_features, "sizes")
aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
angles = _broadcast_params(angles, self.num_features, "angles")
self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles)
self.offset = offset
assert 0.0 <= self.offset < 1.0, self.offset
@classmethod
def from_config(cls, cfg, input_shape: List[ShapeSpec]):
return {
"sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
"aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
"strides": [x.stride for x in input_shape],
"offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
"angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES,
}
def _calculate_anchors(self, sizes, aspect_ratios, angles):
cell_anchors = [
self.generate_cell_anchors(size, aspect_ratio, angle).float()
for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles)
]
return BufferList(cell_anchors)
@property
def num_cell_anchors(self):
"""
Alias of `num_anchors`.
"""
return self.num_anchors
@property
def num_anchors(self):
"""
Returns:
list[int]: Each int is the number of anchors at every pixel
location, on that feature map.
For example, if at every pixel we use anchors of 3 aspect
ratios, 2 sizes and 5 angles, the number of anchors is 30.
(See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS
and ANCHOR_GENERATOR.ANGLES in config)
In standard RRPN models, `num_anchors` on every feature map is the same.
"""
return [len(cell_anchors) for cell_anchors in self.cell_anchors]
def _grid_anchors(self, grid_sizes):
anchors = []
for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors.device)
zeros = torch.zeros_like(shift_x)
shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1)
anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5))
return anchors
def generate_cell_anchors(
self,
sizes=(32, 64, 128, 256, 512),
aspect_ratios=(0.5, 1, 2),
angles=(-90, -60, -30, 0, 30, 60, 90),
):
"""
Generate a tensor storing canonical anchor boxes, which are all anchor
boxes of different sizes, aspect_ratios, angles centered at (0, 0).
We can later build the set of anchors for a full feature map by
shifting and tiling these tensors (see `meth:_grid_anchors`).
Args:
sizes (tuple[float]):
aspect_ratios (tuple[float]]):
angles (tuple[float]]):
Returns:
Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5)
storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format.
"""
anchors = []
for size in sizes:
area = size ** 2.0
for aspect_ratio in aspect_ratios:
# s * s = w * h
# a = h / w
# ... some algebra ...
# w = sqrt(s * s / a)
# h = a * w
w = math.sqrt(area / aspect_ratio)
h = aspect_ratio * w
anchors.extend([0, 0, w, h, a] for a in angles)
return torch.tensor(anchors)
def forward(self, features):
"""
Args:
features (list[Tensor]): list of backbone feature maps on which to generate anchors.
Returns:
list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map
(i.e. the cell anchors repeated over all locations in the feature map).
The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
where Hi, Wi are resolution of the feature map divided by anchor stride.
"""
grid_sizes = [feature_map.shape[-2:] for feature_map in features]
anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
return [RotatedBoxes(x) for x in anchors_over_all_feature_maps]
def build_anchor_generator(cfg, input_shape):
"""
Built an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`.
"""
anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME
return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip
from .backbone import Backbone
from .fpn import FPN
from .resnet import ResNet, ResNetBlockBase, build_resnet_backbone, make_stage
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# TODO can expose more resnet blocks after careful consideration
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from abc import ABCMeta, abstractmethod
import torch.nn as nn
from detectron2.layers import ShapeSpec
__all__ = ["Backbone"]
class Backbone(nn.Module, metaclass=ABCMeta):
"""
Abstract base class for network backbones.
"""
def __init__(self):
"""
The `__init__` method of any subclass can specify its own set of arguments.
"""
super().__init__()
@abstractmethod
def forward(self):
"""
Subclasses must override this method, but adhere to the same return type.
Returns:
dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
"""
pass
@property
def size_divisibility(self):
"""
Some backbones require the input height and width to be divisible by a
specific integer. This is typically true for encoder / decoder type networks
with lateral connection (e.g., FPN) for which feature maps need to match
dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
input size divisibility is required.
"""
return 0
def output_shape(self):
"""
Returns:
dict[str->ShapeSpec]
"""
# this is a backward-compatible default
return {
name: ShapeSpec(
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
)
for name in self._out_features
}
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.layers import ShapeSpec
from detectron2.utils.registry import Registry
from .backbone import Backbone
BACKBONE_REGISTRY = Registry("BACKBONE")
BACKBONE_REGISTRY.__doc__ = """
Registry for backbones, which extract feature maps from images
The registered object must be a callable that accepts two arguments:
1. A :class:`detectron2.config.CfgNode`
2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification.
It must returns an instance of :class:`Backbone`.
"""
def build_backbone(cfg, input_shape=None):
"""
Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
Returns:
an instance of :class:`Backbone`
"""
if input_shape is None:
input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
backbone_name = cfg.MODEL.BACKBONE.NAME
backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape)
assert isinstance(backbone, Backbone)
return backbone
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import fvcore.nn.weight_init as weight_init
import torch.nn.functional as F
from torch import nn
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from .backbone import Backbone
from .build import BACKBONE_REGISTRY
from .resnet import build_resnet_backbone
__all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"]
class FPN(Backbone):
"""
This module implements :paper:`FPN`.
It creates pyramid features built on top of some input feature maps.
"""
def __init__(
self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"
):
"""
Args:
bottom_up (Backbone): module representing the bottom up subnetwork.
Must be a subclass of :class:`Backbone`. The multi-scale feature
maps generated by the bottom up network, and listed in `in_features`,
are used to generate FPN levels.
in_features (list[str]): names of the input feature maps coming
from the backbone to which FPN is attached. For example, if the
backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
of these may be used; order must be from high to low resolution.
out_channels (int): number of channels in the output feature maps.
norm (str): the normalization to use.
top_block (nn.Module or None): if provided, an extra operation will
be performed on the output of the last (smallest resolution)
FPN output, and the result will extend the result list. The top_block
further downsamples the feature map. It must have an attribute
"num_levels", meaning the number of extra FPN levels added by
this block, and "in_feature", which is a string representing
its input feature (e.g., p5).
fuse_type (str): types for fusing the top down features and the lateral
ones. It can be "sum" (default), which sums up element-wise; or "avg",
which takes the element-wise mean of the two.
"""
super(FPN, self).__init__()
assert isinstance(bottom_up, Backbone)
# Feature map strides and channels from the bottom up network (e.g. ResNet)
input_shapes = bottom_up.output_shape()
in_strides = [input_shapes[f].stride for f in in_features]
in_channels = [input_shapes[f].channels for f in in_features]
_assert_strides_are_log2_contiguous(in_strides)
lateral_convs = []
output_convs = []
use_bias = norm == ""
for idx, in_channels in enumerate(in_channels):
lateral_norm = get_norm(norm, out_channels)
output_norm = get_norm(norm, out_channels)
lateral_conv = Conv2d(
in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
)
output_conv = Conv2d(
out_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
bias=use_bias,
norm=output_norm,
)
weight_init.c2_xavier_fill(lateral_conv)
weight_init.c2_xavier_fill(output_conv)
stage = int(math.log2(in_strides[idx]))
self.add_module("fpn_lateral{}".format(stage), lateral_conv)
self.add_module("fpn_output{}".format(stage), output_conv)
lateral_convs.append(lateral_conv)
output_convs.append(output_conv)
# Place convs into top-down order (from low to high resolution)
# to make the top-down computation in forward clearer.
self.lateral_convs = lateral_convs[::-1]
self.output_convs = output_convs[::-1]
self.top_block = top_block
self.in_features = in_features
self.bottom_up = bottom_up
# Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in in_strides}
# top block output feature maps.
if self.top_block is not None:
for s in range(stage, stage + self.top_block.num_levels):
self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
self._out_features = list(self._out_feature_strides.keys())
self._out_feature_channels = {k: out_channels for k in self._out_features}
self._size_divisibility = in_strides[-1]
assert fuse_type in {"avg", "sum"}
self._fuse_type = fuse_type
@property
def size_divisibility(self):
return self._size_divisibility
def forward(self, x):
"""
Args:
input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to
feature map tensor for each feature level in high to low resolution order.
Returns:
dict[str->Tensor]:
mapping from feature map name to FPN feature map tensor
in high to low resolution order. Returned feature names follow the FPN
paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
["p2", "p3", ..., "p6"].
"""
# Reverse feature maps into top-down order (from low to high resolution)
bottom_up_features = self.bottom_up(x)
x = [bottom_up_features[f] for f in self.in_features[::-1]]
results = []
prev_features = self.lateral_convs[0](x[0])
results.append(self.output_convs[0](prev_features))
for features, lateral_conv, output_conv in zip(
x[1:], self.lateral_convs[1:], self.output_convs[1:]
):
top_down_features = F.interpolate(prev_features, scale_factor=2, mode="nearest")
lateral_features = lateral_conv(features)
prev_features = lateral_features + top_down_features
if self._fuse_type == "avg":
prev_features /= 2
results.insert(0, output_conv(prev_features))
if self.top_block is not None:
top_block_in_feature = bottom_up_features.get(self.top_block.in_feature, None)
if top_block_in_feature is None:
top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
results.extend(self.top_block(top_block_in_feature))
assert len(self._out_features) == len(results)
return dict(zip(self._out_features, results))
def output_shape(self):
return {
name: ShapeSpec(
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
)
for name in self._out_features
}
def _assert_strides_are_log2_contiguous(strides):
"""
Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2".
"""
for i, stride in enumerate(strides[1:], 1):
assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(
stride, strides[i - 1]
)
class LastLevelMaxPool(nn.Module):
"""
This module is used in the original FPN to generate a downsampled
P6 feature from P5.
"""
def __init__(self):
super().__init__()
self.num_levels = 1
self.in_feature = "p5"
def forward(self, x):
return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
class LastLevelP6P7(nn.Module):
"""
This module is used in RetinaNet to generate extra layers, P6 and P7 from
C5 feature.
"""
def __init__(self, in_channels, out_channels, in_feature="res5"):
super().__init__()
self.num_levels = 2
self.in_feature = in_feature
self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
for module in [self.p6, self.p7]:
weight_init.c2_xavier_fill(module)
def forward(self, c5):
p6 = self.p6(c5)
p7 = self.p7(F.relu(p6))
return [p6, p7]
@BACKBONE_REGISTRY.register()
def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
"""
Args:
cfg: a detectron2 CfgNode
Returns:
backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
"""
bottom_up = build_resnet_backbone(cfg, input_shape)
in_features = cfg.MODEL.FPN.IN_FEATURES
out_channels = cfg.MODEL.FPN.OUT_CHANNELS
backbone = FPN(
bottom_up=bottom_up,
in_features=in_features,
out_channels=out_channels,
norm=cfg.MODEL.FPN.NORM,
top_block=LastLevelMaxPool(),
fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
)
return backbone
@BACKBONE_REGISTRY.register()
def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
"""
Args:
cfg: a detectron2 CfgNode
Returns:
backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
"""
bottom_up = build_resnet_backbone(cfg, input_shape)
in_features = cfg.MODEL.FPN.IN_FEATURES
out_channels = cfg.MODEL.FPN.OUT_CHANNELS
in_channels_p6p7 = bottom_up.output_shape()["res5"].channels
backbone = FPN(
bottom_up=bottom_up,
in_features=in_features,
out_channels=out_channels,
norm=cfg.MODEL.FPN.NORM,
top_block=LastLevelP6P7(in_channels_p6p7, out_channels),
fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
)
return backbone
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.layers import (
CNNBlockBase,
Conv2d,
DeformConv,
ModulatedDeformConv,
ShapeSpec,
get_norm,
)
from .backbone import Backbone
from .build import BACKBONE_REGISTRY
__all__ = [
"ResNetBlockBase",
"BasicBlock",
"BottleneckBlock",
"DeformBottleneckBlock",
"BasicStem",
"ResNet",
"make_stage",
"build_resnet_backbone",
]
ResNetBlockBase = CNNBlockBase
"""
Alias for backward compatibiltiy.
"""
class BasicBlock(CNNBlockBase):
"""
The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
with two 3x3 conv layers and a projection shortcut if needed.
"""
def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
"""
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
stride (int): Stride for the first conv.
norm (str or callable): normalization for all conv layers.
See :func:`layers.get_norm` for supported format.
"""
super().__init__(in_channels, out_channels, stride)
if in_channels != out_channels:
self.shortcut = Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=stride,
bias=False,
norm=get_norm(norm, out_channels),
)
else:
self.shortcut = None
self.conv1 = Conv2d(
in_channels,
out_channels,
kernel_size=3,
stride=stride,
padding=1,
bias=False,
norm=get_norm(norm, out_channels),
)
self.conv2 = Conv2d(
out_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False,
norm=get_norm(norm, out_channels),
)
for layer in [self.conv1, self.conv2, self.shortcut]:
if layer is not None: # shortcut can be None
weight_init.c2_msra_fill(layer)
def forward(self, x):
out = self.conv1(x)
out = F.relu_(out)
out = self.conv2(out)
if self.shortcut is not None:
shortcut = self.shortcut(x)
else:
shortcut = x
out += shortcut
out = F.relu_(out)
return out
class BottleneckBlock(CNNBlockBase):
"""
The standard bottleneck residual block used by ResNet-50, 101 and 152
defined in :paper:`ResNet`. It contains 3 conv layers with kernels
1x1, 3x3, 1x1, and a projection shortcut if needed.
"""
def __init__(
self,
in_channels,
out_channels,
*,
bottleneck_channels,
stride=1,
num_groups=1,
norm="BN",
stride_in_1x1=False,
dilation=1,
):
"""
Args:
bottleneck_channels (int): number of output channels for the 3x3
"bottleneck" conv layers.
num_groups (int): number of groups for the 3x3 conv layer.
norm (str or callable): normalization for all conv layers.
See :func:`layers.get_norm` for supported format.
stride_in_1x1 (bool): when stride>1, whether to put stride in the
first 1x1 convolution or the bottleneck 3x3 convolution.
dilation (int): the dilation rate of the 3x3 conv layer.
"""
super().__init__(in_channels, out_channels, stride)
if in_channels != out_channels:
self.shortcut = Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=stride,
bias=False,
norm=get_norm(norm, out_channels),
)
else:
self.shortcut = None
# The original MSRA ResNet models have stride in the first 1x1 conv
# The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
# stride in the 3x3 conv
stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
self.conv1 = Conv2d(
in_channels,
bottleneck_channels,
kernel_size=1,
stride=stride_1x1,
bias=False,
norm=get_norm(norm, bottleneck_channels),
)
self.conv2 = Conv2d(
bottleneck_channels,
bottleneck_channels,
kernel_size=3,
stride=stride_3x3,
padding=1 * dilation,
bias=False,
groups=num_groups,
dilation=dilation,
norm=get_norm(norm, bottleneck_channels),
)
self.conv3 = Conv2d(
bottleneck_channels,
out_channels,
kernel_size=1,
bias=False,
norm=get_norm(norm, out_channels),
)
for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
if layer is not None: # shortcut can be None
weight_init.c2_msra_fill(layer)
# Zero-initialize the last normalization in each residual branch,
# so that at the beginning, the residual branch starts with zeros,
# and each residual block behaves like an identity.
# See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
# "For BN layers, the learnable scaling coefficient γ is initialized
# to be 1, except for each residual block's last BN
# where γ is initialized to be 0."
# nn.init.constant_(self.conv3.norm.weight, 0)
# TODO this somehow hurts performance when training GN models from scratch.
# Add it as an option when we need to use this code to train a backbone.
def forward(self, x):
out = self.conv1(x)
out = F.relu_(out)
out = self.conv2(out)
out = F.relu_(out)
out = self.conv3(out)
if self.shortcut is not None:
shortcut = self.shortcut(x)
else:
shortcut = x
out += shortcut
out = F.relu_(out)
return out
class DeformBottleneckBlock(ResNetBlockBase):
"""
Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
in the 3x3 convolution.
"""
def __init__(
self,
in_channels,
out_channels,
*,
bottleneck_channels,
stride=1,
num_groups=1,
norm="BN",
stride_in_1x1=False,
dilation=1,
deform_modulated=False,
deform_num_groups=1,
):
super().__init__(in_channels, out_channels, stride)
self.deform_modulated = deform_modulated
if in_channels != out_channels:
self.shortcut = Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=stride,
bias=False,
norm=get_norm(norm, out_channels),
)
else:
self.shortcut = None
stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
self.conv1 = Conv2d(
in_channels,
bottleneck_channels,
kernel_size=1,
stride=stride_1x1,
bias=False,
norm=get_norm(norm, bottleneck_channels),
)
if deform_modulated:
deform_conv_op = ModulatedDeformConv
# offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
offset_channels = 27
else:
deform_conv_op = DeformConv
offset_channels = 18
self.conv2_offset = Conv2d(
bottleneck_channels,
offset_channels * deform_num_groups,
kernel_size=3,
stride=stride_3x3,
padding=1 * dilation,
dilation=dilation,
)
self.conv2 = deform_conv_op(
bottleneck_channels,
bottleneck_channels,
kernel_size=3,
stride=stride_3x3,
padding=1 * dilation,
bias=False,
groups=num_groups,
dilation=dilation,
deformable_groups=deform_num_groups,
norm=get_norm(norm, bottleneck_channels),
)
self.conv3 = Conv2d(
bottleneck_channels,
out_channels,
kernel_size=1,
bias=False,
norm=get_norm(norm, out_channels),
)
for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
if layer is not None: # shortcut can be None
weight_init.c2_msra_fill(layer)
nn.init.constant_(self.conv2_offset.weight, 0)
nn.init.constant_(self.conv2_offset.bias, 0)
def forward(self, x):
out = self.conv1(x)
out = F.relu_(out)
if self.deform_modulated:
offset_mask = self.conv2_offset(out)
offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
offset = torch.cat((offset_x, offset_y), dim=1)
mask = mask.sigmoid()
out = self.conv2(out, offset, mask)
else:
offset = self.conv2_offset(out)
out = self.conv2(out, offset)
out = F.relu_(out)
out = self.conv3(out)
if self.shortcut is not None:
shortcut = self.shortcut(x)
else:
shortcut = x
out += shortcut
out = F.relu_(out)
return out
def make_stage(block_class, num_blocks, first_stride, *, in_channels, out_channels, **kwargs):
"""
Create a list of blocks just like those in a ResNet stage.
Args:
block_class (type): a subclass of ResNetBlockBase
num_blocks (int):
first_stride (int): the stride of the first block. The other blocks will have stride=1.
in_channels (int): input channels of the entire stage.
out_channels (int): output channels of **every block** in the stage.
kwargs: other arguments passed to the constructor of every block.
Returns:
list[nn.Module]: a list of block module.
"""
assert "stride" not in kwargs, "Stride of blocks in make_stage cannot be changed."
blocks = []
for i in range(num_blocks):
blocks.append(
block_class(
in_channels=in_channels,
out_channels=out_channels,
stride=first_stride if i == 0 else 1,
**kwargs,
)
)
in_channels = out_channels
return blocks
class BasicStem(CNNBlockBase):
"""
The standard ResNet stem (layers before the first residual block).
"""
def __init__(self, in_channels=3, out_channels=64, norm="BN"):
"""
Args:
norm (str or callable): norm after the first conv layer.
See :func:`layers.get_norm` for supported format.
"""
super().__init__(in_channels, out_channels, 4)
self.in_channels = in_channels
self.conv1 = Conv2d(
in_channels,
out_channels,
kernel_size=7,
stride=2,
padding=3,
bias=False,
norm=get_norm(norm, out_channels),
)
weight_init.c2_msra_fill(self.conv1)
def forward(self, x):
x = self.conv1(x)
x = F.relu_(x)
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
return x
class ResNet(Backbone):
"""
Implement :paper:`ResNet`.
"""
def __init__(self, stem, stages, num_classes=None, out_features=None):
"""
Args:
stem (nn.Module): a stem module
stages (list[list[CNNBlockBase]]): several (typically 4) stages,
each contains multiple :class:`CNNBlockBase`.
num_classes (None or int): if None, will not perform classification.
Otherwise, will create a linear layer.
out_features (list[str]): name of the layers whose outputs should
be returned in forward. Can be anything in "stem", "linear", or "res2" ...
If None, will return the output of the last layer.
"""
super(ResNet, self).__init__()
self.stem = stem
self.num_classes = num_classes
current_stride = self.stem.stride
self._out_feature_strides = {"stem": current_stride}
self._out_feature_channels = {"stem": self.stem.out_channels}
self.stages_and_names = []
for i, blocks in enumerate(stages):
assert len(blocks) > 0, len(blocks)
for block in blocks:
assert isinstance(block, CNNBlockBase), block
name = "res" + str(i + 2)
stage = nn.Sequential(*blocks)
self.add_module(name, stage)
self.stages_and_names.append((stage, name))
self._out_feature_strides[name] = current_stride = int(
current_stride * np.prod([k.stride for k in blocks])
)
self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
if num_classes is not None:
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.linear = nn.Linear(curr_channels, num_classes)
# Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
# "The 1000-way fully-connected layer is initialized by
# drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
nn.init.normal_(self.linear.weight, std=0.01)
name = "linear"
if out_features is None:
out_features = [name]
self._out_features = out_features
assert len(self._out_features)
children = [x[0] for x in self.named_children()]
for out_feature in self._out_features:
assert out_feature in children, "Available children: {}".format(", ".join(children))
def forward(self, x):
outputs = {}
x = self.stem(x)
if "stem" in self._out_features:
outputs["stem"] = x
for stage, name in self.stages_and_names:
x = stage(x)
if name in self._out_features:
outputs[name] = x
if self.num_classes is not None:
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.linear(x)
if "linear" in self._out_features:
outputs["linear"] = x
return outputs
def output_shape(self):
return {
name: ShapeSpec(
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
)
for name in self._out_features
}
def freeze(self, freeze_at=0):
"""
Freeze the first several stages of the ResNet. Commonly used in
fine-tuning.
Layers that produce the same feature map spatial size are defined as one
"stage" by :paper:`FPN`.
Args:
freeze_at (int): number of stages to freeze.
`1` means freezing the stem. `2` means freezing the stem and
one residual stage, etc.
Returns:
nn.Module: this ResNet itself
"""
if freeze_at >= 1:
self.stem.freeze()
for idx, (stage, _) in enumerate(self.stages_and_names, start=2):
if freeze_at >= idx:
for block in stage.children():
block.freeze()
return self
@BACKBONE_REGISTRY.register()
def build_resnet_backbone(cfg, input_shape):
"""
Create a ResNet instance from config.
Returns:
ResNet: a :class:`ResNet` instance.
"""
# need registration of new blocks/stems?
norm = cfg.MODEL.RESNETS.NORM
stem = BasicStem(
in_channels=input_shape.channels,
out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
norm=norm,
)
# fmt: off
freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
out_features = cfg.MODEL.RESNETS.OUT_FEATURES
depth = cfg.MODEL.RESNETS.DEPTH
num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
bottleneck_channels = num_groups * width_per_group
in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1
res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION
deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED
deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
# fmt: on
assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
num_blocks_per_stage = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
}[depth]
if depth in [18, 34]:
assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
assert not any(
deform_on_per_stage
), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"
stages = []
# Avoid creating variables without gradients
# It consumes extra memory and may cause allreduce to fail
out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features]
max_stage_idx = max(out_stage_idx)
for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
dilation = res5_dilation if stage_idx == 5 else 1
first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
stage_kargs = {
"num_blocks": num_blocks_per_stage[idx],
"first_stride": first_stride,
"in_channels": in_channels,
"out_channels": out_channels,
"norm": norm,
}
# Use BasicBlock for R18 and R34.
if depth in [18, 34]:
stage_kargs["block_class"] = BasicBlock
else:
stage_kargs["bottleneck_channels"] = bottleneck_channels
stage_kargs["stride_in_1x1"] = stride_in_1x1
stage_kargs["dilation"] = dilation
stage_kargs["num_groups"] = num_groups
if deform_on_per_stage[idx]:
stage_kargs["block_class"] = DeformBottleneckBlock
stage_kargs["deform_modulated"] = deform_modulated
stage_kargs["deform_num_groups"] = deform_num_groups
else:
stage_kargs["block_class"] = BottleneckBlock
blocks = make_stage(**stage_kargs)
in_channels = out_channels
out_channels *= 2
bottleneck_channels *= 2
stages.append(blocks)
return ResNet(stem, stages, out_features=out_features).freeze(freeze_at)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from typing import Tuple
import torch
# Value for clamping large dw and dh predictions. The heuristic is that we clamp
# such that dw and dh are no larger than what would transform a 16px box into a
# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated"]
def apply_deltas_broadcast(box2box_transform, deltas, boxes):
"""
Apply transform deltas to boxes. Similar to `box2box_transform.apply_deltas`,
but allow broadcasting boxes when the second dimension of deltas is a multiple
of box dimension.
Args:
box2box_transform (Box2BoxTransform or Box2BoxTransformRotated): the transform to apply
deltas (Tensor): tensor of shape (N,B) or (N,KxB)
boxes (Tensor): tensor of shape (N,B)
Returns:
Tensor: same shape as deltas.
"""
assert deltas.dim() == boxes.dim() == 2, f"{deltas.shape}, {boxes.shape}"
N, B = boxes.shape
assert (
deltas.shape[1] % B == 0
), f"Second dim of deltas should be a multiple of {B}. Got {deltas.shape}"
K = deltas.shape[1] // B
ret = box2box_transform.apply_deltas(
deltas.view(N * K, B), boxes.unsqueeze(1).expand(N, K, B).reshape(N * K, B)
)
return ret.view(N, K * B)
@torch.jit.script
class Box2BoxTransform(object):
"""
The box-to-box transform defined in R-CNN. The transformation is parameterized
by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
"""
def __init__(
self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
):
"""
Args:
weights (4-element tuple): Scaling factors that are applied to the
(dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
such that the deltas have unit variance; now they are treated as
hyperparameters of the system.
scale_clamp (float): When predicting deltas, the predicted box scaling
factors (dw and dh) are clamped such that they are <= scale_clamp.
"""
self.weights = weights
self.scale_clamp = scale_clamp
def get_deltas(self, src_boxes, target_boxes):
"""
Get box regression transformation deltas (dx, dy, dw, dh) that can be used
to transform the `src_boxes` into the `target_boxes`. That is, the relation
``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
any delta is too large and is clamped).
Args:
src_boxes (Tensor): source boxes, e.g., object proposals
target_boxes (Tensor): target of the transformation, e.g., ground-truth
boxes.
"""
assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
src_widths = src_boxes[:, 2] - src_boxes[:, 0]
src_heights = src_boxes[:, 3] - src_boxes[:, 1]
src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
target_widths = target_boxes[:, 2] - target_boxes[:, 0]
target_heights = target_boxes[:, 3] - target_boxes[:, 1]
target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights
wx, wy, ww, wh = self.weights
dx = wx * (target_ctr_x - src_ctr_x) / src_widths
dy = wy * (target_ctr_y - src_ctr_y) / src_heights
dw = ww * torch.log(target_widths / src_widths)
dh = wh * torch.log(target_heights / src_heights)
deltas = torch.stack((dx, dy, dw, dh), dim=1)
assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
return deltas
def apply_deltas(self, deltas, boxes):
"""
Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
Args:
deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
deltas[i] represents k potentially different class-specific
box transformations for the single box boxes[i].
boxes (Tensor): boxes to transform, of shape (N, 4)
"""
boxes = boxes.to(deltas.dtype)
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = self.weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
# Prevent sending too large values into torch.exp()
dw = torch.clamp(dw, max=self.scale_clamp)
dh = torch.clamp(dh, max=self.scale_clamp)
pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
pred_w = torch.exp(dw) * widths[:, None]
pred_h = torch.exp(dh) * heights[:, None]
pred_boxes = torch.zeros_like(deltas)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2
return pred_boxes
@torch.jit.script
class Box2BoxTransformRotated(object):
"""
The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
and rotate a box's angle by da (radians).
Note: angles of deltas are in radians while angles of boxes are in degrees.
"""
def __init__(
self,
weights: Tuple[float, float, float, float, float],
scale_clamp: float = _DEFAULT_SCALE_CLAMP,
):
"""
Args:
weights (5-element tuple): Scaling factors that are applied to the
(dx, dy, dw, dh, da) deltas. These are treated as
hyperparameters of the system.
scale_clamp (float): When predicting deltas, the predicted box scaling
factors (dw and dh) are clamped such that they are <= scale_clamp.
"""
self.weights = weights
self.scale_clamp = scale_clamp
def get_deltas(self, src_boxes, target_boxes):
"""
Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
to transform the `src_boxes` into the `target_boxes`. That is, the relation
``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
any delta is too large and is clamped).
Args:
src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
boxes.
"""
assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)
target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
target_boxes, dim=1
)
wx, wy, ww, wh, wa = self.weights
dx = wx * (target_ctr_x - src_ctr_x) / src_widths
dy = wy * (target_ctr_y - src_ctr_y) / src_heights
dw = ww * torch.log(target_widths / src_widths)
dh = wh * torch.log(target_heights / src_heights)
# Angles of deltas are in radians while angles of boxes are in degrees.
# the conversion to radians serve as a way to normalize the values
da = target_angles - src_angles
da = (da + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
da *= wa * math.pi / 180.0
deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
assert (
(src_widths > 0).all().item()
), "Input boxes to Box2BoxTransformRotated are not valid!"
return deltas
def apply_deltas(self, deltas, boxes):
"""
Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`.
Args:
deltas (Tensor): transformation deltas of shape (N, 5).
deltas[i] represents box transformation for the single box boxes[i].
boxes (Tensor): boxes to transform, of shape (N, 5)
"""
assert deltas.shape[1] == 5 and boxes.shape[1] == 5
boxes = boxes.to(deltas.dtype)
ctr_x = boxes[:, 0]
ctr_y = boxes[:, 1]
widths = boxes[:, 2]
heights = boxes[:, 3]
angles = boxes[:, 4]
wx, wy, ww, wh, wa = self.weights
dx = deltas[:, 0] / wx
dy = deltas[:, 1] / wy
dw = deltas[:, 2] / ww
dh = deltas[:, 3] / wh
da = deltas[:, 4] / wa
# Prevent sending too large values into torch.exp()
dw = torch.clamp(dw, max=self.scale_clamp)
dh = torch.clamp(dh, max=self.scale_clamp)
pred_boxes = torch.zeros_like(deltas)
pred_boxes[:, 0] = dx * widths + ctr_x # x_ctr
pred_boxes[:, 1] = dy * heights + ctr_y # y_ctr
pred_boxes[:, 2] = torch.exp(dw) * widths # width
pred_boxes[:, 3] = torch.exp(dh) * heights # height
# Following original RRPN implementation,
# angles of deltas are in radians while angles of boxes are in degrees.
pred_angle = da * 180.0 / math.pi + angles
pred_angle = (pred_angle + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
pred_boxes[:, 4] = pred_angle
return pred_boxes
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import List
import torch
class Matcher(object):
"""
This class assigns to each predicted "element" (e.g., a box) a ground-truth
element. Each predicted element will have exactly zero or one matches; each
ground-truth element may be matched to zero or more predicted elements.
The matching is determined by the MxN match_quality_matrix, that characterizes
how well each (ground-truth, prediction)-pair match each other. For example,
if the elements are boxes, this matrix may contain box intersection-over-union
overlap values.
The matcher returns (a) a vector of length N containing the index of the
ground-truth element m in [0, M) that matches to prediction n in [0, N).
(b) a vector of length N containing the labels for each prediction.
"""
def __init__(
self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
):
"""
Args:
thresholds (list): a list of thresholds used to stratify predictions
into levels.
labels (list): a list of values to label predictions belonging at
each level. A label can be one of {-1, 0, 1} signifying
{ignore, negative class, positive class}, respectively.
allow_low_quality_matches (bool): if True, produce additional matches
for predictions with maximum match quality lower than high_threshold.
See set_low_quality_matches_ for more details.
For example,
thresholds = [0.3, 0.5]
labels = [0, -1, 1]
All predictions with iou < 0.3 will be marked with 0 and
thus will be considered as false positives while training.
All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
thus will be ignored.
All predictions with 0.5 <= iou will be marked with 1 and
thus will be considered as true positives.
"""
# Add -inf and +inf to first and last position in thresholds
thresholds = thresholds[:]
assert thresholds[0] > 0
thresholds.insert(0, -float("inf"))
thresholds.append(float("inf"))
assert all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:]))
assert all(l in [-1, 0, 1] for l in labels)
assert len(labels) == len(thresholds) - 1
self.thresholds = thresholds
self.labels = labels
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
"""
Args:
match_quality_matrix (Tensor[float]): an MxN tensor, containing the
pairwise quality between M ground-truth elements and N predicted
elements. All elements must be >= 0 (due to the us of `torch.nonzero`
for selecting indices in :meth:`set_low_quality_matches_`).
Returns:
matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
ground-truth index in [0, M)
match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
whether a prediction is a true or false positive or ignored
"""
assert match_quality_matrix.dim() == 2
if match_quality_matrix.numel() == 0:
default_matches = match_quality_matrix.new_full(
(match_quality_matrix.size(1),), 0, dtype=torch.int64
)
# When no gt boxes exist, we define IOU = 0 and therefore set labels
# to `self.labels[0]`, which usually defaults to background class 0
# To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
default_match_labels = match_quality_matrix.new_full(
(match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
)
return default_matches, default_match_labels
assert torch.all(match_quality_matrix >= 0)
# match_quality_matrix is M (gt) x N (predicted)
# Max over gt elements (dim 0) to find best gt candidate for each prediction
matched_vals, matches = match_quality_matrix.max(dim=0)
match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
low_high = (matched_vals >= low) & (matched_vals < high)
match_labels[low_high] = l
if self.allow_low_quality_matches:
self.set_low_quality_matches_(match_labels, match_quality_matrix)
return matches, match_labels
def set_low_quality_matches_(self, match_labels, match_quality_matrix):
"""
Produce additional matches for predictions that have only low-quality matches.
Specifically, for each ground-truth G find the set of predictions that have
maximum overlap with it (including ties); for each prediction in that set, if
it is unmatched, then match it to the ground-truth G.
This function implements the RPN assignment case (i) in Sec. 3.1.2 of
:paper:`Faster R-CNN`.
"""
# For each gt, find the prediction with which it has highest quality
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find the highest quality match available, even if it is low, including ties.
# Note that the matches qualities must be positive due to the use of
# `torch.nonzero`.
_, pred_inds_with_highest_quality = torch.nonzero(
match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=True
)
# If an anchor was labeled positive only due to a low-quality match
# with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B.
# This follows the implementation in Detectron, and is found to have no significant impact.
match_labels[pred_inds_with_highest_quality] = 1
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .build import META_ARCH_REGISTRY, build_model # isort:skip
from .panoptic_fpn import PanopticFPN
# import all the meta_arch, so they will be registered
from .rcnn import GeneralizedRCNN, ProposalNetwork
from .retinanet import RetinaNet
from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from detectron2.utils.registry import Registry
META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip
META_ARCH_REGISTRY.__doc__ = """
Registry for meta-architectures, i.e. the whole model.
The registered object will be called with `obj(cfg)`
and expected to return a `nn.Module` object.
"""
def build_model(cfg):
"""
Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
Note that it does not load any weights from ``cfg``.
"""
meta_arch = cfg.MODEL.META_ARCHITECTURE
model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
model.to(torch.device(cfg.MODEL.DEVICE))
return model
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from detectron2.structures import ImageList
from ..backbone import build_backbone
from ..postprocessing import detector_postprocess, sem_seg_postprocess
from ..proposal_generator import build_proposal_generator
from ..roi_heads import build_roi_heads
from .build import META_ARCH_REGISTRY
from .semantic_seg import build_sem_seg_head
__all__ = ["PanopticFPN"]
@META_ARCH_REGISTRY.register()
class PanopticFPN(nn.Module):
"""
Implement the paper :paper:`PanopticFPN`.
"""
def __init__(self, cfg):
super().__init__()
self.instance_loss_weight = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
# options when combining instance & semantic outputs
self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED
self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH
self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT
self.combine_instances_confidence_threshold = (
cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH
)
self.backbone = build_backbone(cfg)
self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape())
self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
@property
def device(self):
return self.pixel_mean.device
def forward(self, batched_inputs):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* "image": Tensor, image in (C, H, W) format.
* "instances": Instances
* "sem_seg": semantic segmentation ground truth.
* Other information that's included in the original dicts, such as:
"height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
list[dict]:
each dict is the results for one image. The dict contains the following keys:
* "instances": see :meth:`GeneralizedRCNN.forward` for its format.
* "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
* "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
See the return value of
:func:`combine_semantic_and_instance_outputs` for its format.
"""
images = [x["image"].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.backbone.size_divisibility)
features = self.backbone(images.tensor)
if "proposals" in batched_inputs[0]:
proposals = [x["proposals"].to(self.device) for x in batched_inputs]
proposal_losses = {}
if "sem_seg" in batched_inputs[0]:
gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
gt_sem_seg = ImageList.from_tensors(
gt_sem_seg, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
).tensor
else:
gt_sem_seg = None
sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
if self.proposal_generator:
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
detector_results, detector_losses = self.roi_heads(
images, features, proposals, gt_instances
)
if self.training:
losses = {}
losses.update(sem_seg_losses)
losses.update({k: v * self.instance_loss_weight for k, v in detector_losses.items()})
losses.update(proposal_losses)
return losses
processed_results = []
for sem_seg_result, detector_result, input_per_image, image_size in zip(
sem_seg_results, detector_results, batched_inputs, images.image_sizes
):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
detector_r = detector_postprocess(detector_result, height, width)
processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
if self.combine_on:
panoptic_r = combine_semantic_and_instance_outputs(
detector_r,
sem_seg_r.argmax(dim=0),
self.combine_overlap_threshold,
self.combine_stuff_area_limit,
self.combine_instances_confidence_threshold,
)
processed_results[-1]["panoptic_seg"] = panoptic_r
return processed_results
def combine_semantic_and_instance_outputs(
instance_results,
semantic_results,
overlap_threshold,
stuff_area_limit,
instances_confidence_threshold,
):
"""
Implement a simple combining logic following
"combine_semantic_and_instance_predictions.py" in panopticapi
to produce panoptic segmentation outputs.
Args:
instance_results: output of :func:`detector_postprocess`.
semantic_results: an (H, W) tensor, each is the contiguous semantic
category id
Returns:
panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
segments_info (list[dict]): Describe each segment in `panoptic_seg`.
Each dict contains keys "id", "category_id", "isthing".
"""
panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)
# sort instance outputs by scores
sorted_inds = torch.argsort(-instance_results.scores)
current_segment_id = 0
segments_info = []
instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)
# Add instances one-by-one, check for overlaps with existing ones
for inst_id in sorted_inds:
score = instance_results.scores[inst_id].item()
if score < instances_confidence_threshold:
break
mask = instance_masks[inst_id] # H,W
mask_area = mask.sum().item()
if mask_area == 0:
continue
intersect = (mask > 0) & (panoptic_seg > 0)
intersect_area = intersect.sum().item()
if intersect_area * 1.0 / mask_area > overlap_threshold:
continue
if intersect_area > 0:
mask = mask & (panoptic_seg == 0)
current_segment_id += 1
panoptic_seg[mask] = current_segment_id
segments_info.append(
{
"id": current_segment_id,
"isthing": True,
"score": score,
"category_id": instance_results.pred_classes[inst_id].item(),
"instance_id": inst_id.item(),
}
)
# Add semantic results to remaining empty areas
semantic_labels = torch.unique(semantic_results).cpu().tolist()
for semantic_label in semantic_labels:
if semantic_label == 0: # 0 is a special "thing" class
continue
mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
mask_area = mask.sum().item()
if mask_area < stuff_area_limit:
continue
current_segment_id += 1
panoptic_seg[mask] = current_segment_id
segments_info.append(
{
"id": current_segment_id,
"isthing": False,
"category_id": semantic_label,
"area": mask_area,
}
)
return panoptic_seg, segments_info
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import numpy as np
import torch
from torch import nn
from detectron2.structures import ImageList
from detectron2.utils.events import get_event_storage
from detectron2.utils.logger import log_first_n
from ..backbone import build_backbone
from ..postprocessing import detector_postprocess
from ..proposal_generator import build_proposal_generator
from ..roi_heads import build_roi_heads
from .build import META_ARCH_REGISTRY
__all__ = ["GeneralizedRCNN", "ProposalNetwork"]
@META_ARCH_REGISTRY.register()
class GeneralizedRCNN(nn.Module):
"""
Generalized R-CNN. Any models that contains the following three components:
1. Per-image feature extraction (aka backbone)
2. Region proposal generation
3. Per-region feature extraction and prediction
"""
def __init__(self, cfg):
super().__init__()
self.backbone = build_backbone(cfg)
self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
self.vis_period = cfg.VIS_PERIOD
self.input_format = cfg.INPUT.FORMAT
assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)
self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
@property
def device(self):
return self.pixel_mean.device
def visualize_training(self, batched_inputs, proposals):
"""
A function used to visualize images and proposals. It shows ground truth
bounding boxes on the original image and up to 20 predicted object
proposals on the original image. Users can implement different
visualization functions for different models.
Args:
batched_inputs (list): a list that contains input to the model.
proposals (list): a list that contains predicted proposals. Both
batched_inputs and proposals should have the same length.
"""
from detectron2.utils.visualizer import Visualizer
storage = get_event_storage()
max_vis_prop = 20
for input, prop in zip(batched_inputs, proposals):
img = input["image"].cpu().numpy()
assert img.shape[0] == 3, "Images should have 3 channels."
if self.input_format == "BGR":
img = img[::-1, :, :]
img = img.transpose(1, 2, 0)
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
anno_img = v_gt.get_image()
box_size = min(len(prop.proposal_boxes), max_vis_prop)
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(
boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
)
prop_img = v_pred.get_image()
vis_img = np.concatenate((anno_img, prop_img), axis=1)
vis_img = vis_img.transpose(2, 0, 1)
vis_name = "Left: GT bounding boxes; Right: Predicted proposals"
storage.put_image(vis_name, vis_img)
break # only visualize one image in a batch
def forward(self, batched_inputs):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* image: Tensor, image in (C, H, W) format.
* instances (optional): groundtruth :class:`Instances`
* proposals (optional): :class:`Instances`, precomputed proposals.
Other information that's included in the original dicts, such as:
* "height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
list[dict]:
Each dict is the output for one input image.
The dict contains one key "instances" whose value is a :class:`Instances`.
The :class:`Instances` object has the following keys:
"pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
"""
if not self.training:
return self.inference(batched_inputs)
images = self.preprocess_image(batched_inputs)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
elif "targets" in batched_inputs[0]:
log_first_n(
logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
)
gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
features = self.backbone(images.tensor)
if self.proposal_generator:
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
else:
assert "proposals" in batched_inputs[0]
proposals = [x["proposals"].to(self.device) for x in batched_inputs]
proposal_losses = {}
_, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0:
self.visualize_training(batched_inputs, proposals)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
def inference(self, batched_inputs, detected_instances=None, do_postprocess=True):
"""
Run inference on the given inputs.
Args:
batched_inputs (list[dict]): same as in :meth:`forward`
detected_instances (None or list[Instances]): if not None, it
contains an `Instances` object per image. The `Instances`
object contains "pred_boxes" and "pred_classes" which are
known boxes in the image.
The inference will then skip the detection of bounding boxes,
and only predict other per-ROI outputs.
do_postprocess (bool): whether to apply post-processing on the outputs.
Returns:
same as in :meth:`forward`.
"""
assert not self.training
images = self.preprocess_image(batched_inputs)
features = self.backbone(images.tensor)
if detected_instances is None:
if self.proposal_generator:
proposals, _ = self.proposal_generator(images, features, None)
else:
assert "proposals" in batched_inputs[0]
proposals = [x["proposals"].to(self.device) for x in batched_inputs]
results, _ = self.roi_heads(images, features, proposals, None)
else:
detected_instances = [x.to(self.device) for x in detected_instances]
results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
if do_postprocess:
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results
def preprocess_image(self, batched_inputs):
"""
Normalize, pad and batch the input images.
"""
images = [x["image"].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.backbone.size_divisibility)
return images
@staticmethod
def _postprocess(instances, batched_inputs, image_sizes):
"""
Rescale the output instances to the target size.
"""
# note: private function; subject to changes
processed_results = []
for results_per_image, input_per_image, image_size in zip(
instances, batched_inputs, image_sizes
):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = detector_postprocess(results_per_image, height, width)
processed_results.append({"instances": r})
return processed_results
@META_ARCH_REGISTRY.register()
class ProposalNetwork(nn.Module):
"""
A meta architecture that only predicts object proposals.
"""
def __init__(self, cfg):
super().__init__()
self.backbone = build_backbone(cfg)
self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
@property
def device(self):
return self.pixel_mean.device
def forward(self, batched_inputs):
"""
Args:
Same as in :class:`GeneralizedRCNN.forward`
Returns:
list[dict]:
Each dict is the output for one input image.
The dict contains one key "proposals" whose value is a
:class:`Instances` with keys "proposal_boxes" and "objectness_logits".
"""
images = [x["image"].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.backbone.size_divisibility)
features = self.backbone(images.tensor)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
elif "targets" in batched_inputs[0]:
log_first_n(
logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
)
gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
# In training, the proposals are not useful at all but we generate them anyway.
# This makes RPN-only models about 5% slower.
if self.training:
return proposal_losses
processed_results = []
for results_per_image, input_per_image, image_size in zip(
proposals, batched_inputs, images.image_sizes
):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = detector_postprocess(results_per_image, height, width)
processed_results.append({"proposals": r})
return processed_results
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import math
import numpy as np
from typing import List
import torch
from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss
from torch import nn
from detectron2.layers import ShapeSpec, batched_nms, cat
from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
from detectron2.utils.events import get_event_storage
from detectron2.utils.logger import log_first_n
from ..anchor_generator import build_anchor_generator
from ..backbone import build_backbone
from ..box_regression import Box2BoxTransform
from ..matcher import Matcher
from ..postprocessing import detector_postprocess
from .build import META_ARCH_REGISTRY
__all__ = ["RetinaNet"]
def permute_to_N_HWA_K(tensor, K):
"""
Transpose/reshape a tensor from (N, (A x K), H, W) to (N, (HxWxA), K)
"""
assert tensor.dim() == 4, tensor.shape
N, _, H, W = tensor.shape
tensor = tensor.view(N, -1, K, H, W)
tensor = tensor.permute(0, 3, 4, 1, 2)
tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K)
return tensor
def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls, box_delta, num_classes=80):
"""
Rearrange the tensor layout from the network output, i.e.:
list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi)
to per-image predictions, i.e.:
Tensor: of shape (N x sum(Hi x Wi x A), K)
"""
# for each feature level, permute the outputs to make them be in the
# same format as the labels. Note that the labels are computed for
# all feature levels concatenated, so we keep the same representation
# for the objectness and the box_delta
box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls]
box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
# concatenate on the first dimension (representing the feature levels), to
# take into account the way the labels were generated (with all feature maps
# being concatenated as well)
box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes)
box_delta = cat(box_delta_flattened, dim=1).view(-1, 4)
return box_cls, box_delta
@META_ARCH_REGISTRY.register()
class RetinaNet(nn.Module):
"""
Implement RetinaNet in :paper:`RetinaNet`.
"""
def __init__(self, cfg):
super().__init__()
# fmt: off
self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
# Loss parameters:
self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
# Inference parameters:
self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
# Vis parameters
self.vis_period = cfg.VIS_PERIOD
self.input_format = cfg.INPUT.FORMAT
# fmt: on
self.backbone = build_backbone(cfg)
backbone_shape = self.backbone.output_shape()
feature_shapes = [backbone_shape[f] for f in self.in_features]
self.head = RetinaNetHead(cfg, feature_shapes)
self.anchor_generator = build_anchor_generator(cfg, feature_shapes)
# Matching and loss
self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
self.matcher = Matcher(
cfg.MODEL.RETINANET.IOU_THRESHOLDS,
cfg.MODEL.RETINANET.IOU_LABELS,
allow_low_quality_matches=True,
)
self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
"""
In Detectron1, loss is normalized by number of foreground samples in the batch.
When batch size is 1 per GPU, #foreground has a large variance and
using it lead to lower performance. Here we maintain an EMA of #foreground to
stabilize the normalizer.
"""
self.loss_normalizer = 100 # initialize with any reasonable #fg that's not too small
self.loss_normalizer_momentum = 0.9
@property
def device(self):
return self.pixel_mean.device
def visualize_training(self, batched_inputs, results):
"""
A function used to visualize ground truth images and final network predictions.
It shows ground truth bounding boxes on the original image and up to 20
predicted object bounding boxes on the original image.
Args:
batched_inputs (list): a list that contains input to the model.
results (List[Instances]): a list of #images elements.
"""
from detectron2.utils.visualizer import Visualizer
assert len(batched_inputs) == len(
results
), "Cannot visualize inputs and results of different sizes"
storage = get_event_storage()
max_boxes = 20
image_index = 0 # only visualize a single image
img = batched_inputs[image_index]["image"].cpu().numpy()
assert img.shape[0] == 3, "Images should have 3 channels."
if self.input_format == "BGR":
img = img[::-1, :, :]
img = img.transpose(1, 2, 0)
v_gt = Visualizer(img, None)
v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
anno_img = v_gt.get_image()
processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()
v_pred = Visualizer(img, None)
v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
prop_img = v_pred.get_image()
vis_img = np.vstack((anno_img, prop_img))
vis_img = vis_img.transpose(2, 0, 1)
vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
storage.put_image(vis_name, vis_img)
def forward(self, batched_inputs):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* image: Tensor, image in (C, H, W) format.
* instances: Instances
Other information that's included in the original dicts, such as:
* "height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
dict[str: Tensor]:
mapping from a named loss to a tensor storing the loss. Used during training only.
"""
images = self.preprocess_image(batched_inputs)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
elif "targets" in batched_inputs[0]:
log_first_n(
logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
)
gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
features = self.backbone(images.tensor)
features = [features[f] for f in self.in_features]
box_cls, box_delta = self.head(features)
anchors = self.anchor_generator(features)
if self.training:
gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(anchors, gt_instances)
losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0:
results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
self.visualize_training(batched_inputs, results)
return losses
else:
results = self.inference(box_cls, box_delta, anchors, images.image_sizes)
processed_results = []
for results_per_image, input_per_image, image_size in zip(
results, batched_inputs, images.image_sizes
):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = detector_postprocess(results_per_image, height, width)
processed_results.append({"instances": r})
return processed_results
def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits, pred_anchor_deltas):
"""
Args:
For `gt_classes` and `gt_anchors_deltas` parameters, see
:meth:`RetinaNet.get_ground_truth`.
Their shapes are (N, R) and (N, R, 4), respectively, where R is
the total number of anchors across levels, i.e. sum(Hi x Wi x A)
For `pred_class_logits` and `pred_anchor_deltas`, see
:meth:`RetinaNetHead.forward`.
Returns:
dict[str, Tensor]:
mapping from a named loss to a scalar tensor
storing the loss. Used during training only. The dict keys are:
"loss_cls" and "loss_box_reg"
"""
pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
pred_class_logits, pred_anchor_deltas, self.num_classes
) # Shapes: (N x R, K) and (N x R, 4), respectively.
gt_classes = gt_classes.flatten()
gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)
valid_idxs = gt_classes >= 0
foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
num_foreground = foreground_idxs.sum().item()
get_event_storage().put_scalar("num_foreground", num_foreground)
self.loss_normalizer = (
self.loss_normalizer_momentum * self.loss_normalizer
+ (1 - self.loss_normalizer_momentum) * num_foreground
)
gt_classes_target = torch.zeros_like(pred_class_logits)
gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1
# logits loss
loss_cls = sigmoid_focal_loss_jit(
pred_class_logits[valid_idxs],
gt_classes_target[valid_idxs],
alpha=self.focal_loss_alpha,
gamma=self.focal_loss_gamma,
reduction="sum",
) / max(1, self.loss_normalizer)
# regression loss
loss_box_reg = smooth_l1_loss(
pred_anchor_deltas[foreground_idxs],
gt_anchors_deltas[foreground_idxs],
beta=self.smooth_l1_loss_beta,
reduction="sum",
) / max(1, self.loss_normalizer)
return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
@torch.no_grad()
def get_ground_truth(self, anchors, targets):
"""
Args:
anchors (list[Boxes]): A list of #feature level Boxes.
The Boxes contains anchors of this image on the specific feature level.
targets (list[Instances]): a list of N `Instances`s. The i-th
`Instances` contains the ground-truth per-instance annotations
for the i-th input image. Specify `targets` during training only.
Returns:
gt_classes (Tensor):
An integer tensor of shape (N, R) storing ground-truth labels for each anchor.
R is the total number of anchors, i.e. the sum of Hi x Wi x A for all levels.
Anchors with an IoU with some target higher than the foreground threshold
are assigned their corresponding label in the [0, K-1] range.
Anchors whose IoU are below the background threshold are assigned
the label "K". Anchors whose IoU are between the foreground and background
thresholds are assigned a label "-1", i.e. ignore.
gt_anchors_deltas (Tensor):
Shape (N, R, 4).
The last dimension represents ground-truth box2box transform
targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
The values in the tensor are meaningful only when the corresponding
anchor is labeled as foreground.
"""
gt_classes = []
gt_anchors_deltas = []
anchors = Boxes.cat(anchors) # Rx4
for targets_per_image in targets:
match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors)
gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)
has_gt = len(targets_per_image) > 0
if has_gt:
# ground truth box regression
matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs]
gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
anchors.tensor, matched_gt_boxes.tensor
)
gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
# Anchors with label 0 are treated as background.
gt_classes_i[anchor_labels == 0] = self.num_classes
# Anchors with label -1 are ignored.
gt_classes_i[anchor_labels == -1] = -1
else:
gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
gt_anchors_reg_deltas_i = torch.zeros_like(anchors.tensor)
gt_classes.append(gt_classes_i)
gt_anchors_deltas.append(gt_anchors_reg_deltas_i)
return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
def inference(self, box_cls, box_delta, anchors, image_sizes):
"""
Arguments:
box_cls, box_delta: Same as the output of :meth:`RetinaNetHead.forward`
anchors (list[Boxes]): A list of #feature level Boxes.
The Boxes contain anchors of this image on the specific feature level.
image_sizes (List[torch.Size]): the input image sizes
Returns:
results (List[Instances]): a list of #images elements.
"""
results = []
box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls]
box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta]
# list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4)
for img_idx, image_size in enumerate(image_sizes):
box_cls_per_image = [box_cls_per_level[img_idx] for box_cls_per_level in box_cls]
box_reg_per_image = [box_reg_per_level[img_idx] for box_reg_per_level in box_delta]
results_per_image = self.inference_single_image(
box_cls_per_image, box_reg_per_image, anchors, tuple(image_size)
)
results.append(results_per_image)
return results
def inference_single_image(self, box_cls, box_delta, anchors, image_size):
"""
Single-image inference. Return bounding-box detection results by thresholding
on scores and applying non-maximum suppression (NMS).
Arguments:
box_cls (list[Tensor]): list of #feature levels. Each entry contains
tensor of size (H x W x A, K)
box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
anchors (list[Boxes]): list of #feature levels. Each entry contains
a Boxes object, which contains all the anchors for that
image in that feature level.
image_size (tuple(H, W)): a tuple of the image height and width.
Returns:
Same as `inference`, but for only one image.
"""
boxes_all = []
scores_all = []
class_idxs_all = []
# Iterate over every feature level
for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
# (HxWxAxK,)
box_cls_i = box_cls_i.flatten().sigmoid_()
# Keep top k top scoring indices only.
num_topk = min(self.topk_candidates, box_reg_i.size(0))
# torch.sort is actually faster than .topk (at least on GPUs)
predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
predicted_prob = predicted_prob[:num_topk]
topk_idxs = topk_idxs[:num_topk]
# filter out the proposals with low confidence score
keep_idxs = predicted_prob > self.score_threshold
predicted_prob = predicted_prob[keep_idxs]
topk_idxs = topk_idxs[keep_idxs]
anchor_idxs = topk_idxs // self.num_classes
classes_idxs = topk_idxs % self.num_classes
box_reg_i = box_reg_i[anchor_idxs]
anchors_i = anchors_i[anchor_idxs]
# predict boxes
predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)
boxes_all.append(predicted_boxes)
scores_all.append(predicted_prob)
class_idxs_all.append(classes_idxs)
boxes_all, scores_all, class_idxs_all = [
cat(x) for x in [boxes_all, scores_all, class_idxs_all]
]
keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
keep = keep[: self.max_detections_per_image]
result = Instances(image_size)
result.pred_boxes = Boxes(boxes_all[keep])
result.scores = scores_all[keep]
result.pred_classes = class_idxs_all[keep]
return result
def preprocess_image(self, batched_inputs):
"""
Normalize, pad and batch the input images.
"""
images = [x["image"].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.backbone.size_divisibility)
return images
class RetinaNetHead(nn.Module):
"""
The head used in RetinaNet for object classification and box regression.
It has two subnets for the two tasks, with a common structure but separate parameters.
"""
def __init__(self, cfg, input_shape: List[ShapeSpec]):
super().__init__()
# fmt: off
in_channels = input_shape[0].channels
num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
num_convs = cfg.MODEL.RETINANET.NUM_CONVS
prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB
num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
# fmt: on
assert (
len(set(num_anchors)) == 1
), "Using different number of anchors between levels is not currently supported!"
num_anchors = num_anchors[0]
cls_subnet = []
bbox_subnet = []
for _ in range(num_convs):
cls_subnet.append(
nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
)
cls_subnet.append(nn.ReLU())
bbox_subnet.append(
nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
)
bbox_subnet.append(nn.ReLU())
self.cls_subnet = nn.Sequential(*cls_subnet)
self.bbox_subnet = nn.Sequential(*bbox_subnet)
self.cls_score = nn.Conv2d(
in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
)
self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
# Initialization
for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
for layer in modules.modules():
if isinstance(layer, nn.Conv2d):
torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
torch.nn.init.constant_(layer.bias, 0)
# Use prior in model initialization to improve stability
bias_value = -(math.log((1 - prior_prob) / prior_prob))
torch.nn.init.constant_(self.cls_score.bias, bias_value)
def forward(self, features):
"""
Arguments:
features (list[Tensor]): FPN feature map tensors in high to low resolution.
Each tensor in the list correspond to different feature levels.
Returns:
logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
The tensor predicts the classification probability
at each spatial position for each of the A anchors and K object
classes.
bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
The tensor predicts 4-vector (dx,dy,dw,dh) box
regression values for every anchor. These values are the
relative offset between the anchor and the ground truth box.
"""
logits = []
bbox_reg = []
for feature in features:
logits.append(self.cls_score(self.cls_subnet(feature)))
bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
return logits, bbox_reg
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
from typing import Dict
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.layers import Conv2d, ShapeSpec
from detectron2.structures import ImageList
from detectron2.utils.registry import Registry
from ..backbone import build_backbone
from ..postprocessing import sem_seg_postprocess
from .build import META_ARCH_REGISTRY
__all__ = ["SemanticSegmentor", "SEM_SEG_HEADS_REGISTRY", "SemSegFPNHead", "build_sem_seg_head"]
SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
SEM_SEG_HEADS_REGISTRY.__doc__ = """
Registry for semantic segmentation heads, which make semantic segmentation predictions
from feature maps.
"""
@META_ARCH_REGISTRY.register()
class SemanticSegmentor(nn.Module):
"""
Main class for semantic segmentation architectures.
"""
def __init__(self, cfg):
super().__init__()
self.backbone = build_backbone(cfg)
self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape())
self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
@property
def device(self):
return self.pixel_mean.device
def forward(self, batched_inputs):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* "image": Tensor, image in (C, H, W) format.
* "sem_seg": semantic segmentation ground truth
* Other information that's included in the original dicts, such as:
"height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
list[dict]:
Each dict is the output for one input image.
The dict contains one key "sem_seg" whose value is a
Tensor that represents the
per-pixel segmentation prediced by the head.
The prediction has shape KxHxW that represents the logits of
each class for each pixel.
"""
images = [x["image"].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.backbone.size_divisibility)
features = self.backbone(images.tensor)
if "sem_seg" in batched_inputs[0]:
targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
targets = ImageList.from_tensors(
targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
).tensor
else:
targets = None
results, losses = self.sem_seg_head(features, targets)
if self.training:
return losses
processed_results = []
for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
height = input_per_image.get("height")
width = input_per_image.get("width")
r = sem_seg_postprocess(result, image_size, height, width)
processed_results.append({"sem_seg": r})
return processed_results
def build_sem_seg_head(cfg, input_shape):
"""
Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
"""
name = cfg.MODEL.SEM_SEG_HEAD.NAME
return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
@SEM_SEG_HEADS_REGISTRY.register()
class SemSegFPNHead(nn.Module):
"""
A semantic segmentation head described in :paper:`PanopticFPN`.
It takes FPN features as input and merges information from all
levels of the FPN into single output.
"""
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
super().__init__()
# fmt: off
self.in_features = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
feature_strides = {k: v.stride for k, v in input_shape.items()}
feature_channels = {k: v.channels for k, v in input_shape.items()}
self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
num_classes = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
conv_dims = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
self.common_stride = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE
norm = cfg.MODEL.SEM_SEG_HEAD.NORM
self.loss_weight = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT
# fmt: on
self.scale_heads = []
for in_feature in self.in_features:
head_ops = []
head_length = max(
1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
)
for k in range(head_length):
norm_module = nn.GroupNorm(32, conv_dims) if norm == "GN" else None
conv = Conv2d(
feature_channels[in_feature] if k == 0 else conv_dims,
conv_dims,
kernel_size=3,
stride=1,
padding=1,
bias=not norm,
norm=norm_module,
activation=F.relu,
)
weight_init.c2_msra_fill(conv)
head_ops.append(conv)
if feature_strides[in_feature] != self.common_stride:
head_ops.append(
nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
)
self.scale_heads.append(nn.Sequential(*head_ops))
self.add_module(in_feature, self.scale_heads[-1])
self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
weight_init.c2_msra_fill(self.predictor)
def forward(self, features, targets=None):
"""
Returns:
In training, returns (None, dict of losses)
In inference, returns (CxHxW logits, {})
"""
x = self.layers(features)
if self.training:
return None, self.losses(x, targets)
else:
x = F.interpolate(
x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
)
return x, {}
def layers(self, features):
for i, f in enumerate(self.in_features):
if i == 0:
x = self.scale_heads[i](features[f])
else:
x = x + self.scale_heads[i](features[f])
x = self.predictor(x)
return x
def losses(self, predictions, targets):
predictions = F.interpolate(
predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False
)
loss = F.cross_entropy(
predictions, targets, reduction="mean", ignore_index=self.ignore_value
)
losses = {"loss_sem_seg": loss * self.loss_weight}
return losses
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import math
import sys
import torch
from torch import nn
from torchvision.ops import RoIPool
from detectron2.layers import ROIAlign, ROIAlignRotated, cat
__all__ = ["ROIPooler"]
def assign_boxes_to_levels(box_lists, min_level, max_level, canonical_box_size, canonical_level):
"""
Map each box in `box_lists` to a feature map level index and return the assignment
vector.
Args:
box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
where N is the number of images in the batch.
min_level (int): Smallest feature map level index. The input is considered index 0,
the output of stage 1 is index 1, and so.
max_level (int): Largest feature map level index.
canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
canonical_level (int): The feature map level index on which a canonically-sized box
should be placed.
Returns:
A tensor of length M, where M is the total number of boxes aggregated over all
N batch images. The memory layout corresponds to the concatenation of boxes
from all images. Each element is the feature map index, as an offset from
`self.min_level`, for the corresponding box (so value i means the box is at
`self.min_level + i`).
"""
eps = sys.float_info.epsilon
box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
# Eqn.(1) in FPN paper
level_assignments = torch.floor(
canonical_level + torch.log2(box_sizes / canonical_box_size + eps)
)
# clamp level to (min, max), in case the box size is too large or too small
# for the available feature maps
level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
return level_assignments.to(torch.int64) - min_level
def convert_boxes_to_pooler_format(box_lists):
"""
Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
(see description under Returns).
Args:
box_lists (list[Boxes] | list[RotatedBoxes]):
A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
Returns:
When input is list[Boxes]:
A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
N batch images.
The 5 columns are (batch index, x0, y0, x1, y1), where batch index
is the index in [0, N) identifying which batch image the box with corners at
(x0, y0, x1, y1) comes from.
When input is list[RotatedBoxes]:
A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
N batch images.
The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
where batch index is the index in [0, N) identifying which batch image the
rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
"""
def fmt_box_list(box_tensor, batch_index):
repeated_index = torch.full(
(len(box_tensor), 1), batch_index, dtype=box_tensor.dtype, device=box_tensor.device
)
return cat((repeated_index, box_tensor), dim=1)
pooler_fmt_boxes = cat(
[fmt_box_list(box_list.tensor, i) for i, box_list in enumerate(box_lists)], dim=0
)
return pooler_fmt_boxes
class ROIPooler(nn.Module):
"""
Region of interest feature map pooler that supports pooling from one or more
feature maps.
"""
def __init__(
self,
output_size,
scales,
sampling_ratio,
pooler_type,
canonical_box_size=224,
canonical_level=4,
):
"""
Args:
output_size (int, tuple[int] or list[int]): output size of the pooled region,
e.g., 14 x 14. If tuple or list is given, the length must be 2.
scales (list[float]): The scale for each low-level pooling op relative to
the input image. For a feature map with stride s relative to the input
image, scale is defined as a 1 / s. The stride must be power of 2.
When there are multiple scales, they must form a pyramid, i.e. they must be
a monotically decreasing geometric sequence with a factor of 1/2.
sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
pooler_type (string): Name of the type of pooling operation that should be applied.
For instance, "ROIPool" or "ROIAlignV2".
canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
pre-training).
canonical_level (int): The feature map level index from which a canonically-sized box
should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
i.e., a box of size 224x224 will be placed on the feature with stride=16.
The box placement for all boxes will be determined from their sizes w.r.t
canonical_box_size. For example, a box whose area is 4x that of a canonical box
should be used to pool features from feature level ``canonical_level+1``.
Note that the actual input feature maps given to this module may not have
sufficiently many levels for the input boxes. If the boxes are too large or too
small for the input feature maps, the closest level will be used.
"""
super().__init__()
if isinstance(output_size, int):
output_size = (output_size, output_size)
assert len(output_size) == 2
assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
self.output_size = output_size
if pooler_type == "ROIAlign":
self.level_poolers = nn.ModuleList(
ROIAlign(
output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
)
for scale in scales
)
elif pooler_type == "ROIAlignV2":
self.level_poolers = nn.ModuleList(
ROIAlign(
output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
)
for scale in scales
)
elif pooler_type == "ROIPool":
self.level_poolers = nn.ModuleList(
RoIPool(output_size, spatial_scale=scale) for scale in scales
)
elif pooler_type == "ROIAlignRotated":
self.level_poolers = nn.ModuleList(
ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
for scale in scales
)
else:
raise ValueError("Unknown pooler type: {}".format(pooler_type))
# Map scale (defined as 1 / stride) to its feature map level under the
# assumption that stride is a power of 2.
min_level = -(math.log2(scales[0]))
max_level = -(math.log2(scales[-1]))
assert math.isclose(min_level, int(min_level)) and math.isclose(
max_level, int(max_level)
), "Featuremap stride is not power of 2!"
self.min_level = int(min_level)
self.max_level = int(max_level)
assert (
len(scales) == self.max_level - self.min_level + 1
), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
assert 0 < self.min_level and self.min_level <= self.max_level
self.canonical_level = canonical_level
assert canonical_box_size > 0
self.canonical_box_size = canonical_box_size
def forward(self, x, box_lists):
"""
Args:
x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
used to construct this module.
box_lists (list[Boxes] | list[RotatedBoxes]):
A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
The box coordinates are defined on the original image and
will be scaled by the `scales` argument of :class:`ROIPooler`.
Returns:
Tensor:
A tensor of shape (M, C, output_size, output_size) where M is the total number of
boxes aggregated over all N batch images and C is the number of channels in `x`.
"""
num_level_assignments = len(self.level_poolers)
assert isinstance(x, list) and isinstance(
box_lists, list
), "Arguments to pooler must be lists"
assert (
len(x) == num_level_assignments
), "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
num_level_assignments, len(x)
)
assert len(box_lists) == x[0].size(
0
), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
x[0].size(0), len(box_lists)
)
pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
if num_level_assignments == 1:
return self.level_poolers[0](x[0], pooler_fmt_boxes)
level_assignments = assign_boxes_to_levels(
box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
)
num_boxes = len(pooler_fmt_boxes)
num_channels = x[0].shape[1]
output_size = self.output_size[0]
dtype, device = x[0].dtype, x[0].device
output = torch.zeros(
(num_boxes, num_channels, output_size, output_size), dtype=dtype, device=device
)
for level, (x_level, pooler) in enumerate(zip(x, self.level_poolers)):
inds = torch.nonzero(level_assignments == level, as_tuple=True)[0]
pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
output[inds] = pooler(x_level, pooler_fmt_boxes_level)
return output
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from torch.nn import functional as F
from detectron2.layers import paste_masks_in_image
from detectron2.structures import Instances
from detectron2.utils.memory import retry_if_cuda_oom
def detector_postprocess(results, output_height, output_width, mask_threshold=0.5):
"""
Resize the output instances.
The input images are often resized when entering an object detector.
As a result, we often need the outputs of the detector in a different
resolution from its inputs.
This function will resize the raw outputs of an R-CNN detector
to produce outputs according to the desired output resolution.
Args:
results (Instances): the raw outputs from the detector.
`results.image_size` contains the input image resolution the detector sees.
This object might be modified in-place.
output_height, output_width: the desired output resolution.
Returns:
Instances: the resized output from the model, based on the output resolution
"""
scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
results = Instances((output_height, output_width), **results.get_fields())
if results.has("pred_boxes"):
output_boxes = results.pred_boxes
elif results.has("proposal_boxes"):
output_boxes = results.proposal_boxes
output_boxes.scale(scale_x, scale_y)
output_boxes.clip(results.image_size)
results = results[output_boxes.nonempty()]
if results.has("pred_masks"):
results.pred_masks = retry_if_cuda_oom(paste_masks_in_image)(
results.pred_masks[:, 0, :, :], # N, 1, M, M
results.pred_boxes,
results.image_size,
threshold=mask_threshold,
)
if results.has("pred_keypoints"):
results.pred_keypoints[:, :, 0] *= scale_x
results.pred_keypoints[:, :, 1] *= scale_y
return results
def sem_seg_postprocess(result, img_size, output_height, output_width):
"""
Return semantic segmentation predictions in the original resolution.
The input images are often resized when entering semantic segmentor. Moreover, in same
cases, they also padded inside segmentor to be divisible by maximum network stride.
As a result, we often need the predictions of the segmentor in a different
resolution from its inputs.
Args:
result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
where C is the number of classes, and H, W are the height and width of the prediction.
img_size (tuple): image size that segmentor is taking as input.
output_height, output_width: the desired output resolution.
Returns:
semantic segmentation prediction (Tensor): A tensor of the shape
(C, output_height, output_width) that contains per-pixel soft predictions.
"""
result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
result = F.interpolate(
result, size=(output_height, output_width), mode="bilinear", align_corners=False
)[0]
return result
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment