Commit c732df65 authored by limm's avatar limm
Browse files

push v0.1.3 version commit bd2ea47

parent 5b3792fc
Pipeline #706 failed with stages
in 0 seconds
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import numpy as np
from contextlib import contextmanager
from itertools import count
import torch
from torch import nn
from torch.nn.parallel import DistributedDataParallel
from detectron2.data.detection_utils import read_image
from detectron2.data.transforms import ResizeShortestEdge
from detectron2.structures import Instances
from .meta_arch import GeneralizedRCNN
from .postprocessing import detector_postprocess
from .roi_heads.fast_rcnn import fast_rcnn_inference_single_image
__all__ = ["DatasetMapperTTA", "GeneralizedRCNNWithTTA"]
class DatasetMapperTTA:
"""
Implement test-time augmentation for detection data.
It is a callable which takes a dataset dict from a detection dataset,
and returns a list of dataset dicts where the images
are augmented from the input image by the transformations defined in the config.
This is used for test-time augmentation.
"""
def __init__(self, cfg):
self.min_sizes = cfg.TEST.AUG.MIN_SIZES
self.max_size = cfg.TEST.AUG.MAX_SIZE
self.flip = cfg.TEST.AUG.FLIP
self.image_format = cfg.INPUT.FORMAT
def __call__(self, dataset_dict):
"""
Args:
dict: a detection dataset dict
Returns:
list[dict]:
a list of dataset dicts, which contain augmented version of the input image.
The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``.
"""
ret = []
if "image" not in dataset_dict:
numpy_image = read_image(dataset_dict["file_name"], self.image_format)
else:
numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy().astype("uint8")
for min_size in self.min_sizes:
image = np.copy(numpy_image)
tfm = ResizeShortestEdge(min_size, self.max_size).get_transform(image)
resized = tfm.apply_image(image)
resized = torch.as_tensor(resized.transpose(2, 0, 1).astype("float32"))
dic = copy.deepcopy(dataset_dict)
dic["horiz_flip"] = False
dic["image"] = resized
ret.append(dic)
if self.flip:
dic = copy.deepcopy(dataset_dict)
dic["horiz_flip"] = True
dic["image"] = torch.flip(resized, dims=[2])
ret.append(dic)
return ret
class GeneralizedRCNNWithTTA(nn.Module):
"""
A GeneralizedRCNN with test-time augmentation enabled.
Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
"""
def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
"""
Args:
cfg (CfgNode):
model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
tta_mapper (callable): takes a dataset dict and returns a list of
augmented versions of the dataset dict. Defaults to
`DatasetMapperTTA(cfg)`.
batch_size (int): batch the augmented images into this batch size for inference.
"""
super().__init__()
if isinstance(model, DistributedDataParallel):
model = model.module
assert isinstance(
model, GeneralizedRCNN
), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
self.cfg = cfg.clone()
assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
assert (
not self.cfg.MODEL.LOAD_PROPOSALS
), "TTA for pre-computed proposals is not supported yet"
self.model = model
if tta_mapper is None:
tta_mapper = DatasetMapperTTA(cfg)
self.tta_mapper = tta_mapper
self.batch_size = batch_size
@contextmanager
def _turn_off_roi_heads(self, attrs):
"""
Open a context where some heads in `model.roi_heads` are temporarily turned off.
Args:
attr (list[str]): the attribute in `model.roi_heads` which can be used
to turn off a specific head, e.g., "mask_on", "keypoint_on".
"""
roi_heads = self.model.roi_heads
old = {}
for attr in attrs:
try:
old[attr] = getattr(roi_heads, attr)
except AttributeError:
# The head may not be implemented in certain ROIHeads
pass
if len(old.keys()) == 0:
yield
else:
for attr in old.keys():
setattr(roi_heads, attr, False)
yield
for attr in old.keys():
setattr(roi_heads, attr, old[attr])
def _batch_inference(self, batched_inputs, detected_instances=None, do_postprocess=True):
"""
Execute inference on a list of inputs,
using batch size = self.batch_size, instead of the length of the list.
Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
"""
if detected_instances is None:
detected_instances = [None] * len(batched_inputs)
outputs = []
inputs, instances = [], []
for idx, input, instance in zip(count(), batched_inputs, detected_instances):
inputs.append(input)
instances.append(instance)
if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
outputs.extend(
self.model.inference(
inputs,
instances if instances[0] is not None else None,
do_postprocess=do_postprocess,
)
)
inputs, instances = [], []
return outputs
def __call__(self, batched_inputs):
"""
Same input/output format as :meth:`GeneralizedRCNN.forward`
"""
return [self._inference_one_image(x) for x in batched_inputs]
def _detector_postprocess(self, outputs, aug_vars):
return detector_postprocess(outputs, aug_vars["height"], aug_vars["width"])
def _inference_one_image(self, input):
"""
Args:
input (dict): one dataset dict
Returns:
dict: one output dict
"""
augmented_inputs, aug_vars = self._get_augmented_inputs(input)
# Detect boxes from all augmented versions
with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):
# temporarily disable roi heads
all_boxes, all_scores, all_classes = self._get_augmented_boxes(
augmented_inputs, aug_vars
)
merged_instances = self._merge_detections(
all_boxes, all_scores, all_classes, (aug_vars["height"], aug_vars["width"])
)
if self.cfg.MODEL.MASK_ON:
# Use the detected boxes to obtain new fields
augmented_instances = self._rescale_detected_boxes(
augmented_inputs, merged_instances, aug_vars
)
# run forward on the detected boxes
outputs = self._batch_inference(
augmented_inputs, augmented_instances, do_postprocess=False
)
# Delete now useless variables to avoid being out of memory
del augmented_inputs, augmented_instances, merged_instances
# average the predictions
outputs[0].pred_masks = self._reduce_pred_masks(outputs, aug_vars)
# postprocess
output = self._detector_postprocess(outputs[0], aug_vars)
return {"instances": output}
else:
return {"instances": merged_instances}
def _get_augmented_inputs(self, input):
augmented_inputs = self.tta_mapper(input)
do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs]
heights = [k["height"] for k in augmented_inputs]
widths = [k["width"] for k in augmented_inputs]
assert (
len(set(heights)) == 1 and len(set(widths)) == 1
), "Augmented version of the inputs should have the same original resolution!"
height = heights[0]
width = widths[0]
aug_vars = {"height": height, "width": width, "do_hflip": do_hflip}
return augmented_inputs, aug_vars
def _get_augmented_boxes(self, augmented_inputs, aug_vars):
# 1: forward with all augmented images
outputs = self._batch_inference(augmented_inputs, do_postprocess=False)
# 2: union the results
all_boxes = []
all_scores = []
all_classes = []
for idx, output in enumerate(outputs):
rescaled_output = self._detector_postprocess(output, aug_vars)
pred_boxes = rescaled_output.pred_boxes.tensor
if aug_vars["do_hflip"][idx]:
pred_boxes[:, [0, 2]] = aug_vars["width"] - pred_boxes[:, [2, 0]]
all_boxes.append(pred_boxes)
all_scores.extend(rescaled_output.scores)
all_classes.extend(rescaled_output.pred_classes)
all_boxes = torch.cat(all_boxes, dim=0).cpu()
return all_boxes, all_scores, all_classes
def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
# select from the union of all results
num_boxes = len(all_boxes)
num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
# +1 because fast_rcnn_inference expects background scores as well
all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
for idx, cls, score in zip(count(), all_classes, all_scores):
all_scores_2d[idx, cls] = score
merged_instances, _ = fast_rcnn_inference_single_image(
all_boxes,
all_scores_2d,
shape_hw,
1e-8,
self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
self.cfg.TEST.DETECTIONS_PER_IMAGE,
)
return merged_instances
def _rescale_detected_boxes(self, augmented_inputs, merged_instances, aug_vars):
augmented_instances = []
for idx, input in enumerate(augmented_inputs):
actual_height, actual_width = input["image"].shape[1:3]
scale_x = actual_width * 1.0 / aug_vars["width"]
scale_y = actual_height * 1.0 / aug_vars["height"]
pred_boxes = merged_instances.pred_boxes.clone()
pred_boxes.tensor[:, 0::2] *= scale_x
pred_boxes.tensor[:, 1::2] *= scale_y
if aug_vars["do_hflip"][idx]:
pred_boxes.tensor[:, [0, 2]] = actual_width - pred_boxes.tensor[:, [2, 0]]
aug_instances = Instances(
image_size=(actual_height, actual_width),
pred_boxes=pred_boxes,
pred_classes=merged_instances.pred_classes,
scores=merged_instances.scores,
)
augmented_instances.append(aug_instances)
return augmented_instances
def _reduce_pred_masks(self, outputs, aug_vars):
for idx, output in enumerate(outputs):
if aug_vars["do_hflip"][idx]:
output.pred_masks = output.pred_masks.flip(dims=[3])
all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
avg_pred_masks = torch.mean(all_pred_masks, dim=0)
return avg_pred_masks
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .build import build_lr_scheduler, build_optimizer
from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from enum import Enum
from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union
import torch
from detectron2.config import CfgNode
from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR
_GradientClipperInput = Union[torch.Tensor, Iterable[torch.Tensor]]
_GradientClipper = Callable[[_GradientClipperInput], None]
class GradientClipType(Enum):
VALUE = "value"
NORM = "norm"
def _create_gradient_clipper(cfg: CfgNode) -> _GradientClipper:
"""
Creates gradient clipping closure to clip by value or by norm,
according to the provided config.
"""
cfg = cfg.clone()
def clip_grad_norm(p: _GradientClipperInput):
torch.nn.utils.clip_grad_norm_(p, cfg.CLIP_VALUE, cfg.NORM_TYPE)
def clip_grad_value(p: _GradientClipperInput):
torch.nn.utils.clip_grad_value_(p, cfg.CLIP_VALUE)
_GRADIENT_CLIP_TYPE_TO_CLIPPER = {
GradientClipType.VALUE: clip_grad_value,
GradientClipType.NORM: clip_grad_norm,
}
return _GRADIENT_CLIP_TYPE_TO_CLIPPER[GradientClipType(cfg.CLIP_TYPE)]
def _generate_optimizer_class_with_gradient_clipping(
optimizer_type: Type[torch.optim.Optimizer], gradient_clipper: _GradientClipper
) -> Type[torch.optim.Optimizer]:
"""
Dynamically creates a new type that inherits the type of a given instance
and overrides the `step` method to add gradient clipping
"""
def optimizer_wgc_step(self, closure=None):
for group in self.param_groups:
for p in group["params"]:
gradient_clipper(p)
super(type(self), self).step(closure)
OptimizerWithGradientClip = type(
optimizer_type.__name__ + "WithGradientClip",
(optimizer_type,),
{"step": optimizer_wgc_step},
)
return OptimizerWithGradientClip
def maybe_add_gradient_clipping(
cfg: CfgNode, optimizer: torch.optim.Optimizer
) -> torch.optim.Optimizer:
"""
If gradient clipping is enabled through config options, wraps the existing
optimizer instance of some type OptimizerType to become an instance
of the new dynamically created class OptimizerTypeWithGradientClip
that inherits OptimizerType and overrides the `step` method to
include gradient clipping.
Args:
cfg: CfgNode
configuration options
optimizer: torch.optim.Optimizer
existing optimizer instance
Return:
optimizer: torch.optim.Optimizer
either the unmodified optimizer instance (if gradient clipping is
disabled), or the same instance with adjusted __class__ to override
the `step` method and include gradient clipping
"""
if not cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
return optimizer
grad_clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS)
OptimizerWithGradientClip = _generate_optimizer_class_with_gradient_clipping(
type(optimizer), grad_clipper
)
optimizer.__class__ = OptimizerWithGradientClip
return optimizer
def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
"""
Build an optimizer from config.
"""
norm_module_types = (
torch.nn.BatchNorm1d,
torch.nn.BatchNorm2d,
torch.nn.BatchNorm3d,
torch.nn.SyncBatchNorm,
# NaiveSyncBatchNorm inherits from BatchNorm2d
torch.nn.GroupNorm,
torch.nn.InstanceNorm1d,
torch.nn.InstanceNorm2d,
torch.nn.InstanceNorm3d,
torch.nn.LayerNorm,
torch.nn.LocalResponseNorm,
)
params: List[Dict[str, Any]] = []
memo: Set[torch.nn.parameter.Parameter] = set()
for module in model.modules():
for key, value in module.named_parameters(recurse=False):
if not value.requires_grad:
continue
# Avoid duplicating parameters
if value in memo:
continue
memo.add(value)
lr = cfg.SOLVER.BASE_LR
weight_decay = cfg.SOLVER.WEIGHT_DECAY
if isinstance(module, norm_module_types):
weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
elif key == "bias":
# NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
# and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
# hyperparameters are by default exactly the same as for regular
# weights.
lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
optimizer = torch.optim.SGD(
params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV
)
optimizer = maybe_add_gradient_clipping(cfg, optimizer)
return optimizer
def build_lr_scheduler(
cfg: CfgNode, optimizer: torch.optim.Optimizer
) -> torch.optim.lr_scheduler._LRScheduler:
"""
Build a LR scheduler from config.
"""
name = cfg.SOLVER.LR_SCHEDULER_NAME
if name == "WarmupMultiStepLR":
return WarmupMultiStepLR(
optimizer,
cfg.SOLVER.STEPS,
cfg.SOLVER.GAMMA,
warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
warmup_iters=cfg.SOLVER.WARMUP_ITERS,
warmup_method=cfg.SOLVER.WARMUP_METHOD,
)
elif name == "WarmupCosineLR":
return WarmupCosineLR(
optimizer,
cfg.SOLVER.MAX_ITER,
warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
warmup_iters=cfg.SOLVER.WARMUP_ITERS,
warmup_method=cfg.SOLVER.WARMUP_METHOD,
)
else:
raise ValueError("Unknown LR scheduler: {}".format(name))
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from bisect import bisect_right
from typing import List
import torch
# NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes
# only on epoch boundaries. We typically use iteration based schedules instead.
# As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean
# "iteration" instead.
# FIXME: ideally this would be achieved with a CombinedLRScheduler, separating
# MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it.
class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
def __init__(
self,
optimizer: torch.optim.Optimizer,
milestones: List[int],
gamma: float = 0.1,
warmup_factor: float = 0.001,
warmup_iters: int = 1000,
warmup_method: str = "linear",
last_epoch: int = -1,
):
if not list(milestones) == sorted(milestones):
raise ValueError(
"Milestones should be a list of" " increasing integers. Got {}", milestones
)
self.milestones = milestones
self.gamma = gamma
self.warmup_factor = warmup_factor
self.warmup_iters = warmup_iters
self.warmup_method = warmup_method
super().__init__(optimizer, last_epoch)
def get_lr(self) -> List[float]:
warmup_factor = _get_warmup_factor_at_iter(
self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
)
return [
base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch)
for base_lr in self.base_lrs
]
def _compute_values(self) -> List[float]:
# The new interface
return self.get_lr()
class WarmupCosineLR(torch.optim.lr_scheduler._LRScheduler):
def __init__(
self,
optimizer: torch.optim.Optimizer,
max_iters: int,
warmup_factor: float = 0.001,
warmup_iters: int = 1000,
warmup_method: str = "linear",
last_epoch: int = -1,
):
self.max_iters = max_iters
self.warmup_factor = warmup_factor
self.warmup_iters = warmup_iters
self.warmup_method = warmup_method
super().__init__(optimizer, last_epoch)
def get_lr(self) -> List[float]:
warmup_factor = _get_warmup_factor_at_iter(
self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
)
# Different definitions of half-cosine with warmup are possible. For
# simplicity we multiply the standard half-cosine schedule by the warmup
# factor. An alternative is to start the period of the cosine at warmup_iters
# instead of at 0. In the case that warmup_iters << max_iters the two are
# very close to each other.
return [
base_lr
* warmup_factor
* 0.5
* (1.0 + math.cos(math.pi * self.last_epoch / self.max_iters))
for base_lr in self.base_lrs
]
def _compute_values(self) -> List[float]:
# The new interface
return self.get_lr()
def _get_warmup_factor_at_iter(
method: str, iter: int, warmup_iters: int, warmup_factor: float
) -> float:
"""
Return the learning rate warmup factor at a specific iteration.
See :paper:`in1k1h` for more details.
Args:
method (str): warmup method; either "constant" or "linear".
iter (int): iteration at which to calculate the warmup factor.
warmup_iters (int): the number of warmup iterations.
warmup_factor (float): the base warmup factor (the meaning changes according
to the method used).
Returns:
float: the effective warmup factor at the given iteration.
"""
if iter >= warmup_iters:
return 1.0
if method == "constant":
return warmup_factor
elif method == "linear":
alpha = iter / warmup_iters
return warmup_factor * (1 - alpha) + alpha
else:
raise ValueError("Unknown warmup method: {}".format(method))
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .boxes import Boxes, BoxMode, pairwise_iou
from .image_list import ImageList
from .instances import Instances
from .keypoints import Keypoints, heatmaps_to_keypoints
from .masks import BitMasks, PolygonMasks, rasterize_polygons_within_box, polygons_to_bitmask
from .rotated_boxes import RotatedBoxes
from .rotated_boxes import pairwise_iou as pairwise_iou_rotated
__all__ = [k for k in globals().keys() if not k.startswith("_")]
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import numpy as np
from enum import IntEnum, unique
from typing import Iterator, List, Tuple, Union
import torch
_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray]
@unique
class BoxMode(IntEnum):
"""
Enum of different ways to represent a box.
"""
XYXY_ABS = 0
"""
(x0, y0, x1, y1) in absolute floating points coordinates.
The coordinates in range [0, width or height].
"""
XYWH_ABS = 1
"""
(x0, y0, w, h) in absolute floating points coordinates.
"""
XYXY_REL = 2
"""
Not yet supported!
(x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image.
"""
XYWH_REL = 3
"""
Not yet supported!
(x0, y0, w, h) in range [0, 1]. They are relative to the size of the image.
"""
XYWHA_ABS = 4
"""
(xc, yc, w, h, a) in absolute floating points coordinates.
(xc, yc) is the center of the rotated box, and the angle a is in degrees ccw.
"""
@staticmethod
def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType:
"""
Args:
box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5
from_mode, to_mode (BoxMode)
Returns:
The converted box of the same type.
"""
if from_mode == to_mode:
return box
original_type = type(box)
is_numpy = isinstance(box, np.ndarray)
single_box = isinstance(box, (list, tuple))
if single_box:
assert len(box) == 4 or len(box) == 5, (
"BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor,"
" where k == 4 or 5"
)
arr = torch.tensor(box)[None, :]
else:
# avoid modifying the input box
if is_numpy:
arr = torch.from_numpy(np.asarray(box)).clone()
else:
arr = box.clone()
assert to_mode.value not in [
BoxMode.XYXY_REL,
BoxMode.XYWH_REL,
] and from_mode.value not in [
BoxMode.XYXY_REL,
BoxMode.XYWH_REL,
], "Relative mode not yet supported!"
if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS:
assert (
arr.shape[-1] == 5
), "The last dimension of input shape must be 5 for XYWHA format"
original_dtype = arr.dtype
arr = arr.double()
w = arr[:, 2]
h = arr[:, 3]
a = arr[:, 4]
c = torch.abs(torch.cos(a * math.pi / 180.0))
s = torch.abs(torch.sin(a * math.pi / 180.0))
# This basically computes the horizontal bounding rectangle of the rotated box
new_w = c * w + s * h
new_h = c * h + s * w
# convert center to top-left corner
arr[:, 0] -= new_w / 2.0
arr[:, 1] -= new_h / 2.0
# bottom-right corner
arr[:, 2] = arr[:, 0] + new_w
arr[:, 3] = arr[:, 1] + new_h
arr = arr[:, :4].to(dtype=original_dtype)
elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS:
original_dtype = arr.dtype
arr = arr.double()
arr[:, 0] += arr[:, 2] / 2.0
arr[:, 1] += arr[:, 3] / 2.0
angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype)
arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype)
else:
if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS:
arr[:, 2] += arr[:, 0]
arr[:, 3] += arr[:, 1]
elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS:
arr[:, 2] -= arr[:, 0]
arr[:, 3] -= arr[:, 1]
else:
raise NotImplementedError(
"Conversion from BoxMode {} to {} is not supported yet".format(
from_mode, to_mode
)
)
if single_box:
return original_type(arr.flatten().tolist())
if is_numpy:
return arr.numpy()
else:
return arr
class Boxes:
"""
This structure stores a list of boxes as a Nx4 torch.Tensor.
It supports some common methods about boxes
(`area`, `clip`, `nonempty`, etc),
and also behaves like a Tensor
(support indexing, `to(device)`, `.device`, and iteration over all boxes)
Attributes:
tensor (torch.Tensor): float matrix of Nx4. Each row is (x1, y1, x2, y2).
"""
BoxSizeType = Union[List[int], Tuple[int, int]]
def __init__(self, tensor: torch.Tensor):
"""
Args:
tensor (Tensor[float]): a Nx4 matrix. Each row is (x1, y1, x2, y2).
"""
device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that does not depend on
# the inputs (and consequently confuses jit)
tensor = tensor.reshape((0, 4)).to(dtype=torch.float32, device=device)
assert tensor.dim() == 2 and tensor.size(-1) == 4, tensor.size()
self.tensor = tensor
def clone(self) -> "Boxes":
"""
Clone the Boxes.
Returns:
Boxes
"""
return Boxes(self.tensor.clone())
def to(self, device: str) -> "Boxes":
return Boxes(self.tensor.to(device))
def area(self) -> torch.Tensor:
"""
Computes the area of all the boxes.
Returns:
torch.Tensor: a vector with areas of each box.
"""
box = self.tensor
area = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1])
return area
def clip(self, box_size: BoxSizeType) -> None:
"""
Clip (in place) the boxes by limiting x coordinates to the range [0, width]
and y coordinates to the range [0, height].
Args:
box_size (height, width): The clipping box's size.
"""
assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!"
h, w = box_size
self.tensor[:, 0].clamp_(min=0, max=w)
self.tensor[:, 1].clamp_(min=0, max=h)
self.tensor[:, 2].clamp_(min=0, max=w)
self.tensor[:, 3].clamp_(min=0, max=h)
def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
"""
Find boxes that are non-empty.
A box is considered empty, if either of its side is no larger than threshold.
Returns:
Tensor:
a binary vector which represents whether each box is empty
(False) or non-empty (True).
"""
box = self.tensor
widths = box[:, 2] - box[:, 0]
heights = box[:, 3] - box[:, 1]
keep = (widths > threshold) & (heights > threshold)
return keep
def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Boxes":
"""
Returns:
Boxes: Create a new :class:`Boxes` by indexing.
The following usage are allowed:
1. `new_boxes = boxes[3]`: return a `Boxes` which contains only one box.
2. `new_boxes = boxes[2:10]`: return a slice of boxes.
3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor
with `length = len(boxes)`. Nonzero elements in the vector will be selected.
Note that the returned Boxes might share storage with this Boxes,
subject to Pytorch's indexing semantics.
"""
if isinstance(item, int):
return Boxes(self.tensor[item].view(1, -1))
b = self.tensor[item]
assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item)
return Boxes(b)
def __len__(self) -> int:
return self.tensor.shape[0]
def __repr__(self) -> str:
return "Boxes(" + str(self.tensor) + ")"
def inside_box(self, box_size: BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor:
"""
Args:
box_size (height, width): Size of the reference box.
boundary_threshold (int): Boxes that extend beyond the reference box
boundary by more than boundary_threshold are considered "outside".
Returns:
a binary vector, indicating whether each box is inside the reference box.
"""
height, width = box_size
inds_inside = (
(self.tensor[..., 0] >= -boundary_threshold)
& (self.tensor[..., 1] >= -boundary_threshold)
& (self.tensor[..., 2] < width + boundary_threshold)
& (self.tensor[..., 3] < height + boundary_threshold)
)
return inds_inside
def get_centers(self) -> torch.Tensor:
"""
Returns:
The box centers in a Nx2 array of (x, y).
"""
return (self.tensor[:, :2] + self.tensor[:, 2:]) / 2
def scale(self, scale_x: float, scale_y: float) -> None:
"""
Scale the box with horizontal and vertical scaling factors
"""
self.tensor[:, 0::2] *= scale_x
self.tensor[:, 1::2] *= scale_y
@classmethod
def cat(cls, boxes_list: List["Boxes"]) -> "Boxes":
"""
Concatenates a list of Boxes into a single Boxes
Arguments:
boxes_list (list[Boxes])
Returns:
Boxes: the concatenated Boxes
"""
assert isinstance(boxes_list, (list, tuple))
if len(boxes_list) == 0:
return cls(torch.empty(0))
assert all(isinstance(box, Boxes) for box in boxes_list)
# use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0))
return cat_boxes
@property
def device(self) -> torch.device:
return self.tensor.device
def __iter__(self) -> Iterator[torch.Tensor]:
"""
Yield a box as a Tensor of shape (4,) at a time.
"""
yield from self.tensor
# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
def pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
"""
Given two lists of boxes of size N and M,
compute the IoU (intersection over union)
between __all__ N x M pairs of boxes.
The box order must be (xmin, ymin, xmax, ymax).
Args:
boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
Returns:
Tensor: IoU, sized [N,M].
"""
area1 = boxes1.area()
area2 = boxes2.area()
boxes1, boxes2 = boxes1.tensor, boxes2.tensor
width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max(
boxes1[:, None, :2], boxes2[:, :2]
) # [N,M,2]
width_height.clamp_(min=0) # [N,M,2]
inter = width_height.prod(dim=2) # [N,M]
del width_height
# handle empty boxes
iou = torch.where(
inter > 0,
inter / (area1[:, None] + area2 - inter),
torch.zeros(1, dtype=inter.dtype, device=inter.device),
)
return iou
def matched_boxlist_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
"""
Compute pairwise intersection over union (IOU) of two sets of matched
boxes. The box order must be (xmin, ymin, xmax, ymax).
Similar to boxlist_iou, but computes only diagonal elements of the matrix
Arguments:
boxes1: (Boxes) bounding boxes, sized [N,4].
boxes2: (Boxes) bounding boxes, sized [N,4].
Returns:
(tensor) iou, sized [N].
"""
assert len(boxes1) == len(
boxes2
), "boxlists should have the same" "number of entries, got {}, {}".format(
len(boxes1), len(boxes2)
)
area1 = boxes1.area() # [N]
area2 = boxes2.area() # [N]
box1, box2 = boxes1.tensor, boxes2.tensor
lt = torch.max(box1[:, :2], box2[:, :2]) # [N,2]
rb = torch.min(box1[:, 2:], box2[:, 2:]) # [N,2]
wh = (rb - lt).clamp(min=0) # [N,2]
inter = wh[:, 0] * wh[:, 1] # [N]
iou = inter / (area1 + area2 - inter) # [N]
return iou
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import division
from typing import Any, List, Sequence, Tuple, Union
import torch
from torch.nn import functional as F
class ImageList(object):
"""
Structure that holds a list of images (of possibly
varying sizes) as a single tensor.
This works by padding the images to the same size,
and storing in a field the original sizes of each image
Attributes:
image_sizes (list[tuple[int, int]]): each tuple is (h, w)
"""
def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]):
"""
Arguments:
tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1
image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can
be smaller than (H, W) due to padding.
"""
self.tensor = tensor
self.image_sizes = image_sizes
def __len__(self) -> int:
return len(self.image_sizes)
def __getitem__(self, idx: Union[int, slice]) -> torch.Tensor:
"""
Access the individual image in its original size.
Returns:
Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1
"""
size = self.image_sizes[idx]
return self.tensor[idx, ..., : size[0], : size[1]] # type: ignore
def to(self, *args: Any, **kwargs: Any) -> "ImageList":
cast_tensor = self.tensor.to(*args, **kwargs)
return ImageList(cast_tensor, self.image_sizes)
@property
def device(self) -> torch.device:
return self.tensor.device
@staticmethod
def from_tensors(
tensors: Sequence[torch.Tensor], size_divisibility: int = 0, pad_value: float = 0.0
) -> "ImageList":
"""
Args:
tensors: a tuple or list of `torch.Tensors`, each of shape (Hi, Wi) or
(C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded
to the same shape with `pad_value`.
size_divisibility (int): If `size_divisibility > 0`, add padding to ensure
the common height and width is divisible by `size_divisibility`.
This depends on the model and many models need a divisibility of 32.
pad_value (float): value to pad
Returns:
an `ImageList`.
"""
assert len(tensors) > 0
assert isinstance(tensors, (tuple, list))
for t in tensors:
assert isinstance(t, torch.Tensor), type(t)
assert t.shape[1:-2] == tensors[0].shape[1:-2], t.shape
# per dimension maximum (H, W) or (C_1, ..., C_K, H, W) where K >= 1 among all tensors
max_size = (
# In tracing mode, x.shape[i] is Tensor, and should not be converted
# to int: this will cause the traced graph to have hard-coded shapes.
# Instead we should make max_size a Tensor that depends on these tensors.
# Using torch.stack twice seems to be the best way to convert
# list[list[ScalarTensor]] to a Tensor
torch.stack(
[
torch.stack([torch.as_tensor(dim) for dim in size])
for size in [tuple(img.shape) for img in tensors]
]
)
.max(0)
.values
)
if size_divisibility > 0:
stride = size_divisibility
# the last two dims are H,W, both subject to divisibility requirement
max_size = torch.cat([max_size[:-2], (max_size[-2:] + (stride - 1)) // stride * stride])
image_sizes = [tuple(im.shape[-2:]) for im in tensors]
if len(tensors) == 1:
# This seems slightly (2%) faster.
# TODO: check whether it's faster for multiple images as well
image_size = image_sizes[0]
padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]]
if all(x == 0 for x in padding_size): # https://github.com/pytorch/pytorch/issues/31734
batched_imgs = tensors[0].unsqueeze(0)
else:
padded = F.pad(tensors[0], padding_size, value=pad_value)
batched_imgs = padded.unsqueeze_(0)
else:
# max_size can be a tensor in tracing mode, therefore use tuple()
batch_shape = (len(tensors),) + tuple(max_size)
batched_imgs = tensors[0].new_full(batch_shape, pad_value)
for img, pad_img in zip(tensors, batched_imgs):
pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
return ImageList(batched_imgs.contiguous(), image_sizes)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import itertools
from typing import Any, Dict, List, Tuple, Union
import torch
class Instances:
"""
This class represents a list of instances in an image.
It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields".
All fields must have the same ``__len__`` which is the number of instances.
All other (non-field) attributes of this class are considered private:
they must start with '_' and are not modifiable by a user.
Some basic usage:
1. Set/Get a field:
.. code-block:: python
instances.gt_boxes = Boxes(...)
print(instances.pred_masks) # a tensor of shape (N, H, W)
print('gt_masks' in instances)
2. ``len(instances)`` returns the number of instances
3. Indexing: ``instances[indices]`` will apply the indexing on all the fields
and returns a new :class:`Instances`.
Typically, ``indices`` is a integer vector of indices,
or a binary mask of length ``num_instances``,
"""
def __init__(self, image_size: Tuple[int, int], **kwargs: Any):
"""
Args:
image_size (height, width): the spatial size of the image.
kwargs: fields to add to this `Instances`.
"""
self._image_size = image_size
self._fields: Dict[str, Any] = {}
for k, v in kwargs.items():
self.set(k, v)
@property
def image_size(self) -> Tuple[int, int]:
"""
Returns:
tuple: height, width
"""
return self._image_size
def __setattr__(self, name: str, val: Any) -> None:
if name.startswith("_"):
super().__setattr__(name, val)
else:
self.set(name, val)
def __getattr__(self, name: str) -> Any:
if name == "_fields" or name not in self._fields:
raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
return self._fields[name]
def set(self, name: str, value: Any) -> None:
"""
Set the field named `name` to `value`.
The length of `value` must be the number of instances,
and must agree with other existing fields in this object.
"""
data_len = len(value)
if len(self._fields):
assert (
len(self) == data_len
), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
self._fields[name] = value
def has(self, name: str) -> bool:
"""
Returns:
bool: whether the field called `name` exists.
"""
return name in self._fields
def remove(self, name: str) -> None:
"""
Remove the field called `name`.
"""
del self._fields[name]
def get(self, name: str) -> Any:
"""
Returns the field called `name`.
"""
return self._fields[name]
def get_fields(self) -> Dict[str, Any]:
"""
Returns:
dict: a dict which maps names (str) to data of the fields
Modifying the returned dict will modify this instance.
"""
return self._fields
# Tensor-like methods
def to(self, device: str) -> "Instances":
"""
Returns:
Instances: all fields are called with a `to(device)`, if the field has this method.
"""
ret = Instances(self._image_size)
for k, v in self._fields.items():
if hasattr(v, "to"):
v = v.to(device)
ret.set(k, v)
return ret
def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances":
"""
Args:
item: an index-like object and will be used to index all the fields.
Returns:
If `item` is a string, return the data in the corresponding field.
Otherwise, returns an `Instances` where all fields are indexed by `item`.
"""
if type(item) == int:
if item >= len(self) or item < -len(self):
raise IndexError("Instances index out of range!")
else:
item = slice(item, None, len(self))
ret = Instances(self._image_size)
for k, v in self._fields.items():
ret.set(k, v[item])
return ret
def __len__(self) -> int:
for v in self._fields.values():
return len(v)
raise NotImplementedError("Empty Instances does not support __len__!")
def __iter__(self):
raise NotImplementedError("`Instances` object is not iterable!")
@staticmethod
def cat(instance_lists: List["Instances"]) -> "Instances":
"""
Args:
instance_lists (list[Instances])
Returns:
Instances
"""
assert all(isinstance(i, Instances) for i in instance_lists)
assert len(instance_lists) > 0
if len(instance_lists) == 1:
return instance_lists[0]
image_size = instance_lists[0].image_size
for i in instance_lists[1:]:
assert i.image_size == image_size
ret = Instances(image_size)
for k in instance_lists[0]._fields.keys():
values = [i.get(k) for i in instance_lists]
v0 = values[0]
if isinstance(v0, torch.Tensor):
values = torch.cat(values, dim=0)
elif isinstance(v0, list):
values = list(itertools.chain(*values))
elif hasattr(type(v0), "cat"):
values = type(v0).cat(values)
else:
raise ValueError("Unsupported type {} for concatenation".format(type(v0)))
ret.set(k, values)
return ret
def __str__(self) -> str:
s = self.__class__.__name__ + "("
s += "num_instances={}, ".format(len(self))
s += "image_height={}, ".format(self._image_size[0])
s += "image_width={}, ".format(self._image_size[1])
s += "fields=[{}])".format(", ".join((f"{k}: {v}" for k, v in self._fields.items())))
return s
__repr__ = __str__
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
from typing import Any, List, Tuple, Union
import torch
from detectron2.layers import interpolate
class Keypoints:
"""
Stores keypoint annotation data. GT Instances have a `gt_keypoints` property
containing the x,y location and visibility flag of each keypoint. This tensor has shape
(N, K, 3) where N is the number of instances and K is the number of keypoints per instance.
The visibility flag follows the COCO format and must be one of three integers:
* v=0: not labeled (in which case x=y=0)
* v=1: labeled but not visible
* v=2: labeled and visible
"""
def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]):
"""
Arguments:
keypoints: A Tensor, numpy array, or list of the x, y, and visibility of each keypoint.
The shape should be (N, K, 3) where N is the number of
instances, and K is the number of keypoints per instance.
"""
device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device("cpu")
keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device)
assert keypoints.dim() == 3 and keypoints.shape[2] == 3, keypoints.shape
self.tensor = keypoints
def __len__(self) -> int:
return self.tensor.size(0)
def to(self, *args: Any, **kwargs: Any) -> "Keypoints":
return type(self)(self.tensor.to(*args, **kwargs))
@property
def device(self) -> torch.device:
return self.tensor.device
def to_heatmap(self, boxes: torch.Tensor, heatmap_size: int) -> torch.Tensor:
"""
Arguments:
boxes: Nx4 tensor, the boxes to draw the keypoints to
Returns:
heatmaps:
A tensor of shape (N, K) containing an integer spatial label
in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
valid:
A tensor of shape (N, K) containing whether each keypoint is in the roi or not.
"""
return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size)
def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints":
"""
Create a new `Keypoints` by indexing on this `Keypoints`.
The following usage are allowed:
1. `new_kpts = kpts[3]`: return a `Keypoints` which contains only one instance.
2. `new_kpts = kpts[2:10]`: return a slice of key points.
3. `new_kpts = kpts[vector]`, where vector is a torch.ByteTensor
with `length = len(kpts)`. Nonzero elements in the vector will be selected.
Note that the returned Keypoints might share storage with this Keypoints,
subject to Pytorch's indexing semantics.
"""
if isinstance(item, int):
return Keypoints([self.tensor[item]])
return Keypoints(self.tensor[item])
def __repr__(self) -> str:
s = self.__class__.__name__ + "("
s += "num_instances={})".format(len(self.tensor))
return s
# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop)
def _keypoints_to_heatmap(
keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space.
Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the
closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the
continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"):
d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
Arguments:
keypoints: tensor of keypoint locations in of shape (N, K, 3).
rois: Nx4 tensor of rois in xyxy format
heatmap_size: integer side length of square heatmap.
Returns:
heatmaps: A tensor of shape (N, K) containing an integer spatial label
in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
valid: A tensor of shape (N, K) containing whether each keypoint is in
the roi or not.
"""
if rois.numel() == 0:
return rois.new().long(), rois.new().long()
offset_x = rois[:, 0]
offset_y = rois[:, 1]
scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])
offset_x = offset_x[:, None]
offset_y = offset_y[:, None]
scale_x = scale_x[:, None]
scale_y = scale_y[:, None]
x = keypoints[..., 0]
y = keypoints[..., 1]
x_boundary_inds = x == rois[:, 2][:, None]
y_boundary_inds = y == rois[:, 3][:, None]
x = (x - offset_x) * scale_x
x = x.floor().long()
y = (y - offset_y) * scale_y
y = y.floor().long()
x[x_boundary_inds] = heatmap_size - 1
y[y_boundary_inds] = heatmap_size - 1
valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
vis = keypoints[..., 2] > 0
valid = (valid_loc & vis).long()
lin_ind = y * heatmap_size + x
heatmaps = lin_ind * valid
return heatmaps, valid
@torch.no_grad()
def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
"""
Extract predicted keypoint locations from heatmaps.
Args:
maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for
each ROI and each keypoint.
rois (Tensor): (#ROIs, 4). The box of each ROI.
Returns:
Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to
(x, y, logit, score) for each keypoint.
When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate,
we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from
Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
"""
offset_x = rois[:, 0]
offset_y = rois[:, 1]
widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
widths_ceil = widths.ceil()
heights_ceil = heights.ceil()
num_rois, num_keypoints = maps.shape[:2]
xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)
width_corrections = widths / widths_ceil
height_corrections = heights / heights_ceil
keypoints_idx = torch.arange(num_keypoints, device=maps.device)
for i in range(num_rois):
outsize = (int(heights_ceil[i]), int(widths_ceil[i]))
roi_map = interpolate(maps[[i]], size=outsize, mode="bicubic", align_corners=False).squeeze(
0
) # #keypoints x H x W
# softmax over the spatial region
max_score, _ = roi_map.view(num_keypoints, -1).max(1)
max_score = max_score.view(num_keypoints, 1, 1)
tmp_full_resolution = (roi_map - max_score).exp_()
tmp_pool_resolution = (maps[i] - max_score).exp_()
# Produce scores over the region H x W, but normalize with POOL_H x POOL_W,
# so that the scores of objects of different absolute sizes will be more comparable
roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True)
w = roi_map.shape[2]
pos = roi_map.view(num_keypoints, -1).argmax(1)
x_int = pos % w
y_int = (pos - x_int) // w
assert (
roi_map_scores[keypoints_idx, y_int, x_int]
== roi_map_scores.view(num_keypoints, -1).max(1)[0]
).all()
x = (x_int.float() + 0.5) * width_corrections[i]
y = (y_int.float() + 0.5) * height_corrections[i]
xy_preds[i, :, 0] = x + offset_x[i]
xy_preds[i, :, 1] = y + offset_y[i]
xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int]
xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int]
return xy_preds
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import itertools
import numpy as np
from typing import Any, Iterator, List, Union
import pycocotools.mask as mask_utils
import torch
from detectron2.layers.roi_align import ROIAlign
from .boxes import Boxes
def polygon_area(x, y):
# Using the shoelace formula
# https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
def polygons_to_bitmask(polygons: List[np.ndarray], height: int, width: int) -> np.ndarray:
"""
Args:
polygons (list[ndarray]): each array has shape (Nx2,)
height, width (int)
Returns:
ndarray: a bool mask of shape (height, width)
"""
assert len(polygons) > 0, "COCOAPI does not support empty polygons"
rles = mask_utils.frPyObjects(polygons, height, width)
rle = mask_utils.merge(rles)
return mask_utils.decode(rle).astype(np.bool)
def rasterize_polygons_within_box(
polygons: List[np.ndarray], box: np.ndarray, mask_size: int
) -> torch.Tensor:
"""
Rasterize the polygons into a mask image and
crop the mask content in the given box.
The cropped mask is resized to (mask_size, mask_size).
This function is used when generating training targets for mask head in Mask R-CNN.
Given original ground-truth masks for an image, new ground-truth mask
training targets in the size of `mask_size x mask_size`
must be provided for each predicted box. This function will be called to
produce such targets.
Args:
polygons (list[ndarray[float]]): a list of polygons, which represents an instance.
box: 4-element numpy array
mask_size (int):
Returns:
Tensor: BoolTensor of shape (mask_size, mask_size)
"""
# 1. Shift the polygons w.r.t the boxes
w, h = box[2] - box[0], box[3] - box[1]
polygons = copy.deepcopy(polygons)
for p in polygons:
p[0::2] = p[0::2] - box[0]
p[1::2] = p[1::2] - box[1]
# 2. Rescale the polygons to the new box size
# max() to avoid division by small number
ratio_h = mask_size / max(h, 0.1)
ratio_w = mask_size / max(w, 0.1)
if ratio_h == ratio_w:
for p in polygons:
p *= ratio_h
else:
for p in polygons:
p[0::2] *= ratio_w
p[1::2] *= ratio_h
# 3. Rasterize the polygons with coco api
mask = polygons_to_bitmask(polygons, mask_size, mask_size)
mask = torch.from_numpy(mask)
return mask
class BitMasks:
"""
This class stores the segmentation masks for all objects in one image, in
the form of bitmaps.
Attributes:
tensor: bool Tensor of N,H,W, representing N instances in the image.
"""
def __init__(self, tensor: Union[torch.Tensor, np.ndarray]):
"""
Args:
tensor: bool Tensor of N,H,W, representing N instances in the image.
"""
device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
tensor = torch.as_tensor(tensor, dtype=torch.bool, device=device)
assert tensor.dim() == 3, tensor.size()
self.image_size = tensor.shape[1:]
self.tensor = tensor
def to(self, device: str) -> "BitMasks":
return BitMasks(self.tensor.to(device))
@property
def device(self) -> torch.device:
return self.tensor.device
def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks":
"""
Returns:
BitMasks: Create a new :class:`BitMasks` by indexing.
The following usage are allowed:
1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask.
2. `new_masks = masks[2:10]`: return a slice of masks.
3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor
with `length = len(masks)`. Nonzero elements in the vector will be selected.
Note that the returned object might share storage with this object,
subject to Pytorch's indexing semantics.
"""
if isinstance(item, int):
return BitMasks(self.tensor[item].view(1, -1))
m = self.tensor[item]
assert m.dim() == 3, "Indexing on BitMasks with {} returns a tensor with shape {}!".format(
item, m.shape
)
return BitMasks(m)
def __iter__(self) -> torch.Tensor:
yield from self.tensor
def __repr__(self) -> str:
s = self.__class__.__name__ + "("
s += "num_instances={})".format(len(self.tensor))
return s
def __len__(self) -> int:
return self.tensor.shape[0]
def nonempty(self) -> torch.Tensor:
"""
Find masks that are non-empty.
Returns:
Tensor: a BoolTensor which represents
whether each mask is empty (False) or non-empty (True).
"""
return self.tensor.flatten(1).any(dim=1)
@staticmethod
def from_polygon_masks(
polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]], height: int, width: int
) -> "BitMasks":
"""
Args:
polygon_masks (list[list[ndarray]] or PolygonMasks)
height, width (int)
"""
if isinstance(polygon_masks, PolygonMasks):
polygon_masks = polygon_masks.polygons
masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks]
return BitMasks(torch.stack([torch.from_numpy(x) for x in masks]))
def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
"""
Crop each bitmask by the given box, and resize results to (mask_size, mask_size).
This can be used to prepare training targets for Mask R-CNN.
It has less reconstruction error compared to rasterization with polygons.
However we observe no difference in accuracy,
but BitMasks requires more memory to store all the masks.
Args:
boxes (Tensor): Nx4 tensor storing the boxes for each mask
mask_size (int): the size of the rasterized mask.
Returns:
Tensor:
A bool tensor of shape (N, mask_size, mask_size), where
N is the number of predicted boxes for this image.
"""
assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
device = self.tensor.device
batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None]
rois = torch.cat([batch_inds, boxes], dim=1) # Nx5
bit_masks = self.tensor.to(dtype=torch.float32)
rois = rois.to(device=device)
output = (
ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True)
.forward(bit_masks[:, None, :, :], rois)
.squeeze(1)
)
output = output >= 0.5
return output
def get_bounding_boxes(self) -> None:
# not needed now
raise NotImplementedError
@staticmethod
def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks":
"""
Concatenates a list of BitMasks into a single BitMasks
Arguments:
bitmasks_list (list[BitMasks])
Returns:
BitMasks: the concatenated BitMasks
"""
assert isinstance(bitmasks_list, (list, tuple))
assert len(bitmasks_list) > 0
assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list)
cat_bitmasks = type(bitmasks_list[0])(torch.cat([bm.tensor for bm in bitmasks_list], dim=0))
return cat_bitmasks
class PolygonMasks:
"""
This class stores the segmentation masks for all objects in one image, in the form of polygons.
Attributes:
polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon.
"""
def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]):
"""
Arguments:
polygons (list[list[np.ndarray]]): The first
level of the list correspond to individual instances,
the second level to all the polygons that compose the
instance, and the third level to the polygon coordinates.
The third level array should have the format of
[x0, y0, x1, y1, ..., xn, yn] (n >= 3).
"""
assert isinstance(polygons, list), (
"Cannot create PolygonMasks: Expect a list of list of polygons per image. "
"Got '{}' instead.".format(type(polygons))
)
def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
# Use float64 for higher precision, because why not?
# Always put polygons on CPU (self.to is a no-op) since they
# are supposed to be small tensors.
# May need to change this assumption if GPU placement becomes useful
if isinstance(t, torch.Tensor):
t = t.cpu().numpy()
return np.asarray(t).astype("float64")
def process_polygons(
polygons_per_instance: List[Union[torch.Tensor, np.ndarray]]
) -> List[np.ndarray]:
assert isinstance(polygons_per_instance, list), (
"Cannot create polygons: Expect a list of polygons per instance. "
"Got '{}' instead.".format(type(polygons_per_instance))
)
# transform the polygon to a tensor
polygons_per_instance = [_make_array(p) for p in polygons_per_instance]
for polygon in polygons_per_instance:
assert len(polygon) % 2 == 0 and len(polygon) >= 6
return polygons_per_instance
self.polygons: List[List[np.ndarray]] = [
process_polygons(polygons_per_instance) for polygons_per_instance in polygons
]
def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks":
return self
@property
def device(self) -> torch.device:
return torch.device("cpu")
def get_bounding_boxes(self) -> Boxes:
"""
Returns:
Boxes: tight bounding boxes around polygon masks.
"""
boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32)
for idx, polygons_per_instance in enumerate(self.polygons):
minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32)
maxxy = torch.zeros(2, dtype=torch.float32)
for polygon in polygons_per_instance:
coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32)
minxy = torch.min(minxy, torch.min(coords, dim=0).values)
maxxy = torch.max(maxxy, torch.max(coords, dim=0).values)
boxes[idx, :2] = minxy
boxes[idx, 2:] = maxxy
return Boxes(boxes)
def nonempty(self) -> torch.Tensor:
"""
Find masks that are non-empty.
Returns:
Tensor:
a BoolTensor which represents whether each mask is empty (False) or not (True).
"""
keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons]
return torch.from_numpy(np.asarray(keep, dtype=np.bool))
def __getitem__(self, item: Union[int, slice, List[int], torch.BoolTensor]) -> "PolygonMasks":
"""
Support indexing over the instances and return a `PolygonMasks` object.
`item` can be:
1. An integer. It will return an object with only one instance.
2. A slice. It will return an object with the selected instances.
3. A list[int]. It will return an object with the selected instances,
correpsonding to the indices in the list.
4. A vector mask of type BoolTensor, whose length is num_instances.
It will return an object with the instances whose mask is nonzero.
"""
if isinstance(item, int):
selected_polygons = [self.polygons[item]]
elif isinstance(item, slice):
selected_polygons = self.polygons[item]
elif isinstance(item, list):
selected_polygons = [self.polygons[i] for i in item]
elif isinstance(item, torch.Tensor):
# Polygons is a list, so we have to move the indices back to CPU.
if item.dtype == torch.bool:
assert item.dim() == 1, item.shape
item = item.nonzero().squeeze(1).cpu().numpy().tolist()
elif item.dtype in [torch.int32, torch.int64]:
item = item.cpu().numpy().tolist()
else:
raise ValueError("Unsupported tensor dtype={} for indexing!".format(item.dtype))
selected_polygons = [self.polygons[i] for i in item]
return PolygonMasks(selected_polygons)
def __iter__(self) -> Iterator[List[np.ndarray]]:
"""
Yields:
list[ndarray]: the polygons for one instance.
Each Tensor is a float64 vector representing a polygon.
"""
return iter(self.polygons)
def __repr__(self) -> str:
s = self.__class__.__name__ + "("
s += "num_instances={})".format(len(self.polygons))
return s
def __len__(self) -> int:
return len(self.polygons)
def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
"""
Crop each mask by the given box, and resize results to (mask_size, mask_size).
This can be used to prepare training targets for Mask R-CNN.
Args:
boxes (Tensor): Nx4 tensor storing the boxes for each mask
mask_size (int): the size of the rasterized mask.
Returns:
Tensor: A bool tensor of shape (N, mask_size, mask_size), where
N is the number of predicted boxes for this image.
"""
assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
device = boxes.device
# Put boxes on the CPU, as the polygon representation is not efficient GPU-wise
# (several small tensors for representing a single instance mask)
boxes = boxes.to(torch.device("cpu"))
results = [
rasterize_polygons_within_box(poly, box.numpy(), mask_size)
for poly, box in zip(self.polygons, boxes)
]
"""
poly: list[list[float]], the polygons for one instance
box: a tensor of shape (4,)
"""
if len(results) == 0:
return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device)
return torch.stack(results, dim=0).to(device=device)
def area(self):
"""
Computes area of the mask.
Only works with Polygons, using the shoelace formula:
https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
Returns:
Tensor: a vector, area for each instance
"""
area = []
for polygons_per_instance in self.polygons:
area_per_instance = 0
for p in polygons_per_instance:
area_per_instance += polygon_area(p[0::2], p[1::2])
area.append(area_per_instance)
return torch.tensor(area)
@staticmethod
def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks":
"""
Concatenates a list of PolygonMasks into a single PolygonMasks
Arguments:
polymasks_list (list[PolygonMasks])
Returns:
PolygonMasks: the concatenated PolygonMasks
"""
assert isinstance(polymasks_list, (list, tuple))
assert len(polymasks_list) > 0
assert all(isinstance(polymask, PolygonMasks) for polymask in polymasks_list)
cat_polymasks = type(polymasks_list[0])(
list(itertools.chain.from_iterable(pm.polygons for pm in polymasks_list))
)
return cat_polymasks
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from typing import Iterator, Union
import torch
from detectron2.layers.rotated_boxes import pairwise_iou_rotated
from .boxes import Boxes
class RotatedBoxes(Boxes):
"""
This structure stores a list of rotated boxes as a Nx5 torch.Tensor.
It supports some common methods about boxes
(`area`, `clip`, `nonempty`, etc),
and also behaves like a Tensor
(support indexing, `to(device)`, `.device`, and iteration over all boxes)
"""
def __init__(self, tensor: torch.Tensor):
"""
Args:
tensor (Tensor[float]): a Nx5 matrix. Each row is
(x_center, y_center, width, height, angle),
in which angle is represented in degrees.
While there's no strict range restriction for it,
the recommended principal range is between [-180, 180) degrees.
Assume we have a horizontal box B = (x_center, y_center, width, height),
where width is along the x-axis and height is along the y-axis.
The rotated box B_rot (x_center, y_center, width, height, angle)
can be seen as:
1. When angle == 0:
B_rot == B
2. When angle > 0:
B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW;
3. When angle < 0:
B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW.
Mathematically, since the right-handed coordinate system for image space
is (y, x), where y is top->down and x is left->right, the 4 vertices of the
rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from
the vertices of the horizontal rectangle (y_i, x_i) (i = 1, 2, 3, 4)
in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians,
(y_c, x_c) is the center of the rectangle):
.. math::
yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c,
xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c,
which is the standard rigid-body rotation transformation.
Intuitively, the angle is
(1) the rotation angle from y-axis in image space
to the height vector (top->down in the box's local coordinate system)
of the box in CCW, and
(2) the rotation angle from x-axis in image space
to the width vector (left->right in the box's local coordinate system)
of the box in CCW.
More intuitively, consider the following horizontal box ABCD represented
in (x1, y1, x2, y2): (3, 2, 7, 4),
covering the [3, 7] x [2, 4] region of the continuous coordinate system
which looks like this:
.. code:: none
O--------> x
|
| A---B
| | |
| D---C
|
v y
Note that each capital letter represents one 0-dimensional geometric point
instead of a 'square pixel' here.
In the example above, using (x, y) to represent a point we have:
.. math::
O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4)
We name vector AB = vector DC as the width vector in box's local coordinate system, and
vector AD = vector BC as the height vector in box's local coordinate system. Initially,
when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis
in the image space, respectively.
For better illustration, we denote the center of the box as E,
.. code:: none
O--------> x
|
| A---B
| | E |
| D---C
|
v y
where the center E = ((3+7)/2, (2+4)/2) = (5, 3).
Also,
.. math::
width = |AB| = |CD| = 7 - 3 = 4,
height = |AD| = |BC| = 4 - 2 = 2.
Therefore, the corresponding representation for the same shape in rotated box in
(x_center, y_center, width, height, angle) format is:
(5, 3, 4, 2, 0),
Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees
CCW (counter-clockwise) by definition. It looks like this:
.. code:: none
O--------> x
| B-C
| | |
| |E|
| | |
| A-D
v y
The center E is still located at the same point (5, 3), while the vertices
ABCD are rotated by 90 degrees CCW with regard to E:
A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5)
Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to
vector AD or vector BC (the top->down height vector in box's local coordinate system),
or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right
width vector in box's local coordinate system).
.. math::
width = |AB| = |CD| = 5 - 1 = 4,
height = |AD| = |BC| = 6 - 4 = 2.
Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise)
by definition? It looks like this:
.. code:: none
O--------> x
| D-A
| | |
| |E|
| | |
| C-B
v y
The center E is still located at the same point (5, 3), while the vertices
ABCD are rotated by 90 degrees CW with regard to E:
A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1)
.. math::
width = |AB| = |CD| = 5 - 1 = 4,
height = |AD| = |BC| = 6 - 4 = 2.
This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU
will be 1. However, these two will generate different RoI Pooling results and
should not be treated as an identical box.
On the other hand, it's easy to see that (X, Y, W, H, A) is identical to
(X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be
identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is
equivalent to rotating the same shape 90 degrees CW.
We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180):
.. code:: none
O--------> x
|
| C---D
| | E |
| B---A
|
v y
.. math::
A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2),
width = |AB| = |CD| = 7 - 3 = 4,
height = |AD| = |BC| = 4 - 2 = 2.
Finally, this is a very inaccurate (heavily quantized) illustration of
how (5, 3, 4, 2, 60) looks like in case anyone wonders:
.. code:: none
O--------> x
| B\
| / C
| /E /
| A /
| `D
v y
It's still a rectangle with center of (5, 3), width of 4 and height of 2,
but its angle (and thus orientation) is somewhere between
(5, 3, 4, 2, 0) and (5, 3, 4, 2, 90).
"""
device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that does not depend on
# the inputs (and consequently confuses jit)
tensor = tensor.reshape((0, 5)).to(dtype=torch.float32, device=device)
assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size()
self.tensor = tensor
def clone(self) -> "RotatedBoxes":
"""
Clone the RotatedBoxes.
Returns:
RotatedBoxes
"""
return RotatedBoxes(self.tensor.clone())
def to(self, device: str) -> "RotatedBoxes":
return RotatedBoxes(self.tensor.to(device))
def area(self) -> torch.Tensor:
"""
Computes the area of all the boxes.
Returns:
torch.Tensor: a vector with areas of each box.
"""
box = self.tensor
area = box[:, 2] * box[:, 3]
return area
def normalize_angles(self) -> None:
"""
Restrict angles to the range of [-180, 180) degrees
"""
self.tensor[:, 4] = (self.tensor[:, 4] + 180.0) % 360.0 - 180.0
def clip(self, box_size: Boxes.BoxSizeType, clip_angle_threshold: float = 1.0) -> None:
"""
Clip (in place) the boxes by limiting x coordinates to the range [0, width]
and y coordinates to the range [0, height].
For RRPN:
Only clip boxes that are almost horizontal with a tolerance of
clip_angle_threshold to maintain backward compatibility.
Rotated boxes beyond this threshold are not clipped for two reasons:
1. There are potentially multiple ways to clip a rotated box to make it
fit within the image.
2. It's tricky to make the entire rectangular box fit within the image
and still be able to not leave out pixels of interest.
Therefore we rely on ops like RoIAlignRotated to safely handle this.
Args:
box_size (height, width): The clipping box's size.
clip_angle_threshold:
Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees),
we do the clipping as horizontal boxes.
"""
h, w = box_size
# normalize angles to be within (-180, 180] degrees
self.normalize_angles()
idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0]
# convert to (x1, y1, x2, y2)
x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0
y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0
x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0
y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0
# clip
x1.clamp_(min=0, max=w)
y1.clamp_(min=0, max=h)
x2.clamp_(min=0, max=w)
y2.clamp_(min=0, max=h)
# convert back to (xc, yc, w, h)
self.tensor[idx, 0] = (x1 + x2) / 2.0
self.tensor[idx, 1] = (y1 + y2) / 2.0
# make sure widths and heights do not increase due to numerical errors
self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1)
self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1)
def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
"""
Find boxes that are non-empty.
A box is considered empty, if either of its side is no larger than threshold.
Returns:
Tensor: a binary vector which represents
whether each box is empty (False) or non-empty (True).
"""
box = self.tensor
widths = box[:, 2]
heights = box[:, 3]
keep = (widths > threshold) & (heights > threshold)
return keep
def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "RotatedBoxes":
"""
Returns:
RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing.
The following usage are allowed:
1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box.
2. `new_boxes = boxes[2:10]`: return a slice of boxes.
3. `new_boxes = boxes[vector]`, where vector is a torch.ByteTensor
with `length = len(boxes)`. Nonzero elements in the vector will be selected.
Note that the returned RotatedBoxes might share storage with this RotatedBoxes,
subject to Pytorch's indexing semantics.
"""
if isinstance(item, int):
return RotatedBoxes(self.tensor[item].view(1, -1))
b = self.tensor[item]
assert b.dim() == 2, "Indexing on RotatedBoxes with {} failed to return a matrix!".format(
item
)
return RotatedBoxes(b)
def __len__(self) -> int:
return self.tensor.shape[0]
def __repr__(self) -> str:
return "RotatedBoxes(" + str(self.tensor) + ")"
def inside_box(self, box_size: Boxes.BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor:
"""
Args:
box_size (height, width): Size of the reference box covering
[0, width] x [0, height]
boundary_threshold (int): Boxes that extend beyond the reference box
boundary by more than boundary_threshold are considered "outside".
For RRPN, it might not be necessary to call this function since it's common
for rotated box to extend to outside of the image boundaries
(the clip function only clips the near-horizontal boxes)
Returns:
a binary vector, indicating whether each box is inside the reference box.
"""
height, width = box_size
cnt_x = self.tensor[..., 0]
cnt_y = self.tensor[..., 1]
half_w = self.tensor[..., 2] / 2.0
half_h = self.tensor[..., 3] / 2.0
a = self.tensor[..., 4]
c = torch.abs(torch.cos(a * math.pi / 180.0))
s = torch.abs(torch.sin(a * math.pi / 180.0))
# This basically computes the horizontal bounding rectangle of the rotated box
max_rect_dx = c * half_w + s * half_h
max_rect_dy = c * half_h + s * half_w
inds_inside = (
(cnt_x - max_rect_dx >= -boundary_threshold)
& (cnt_y - max_rect_dy >= -boundary_threshold)
& (cnt_x + max_rect_dx < width + boundary_threshold)
& (cnt_y + max_rect_dy < height + boundary_threshold)
)
return inds_inside
def get_centers(self) -> torch.Tensor:
"""
Returns:
The box centers in a Nx2 array of (x, y).
"""
return self.tensor[:, :2]
def scale(self, scale_x: float, scale_y: float) -> None:
"""
Scale the rotated box with horizontal and vertical scaling factors
Note: when scale_factor_x != scale_factor_y,
the rotated box does not preserve the rectangular shape when the angle
is not a multiple of 90 degrees under resize transformation.
Instead, the shape is a parallelogram (that has skew)
Here we make an approximation by fitting a rotated rectangle to the parallelogram.
"""
self.tensor[:, 0] *= scale_x
self.tensor[:, 1] *= scale_y
theta = self.tensor[:, 4] * math.pi / 180.0
c = torch.cos(theta)
s = torch.sin(theta)
# In image space, y is top->down and x is left->right
# Consider the local coordintate system for the rotated box,
# where the box center is located at (0, 0), and the four vertices ABCD are
# A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2)
# the midpoint of the left edge AD of the rotated box E is:
# E = (A+D)/2 = (-w / 2, 0)
# the midpoint of the top edge AB of the rotated box F is:
# F(0, -h / 2)
# To get the old coordinates in the global system, apply the rotation transformation
# (Note: the right-handed coordinate system for image space is yOx):
# (old_x, old_y) = (s * y + c * x, c * y - s * x)
# E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2)
# F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2)
# After applying the scaling factor (sfx, sfy):
# E(new) = (-sfx * c * w / 2, sfy * s * w / 2)
# F(new) = (-sfx * s * h / 2, -sfy * c * h / 2)
# The new width after scaling tranformation becomes:
# w(new) = |E(new) - O| * 2
# = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2
# = sqrt[(sfx * c)^2 + (sfy * s)^2] * w
# i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2]
#
# For example,
# when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x;
# when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y
self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2)
# h(new) = |F(new) - O| * 2
# = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2
# = sqrt[(sfx * s)^2 + (sfy * c)^2] * h
# i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2]
#
# For example,
# when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y;
# when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x
self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2)
# The angle is the rotation angle from y-axis in image space to the height
# vector (top->down in the box's local coordinate system) of the box in CCW.
#
# angle(new) = angle_yOx(O - F(new))
# = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) )
# = atan2(sfx * s * h / 2, sfy * c * h / 2)
# = atan2(sfx * s, sfy * c)
#
# For example,
# when sfx == sfy, angle(new) == atan2(s, c) == angle(old)
self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi
@property
def device(self) -> str:
return self.tensor.device
def __iter__(self) -> Iterator[torch.Tensor]:
"""
Yield a box as a Tensor of shape (5,) at a time.
"""
yield from self.tensor
def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> None:
"""
Given two lists of rotated boxes of size N and M,
compute the IoU (intersection over union)
between __all__ N x M pairs of boxes.
The box order must be (x_center, y_center, width, height, angle).
Args:
boxes1, boxes2 (RotatedBoxes):
two `RotatedBoxes`. Contains N & M rotated boxes, respectively.
Returns:
Tensor: IoU, sized [N,M].
"""
return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor)
# Utility functions
This folder contain utility functions that are not used in the
core library, but are useful for building models or training
code using the config system.
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# -*- coding: utf-8 -*-
import logging
import typing
import torch
from fvcore.nn import activation_count, flop_count, parameter_count, parameter_count_table
from torch import nn
from detectron2.structures import BitMasks, Boxes, ImageList, Instances
from .logger import log_first_n
__all__ = [
"activation_count_operators",
"flop_count_operators",
"parameter_count_table",
"parameter_count",
]
FLOPS_MODE = "flops"
ACTIVATIONS_MODE = "activations"
# some extra ops to ignore from counting.
_IGNORED_OPS = [
"aten::add",
"aten::add_",
"aten::batch_norm",
"aten::constant_pad_nd",
"aten::div",
"aten::div_",
"aten::exp",
"aten::log2",
"aten::max_pool2d",
"aten::meshgrid",
"aten::mul",
"aten::mul_",
"aten::nonzero_numpy",
"aten::relu",
"aten::relu_",
"aten::rsub",
"aten::sigmoid",
"aten::sigmoid_",
"aten::softmax",
"aten::sort",
"aten::sqrt",
"aten::sub",
"aten::upsample_nearest2d",
"prim::PythonOp",
"torchvision::nms",
]
def flop_count_operators(
model: nn.Module, inputs: list, **kwargs
) -> typing.DefaultDict[str, float]:
"""
Implement operator-level flops counting using jit.
This is a wrapper of fvcore.nn.flop_count, that supports standard detection models
in detectron2.
Note:
The function runs the input through the model to compute flops.
The flops of a detection model is often input-dependent, for example,
the flops of box & mask head depends on the number of proposals &
the number of detected objects.
Therefore, the flops counting using a single input may not accurately
reflect the computation cost of a model.
Args:
model: a detectron2 model that takes `list[dict]` as input.
inputs (list[dict]): inputs to model, in detectron2's standard format.
"""
return _wrapper_count_operators(model=model, inputs=inputs, mode=FLOPS_MODE, **kwargs)
def activation_count_operators(
model: nn.Module, inputs: list, **kwargs
) -> typing.DefaultDict[str, float]:
"""
Implement operator-level activations counting using jit.
This is a wrapper of fvcore.nn.activation_count, that supports standard detection models
in detectron2.
Note:
The function runs the input through the model to compute activations.
The activations of a detection model is often input-dependent, for example,
the activations of box & mask head depends on the number of proposals &
the number of detected objects.
Args:
model: a detectron2 model that takes `list[dict]` as input.
inputs (list[dict]): inputs to model, in detectron2's standard format.
"""
return _wrapper_count_operators(model=model, inputs=inputs, mode=ACTIVATIONS_MODE, **kwargs)
def _flatten_to_tuple(outputs):
result = []
if isinstance(outputs, torch.Tensor):
result.append(outputs)
elif isinstance(outputs, (list, tuple)):
for v in outputs:
result.extend(_flatten_to_tuple(v))
elif isinstance(outputs, dict):
for _, v in outputs.items():
result.extend(_flatten_to_tuple(v))
elif isinstance(outputs, Instances):
result.extend(_flatten_to_tuple(outputs.get_fields()))
elif isinstance(outputs, (Boxes, BitMasks, ImageList)):
result.append(outputs.tensor)
else:
log_first_n(
logging.WARN,
f"Output of type {type(outputs)} not included in flops/activations count.",
n=10,
)
return tuple(result)
def _wrapper_count_operators(
model: nn.Module, inputs: list, mode: str, **kwargs
) -> typing.DefaultDict[str, float]:
# ignore some ops
supported_ops = {k: lambda *args, **kwargs: {} for k in _IGNORED_OPS}
supported_ops.update(kwargs.pop("supported_ops", {}))
kwargs["supported_ops"] = supported_ops
assert len(inputs) == 1, "Please use batch size=1"
tensor_input = inputs[0]["image"]
class WrapModel(nn.Module):
def __init__(self, model):
super().__init__()
if isinstance(
model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)
):
self.model = model.module
else:
self.model = model
def forward(self, image):
# jit requires the input/output to be Tensors
inputs = [{"image": image}]
outputs = self.model.forward(inputs)
# Only the subgraph that computes the returned tuple of tensor will be
# counted. So we flatten everything we found to tuple of tensors.
return _flatten_to_tuple(outputs)
old_train = model.training
with torch.no_grad():
if mode == FLOPS_MODE:
ret = flop_count(WrapModel(model).train(False), (tensor_input,), **kwargs)
elif mode == ACTIVATIONS_MODE:
ret = activation_count(WrapModel(model).train(False), (tensor_input,), **kwargs)
else:
raise NotImplementedError("Count for mode {} is not supported yet.".format(mode))
# compatible with change in fvcore
if isinstance(ret, tuple):
ret = ret[0]
model.train(old_train)
return ret
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import importlib
import numpy as np
import os
import re
import subprocess
import sys
from collections import defaultdict
import PIL
import torch
import torchvision
from tabulate import tabulate
__all__ = ["collect_env_info"]
def collect_torch_env():
try:
import torch.__config__
return torch.__config__.show()
except ImportError:
# compatible with older versions of pytorch
from torch.utils.collect_env import get_pretty_env_info
return get_pretty_env_info()
def get_env_module():
var_name = "DETECTRON2_ENV_MODULE"
return var_name, os.environ.get(var_name, "<not set>")
def detect_compute_compatibility(CUDA_HOME, so_file):
try:
cuobjdump = os.path.join(CUDA_HOME, "bin", "cuobjdump")
if os.path.isfile(cuobjdump):
output = subprocess.check_output(
"'{}' --list-elf '{}'".format(cuobjdump, so_file), shell=True
)
output = output.decode("utf-8").strip().split("\n")
sm = []
for line in output:
line = re.findall(r"\.sm_[0-9]*\.", line)[0]
sm.append(line.strip("."))
sm = sorted(set(sm))
return ", ".join(sm)
else:
return so_file + "; cannot find cuobjdump"
except Exception:
# unhandled failure
return so_file
def collect_env_info():
has_cuda = torch.cuda.is_available()
# NOTE: the use of CUDA_HOME requires the CUDA build deps, though in
# theory detectron2 should be made runnable with only the CUDA runtime
from torch.utils.cpp_extension import CUDA_HOME
data = []
data.append(("sys.platform", sys.platform))
data.append(("Python", sys.version.replace("\n", "")))
data.append(("numpy", np.__version__))
try:
import detectron2 # noqa
data.append(
("detectron2", detectron2.__version__ + " @" + os.path.dirname(detectron2.__file__))
)
except ImportError:
data.append(("detectron2", "failed to import"))
else:
try:
from detectron2 import _C
except ImportError:
data.append(("detectron2._C", "failed to import"))
else:
data.append(("detectron2 compiler", _C.get_compiler_version()))
data.append(("detectron2 CUDA compiler", _C.get_cuda_version()))
if has_cuda:
data.append(
("detectron2 arch flags", detect_compute_compatibility(CUDA_HOME, _C.__file__))
)
data.append(get_env_module())
data.append(("PyTorch", torch.__version__ + " @" + os.path.dirname(torch.__file__)))
data.append(("PyTorch debug build", torch.version.debug))
data.append(("CUDA available", has_cuda))
if has_cuda:
devices = defaultdict(list)
for k in range(torch.cuda.device_count()):
devices[torch.cuda.get_device_name(k)].append(str(k))
for name, devids in devices.items():
data.append(("GPU " + ",".join(devids), name))
from torch.utils.cpp_extension import CUDA_HOME
data.append(("CUDA_HOME", str(CUDA_HOME)))
if CUDA_HOME is not None and os.path.isdir(CUDA_HOME):
try:
nvcc = os.path.join(CUDA_HOME, "bin", "nvcc")
nvcc = subprocess.check_output("'{}' -V | tail -n1".format(nvcc), shell=True)
nvcc = nvcc.decode("utf-8").strip()
except subprocess.SubprocessError:
nvcc = "Not Available"
data.append(("NVCC", nvcc))
cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
if cuda_arch_list:
data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list))
data.append(("Pillow", PIL.__version__))
try:
data.append(
(
"torchvision",
str(torchvision.__version__) + " @" + os.path.dirname(torchvision.__file__),
)
)
if has_cuda:
try:
torchvision_C = importlib.util.find_spec("torchvision._C").origin
msg = detect_compute_compatibility(CUDA_HOME, torchvision_C)
data.append(("torchvision arch flags", msg))
except ImportError:
data.append(("torchvision._C", "failed to find"))
except AttributeError:
data.append(("torchvision", "unknown"))
try:
import fvcore
data.append(("fvcore", fvcore.__version__))
except ImportError:
pass
try:
import cv2
data.append(("cv2", cv2.__version__))
except ImportError:
pass
env_str = tabulate(data) + "\n"
env_str += collect_torch_env()
return env_str
if __name__ == "__main__":
try:
import detectron2 # noqa
except ImportError:
print(collect_env_info())
else:
from detectron2.utils.collect_env import collect_env_info
print(collect_env_info())
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""
An awesome colormap for really neat visualizations.
Copied from Detectron, and removed gray colors.
"""
import numpy as np
__all__ = ["colormap", "random_color"]
# fmt: off
# RGB:
_COLORS = np.array(
[
0.000, 0.447, 0.741,
0.850, 0.325, 0.098,
0.929, 0.694, 0.125,
0.494, 0.184, 0.556,
0.466, 0.674, 0.188,
0.301, 0.745, 0.933,
0.635, 0.078, 0.184,
0.300, 0.300, 0.300,
0.600, 0.600, 0.600,
1.000, 0.000, 0.000,
1.000, 0.500, 0.000,
0.749, 0.749, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 1.000,
0.667, 0.000, 1.000,
0.333, 0.333, 0.000,
0.333, 0.667, 0.000,
0.333, 1.000, 0.000,
0.667, 0.333, 0.000,
0.667, 0.667, 0.000,
0.667, 1.000, 0.000,
1.000, 0.333, 0.000,
1.000, 0.667, 0.000,
1.000, 1.000, 0.000,
0.000, 0.333, 0.500,
0.000, 0.667, 0.500,
0.000, 1.000, 0.500,
0.333, 0.000, 0.500,
0.333, 0.333, 0.500,
0.333, 0.667, 0.500,
0.333, 1.000, 0.500,
0.667, 0.000, 0.500,
0.667, 0.333, 0.500,
0.667, 0.667, 0.500,
0.667, 1.000, 0.500,
1.000, 0.000, 0.500,
1.000, 0.333, 0.500,
1.000, 0.667, 0.500,
1.000, 1.000, 0.500,
0.000, 0.333, 1.000,
0.000, 0.667, 1.000,
0.000, 1.000, 1.000,
0.333, 0.000, 1.000,
0.333, 0.333, 1.000,
0.333, 0.667, 1.000,
0.333, 1.000, 1.000,
0.667, 0.000, 1.000,
0.667, 0.333, 1.000,
0.667, 0.667, 1.000,
0.667, 1.000, 1.000,
1.000, 0.000, 1.000,
1.000, 0.333, 1.000,
1.000, 0.667, 1.000,
0.333, 0.000, 0.000,
0.500, 0.000, 0.000,
0.667, 0.000, 0.000,
0.833, 0.000, 0.000,
1.000, 0.000, 0.000,
0.000, 0.167, 0.000,
0.000, 0.333, 0.000,
0.000, 0.500, 0.000,
0.000, 0.667, 0.000,
0.000, 0.833, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 0.167,
0.000, 0.000, 0.333,
0.000, 0.000, 0.500,
0.000, 0.000, 0.667,
0.000, 0.000, 0.833,
0.000, 0.000, 1.000,
0.000, 0.000, 0.000,
0.143, 0.143, 0.143,
0.857, 0.857, 0.857,
1.000, 1.000, 1.000
]
).astype(np.float32).reshape(-1, 3)
# fmt: on
def colormap(rgb=False, maximum=255):
"""
Args:
rgb (bool): whether to return RGB colors or BGR colors.
maximum (int): either 255 or 1
Returns:
ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1]
"""
assert maximum in [255, 1], maximum
c = _COLORS * maximum
if not rgb:
c = c[:, ::-1]
return c
def random_color(rgb=False, maximum=255):
"""
Args:
rgb (bool): whether to return RGB colors or BGR colors.
maximum (int): either 255 or 1
Returns:
ndarray: a vector of 3 numbers
"""
idx = np.random.randint(0, len(_COLORS))
ret = _COLORS[idx] * maximum
if not rgb:
ret = ret[::-1]
return ret
if __name__ == "__main__":
import cv2
size = 100
H, W = 10, 10
canvas = np.random.rand(H * size, W * size, 3).astype("float32")
for h in range(H):
for w in range(W):
idx = h * W + w
if idx >= len(_COLORS):
break
canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx]
cv2.imshow("a", canvas)
cv2.waitKey(0)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
This file contains primitives for multi-gpu communication.
This is useful when doing distributed training.
"""
import functools
import logging
import numpy as np
import pickle
import torch
import torch.distributed as dist
_LOCAL_PROCESS_GROUP = None
"""
A torch process group which only includes processes that on the same machine as the current process.
This variable is set when processes are spawned by `launch()` in "engine/launch.py".
"""
def get_world_size() -> int:
if not dist.is_available():
return 1
if not dist.is_initialized():
return 1
return dist.get_world_size()
def get_rank() -> int:
if not dist.is_available():
return 0
if not dist.is_initialized():
return 0
return dist.get_rank()
def get_local_rank() -> int:
"""
Returns:
The rank of the current process within the local (per-machine) process group.
"""
if not dist.is_available():
return 0
if not dist.is_initialized():
return 0
assert _LOCAL_PROCESS_GROUP is not None
return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
def get_local_size() -> int:
"""
Returns:
The size of the per-machine process group,
i.e. the number of processes per machine.
"""
if not dist.is_available():
return 1
if not dist.is_initialized():
return 1
return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
def is_main_process() -> bool:
return get_rank() == 0
def synchronize():
"""
Helper function to synchronize (barrier) among all processes when
using distributed training
"""
if not dist.is_available():
return
if not dist.is_initialized():
return
world_size = dist.get_world_size()
if world_size == 1:
return
dist.barrier()
@functools.lru_cache()
def _get_global_gloo_group():
"""
Return a process group based on gloo backend, containing all the ranks
The result is cached.
"""
if dist.get_backend() == "nccl":
return dist.new_group(backend="gloo")
else:
return dist.group.WORLD
def _serialize_to_tensor(data, group):
backend = dist.get_backend(group)
assert backend in ["gloo", "nccl"]
device = torch.device("cpu" if backend == "gloo" else "cuda")
buffer = pickle.dumps(data)
if len(buffer) > 1024 ** 3:
logger = logging.getLogger(__name__)
logger.warning(
"Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
get_rank(), len(buffer) / (1024 ** 3), device
)
)
storage = torch.ByteStorage.from_buffer(buffer)
tensor = torch.ByteTensor(storage).to(device=device)
return tensor
def _pad_to_largest_tensor(tensor, group):
"""
Returns:
list[int]: size of the tensor, on each rank
Tensor: padded tensor that has the max size
"""
world_size = dist.get_world_size(group=group)
assert (
world_size >= 1
), "comm.gather/all_gather must be called from ranks within the given group!"
local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
size_list = [
torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size)
]
dist.all_gather(size_list, local_size, group=group)
size_list = [int(size.item()) for size in size_list]
max_size = max(size_list)
# we pad the tensor because torch all_gather does not support
# gathering tensors of different shapes
if local_size != max_size:
padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device)
tensor = torch.cat((tensor, padding), dim=0)
return size_list, tensor
def all_gather(data, group=None):
"""
Run all_gather on arbitrary picklable data (not necessarily tensors).
Args:
data: any picklable object
group: a torch process group. By default, will use a group which
contains all ranks on gloo backend.
Returns:
list[data]: list of data gathered from each rank
"""
if get_world_size() == 1:
return [data]
if group is None:
group = _get_global_gloo_group()
if dist.get_world_size(group) == 1:
return [data]
tensor = _serialize_to_tensor(data, group)
size_list, tensor = _pad_to_largest_tensor(tensor, group)
max_size = max(size_list)
# receiving Tensor from all ranks
tensor_list = [
torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
]
dist.all_gather(tensor_list, tensor, group=group)
data_list = []
for size, tensor in zip(size_list, tensor_list):
buffer = tensor.cpu().numpy().tobytes()[:size]
data_list.append(pickle.loads(buffer))
return data_list
def gather(data, dst=0, group=None):
"""
Run gather on arbitrary picklable data (not necessarily tensors).
Args:
data: any picklable object
dst (int): destination rank
group: a torch process group. By default, will use a group which
contains all ranks on gloo backend.
Returns:
list[data]: on dst, a list of data gathered from each rank. Otherwise,
an empty list.
"""
if get_world_size() == 1:
return [data]
if group is None:
group = _get_global_gloo_group()
if dist.get_world_size(group=group) == 1:
return [data]
rank = dist.get_rank(group=group)
tensor = _serialize_to_tensor(data, group)
size_list, tensor = _pad_to_largest_tensor(tensor, group)
# receiving Tensor from all ranks
if rank == dst:
max_size = max(size_list)
tensor_list = [
torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
]
dist.gather(tensor, tensor_list, dst=dst, group=group)
data_list = []
for size, tensor in zip(size_list, tensor_list):
buffer = tensor.cpu().numpy().tobytes()[:size]
data_list.append(pickle.loads(buffer))
return data_list
else:
dist.gather(tensor, [], dst=dst, group=group)
return []
def shared_random_seed():
"""
Returns:
int: a random number that is the same across all workers.
If workers need a shared RNG, they can use this shared seed to
create one.
All workers must call this function, otherwise it will deadlock.
"""
ints = np.random.randint(2 ** 31)
all_ints = all_gather(ints)
return all_ints[0]
def reduce_dict(input_dict, average=True):
"""
Reduce the values in the dictionary from all processes so that process with rank
0 has the reduced results.
Args:
input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor.
average (bool): whether to do average or sum
Returns:
a dict with the same keys as input_dict, after reduction.
"""
world_size = get_world_size()
if world_size < 2:
return input_dict
with torch.no_grad():
names = []
values = []
# sort the keys so that they are consistent across processes
for k in sorted(input_dict.keys()):
names.append(k)
values.append(input_dict[k])
values = torch.stack(values, dim=0)
dist.reduce(values, dst=0)
if dist.get_rank() == 0 and average:
# only main process gets accumulated, so only divide by
# world_size in this case
values /= world_size
reduced_dict = {k: v for k, v in zip(names, values)}
return reduced_dict
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import importlib
import importlib.util
import logging
import numpy as np
import os
import random
import sys
from datetime import datetime
import torch
__all__ = ["seed_all_rng"]
def seed_all_rng(seed=None):
"""
Set the random seed for the RNG in torch, numpy and python.
Args:
seed (int): if None, will use a strong random seed.
"""
if seed is None:
seed = (
os.getpid()
+ int(datetime.now().strftime("%S%f"))
+ int.from_bytes(os.urandom(2), "big")
)
logger = logging.getLogger(__name__)
logger.info("Using a generated random seed {}".format(seed))
np.random.seed(seed)
torch.set_rng_state(torch.manual_seed(seed).get_state())
random.seed(seed)
# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
def _import_file(module_name, file_path, make_importable=False):
spec = importlib.util.spec_from_file_location(module_name, file_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
if make_importable:
sys.modules[module_name] = module
return module
def _configure_libraries():
"""
Configurations for some libraries.
"""
# An environment option to disable `import cv2` globally,
# in case it leads to negative performance impact
disable_cv2 = int(os.environ.get("DETECTRON2_DISABLE_CV2", False))
if disable_cv2:
sys.modules["cv2"] = None
else:
# Disable opencl in opencv since its interaction with cuda often has negative effects
# This envvar is supported after OpenCV 3.4.0
os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
try:
import cv2
if int(cv2.__version__.split(".")[0]) >= 3:
cv2.ocl.setUseOpenCL(False)
except ImportError:
pass
def get_version(module, digit=2):
return tuple(map(int, module.__version__.split(".")[:digit]))
# fmt: off
assert get_version(torch) >= (1, 4), "Requires torch>=1.4"
import fvcore
assert get_version(fvcore, 3) >= (0, 1, 1), "Requires fvcore>=0.1.1"
import yaml
assert get_version(yaml) >= (5, 1), "Requires pyyaml>=5.1"
# fmt: on
_ENV_SETUP_DONE = False
def setup_environment():
"""Perform environment setup work. The default setup is a no-op, but this
function allows the user to specify a Python source file or a module in
the $DETECTRON2_ENV_MODULE environment variable, that performs
custom setup work that may be necessary to their computing environment.
"""
global _ENV_SETUP_DONE
if _ENV_SETUP_DONE:
return
_ENV_SETUP_DONE = True
_configure_libraries()
custom_module_path = os.environ.get("DETECTRON2_ENV_MODULE")
if custom_module_path:
setup_custom_environment(custom_module_path)
else:
# The default setup is a no-op
pass
def setup_custom_environment(custom_module):
"""
Load custom environment setup by importing a Python source file or a
module, and run the setup function.
"""
if custom_module.endswith(".py"):
module = _import_file("detectron2.utils.env.custom_module", custom_module)
else:
module = importlib.import_module(custom_module)
assert hasattr(module, "setup_environment") and callable(module.setup_environment), (
"Custom environment module defined in {} does not have the "
"required callable attribute 'setup_environment'."
).format(custom_module)
module.setup_environment()
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import datetime
import json
import logging
import os
import time
from collections import defaultdict
from contextlib import contextmanager
import torch
from fvcore.common.file_io import PathManager
from fvcore.common.history_buffer import HistoryBuffer
_CURRENT_STORAGE_STACK = []
def get_event_storage():
"""
Returns:
The :class:`EventStorage` object that's currently being used.
Throws an error if no :class:`EventStorage` is currently enabled.
"""
assert len(
_CURRENT_STORAGE_STACK
), "get_event_storage() has to be called inside a 'with EventStorage(...)' context!"
return _CURRENT_STORAGE_STACK[-1]
class EventWriter:
"""
Base class for writers that obtain events from :class:`EventStorage` and process them.
"""
def write(self):
raise NotImplementedError
def close(self):
pass
class JSONWriter(EventWriter):
"""
Write scalars to a json file.
It saves scalars as one json per line (instead of a big json) for easy parsing.
Examples parsing such a json file:
.. code-block:: none
$ cat metrics.json | jq -s '.[0:2]'
[
{
"data_time": 0.008433341979980469,
"iteration": 20,
"loss": 1.9228371381759644,
"loss_box_reg": 0.050025828182697296,
"loss_classifier": 0.5316952466964722,
"loss_mask": 0.7236229181289673,
"loss_rpn_box": 0.0856662318110466,
"loss_rpn_cls": 0.48198649287223816,
"lr": 0.007173333333333333,
"time": 0.25401854515075684
},
{
"data_time": 0.007216215133666992,
"iteration": 40,
"loss": 1.282649278640747,
"loss_box_reg": 0.06222952902317047,
"loss_classifier": 0.30682939291000366,
"loss_mask": 0.6970193982124329,
"loss_rpn_box": 0.038663312792778015,
"loss_rpn_cls": 0.1471673548221588,
"lr": 0.007706666666666667,
"time": 0.2490077018737793
}
]
$ cat metrics.json | jq '.loss_mask'
0.7126231789588928
0.689423680305481
0.6776131987571716
...
"""
def __init__(self, json_file, window_size=20):
"""
Args:
json_file (str): path to the json file. New data will be appended if the file exists.
window_size (int): the window size of median smoothing for the scalars whose
`smoothing_hint` are True.
"""
self._file_handle = PathManager.open(json_file, "a")
self._window_size = window_size
def write(self):
storage = get_event_storage()
to_save = {"iteration": storage.iter}
to_save.update(storage.latest_with_smoothing_hint(self._window_size))
self._file_handle.write(json.dumps(to_save, sort_keys=True) + "\n")
self._file_handle.flush()
try:
os.fsync(self._file_handle.fileno())
except AttributeError:
pass
def close(self):
self._file_handle.close()
class TensorboardXWriter(EventWriter):
"""
Write all scalars to a tensorboard file.
"""
def __init__(self, log_dir: str, window_size: int = 20, **kwargs):
"""
Args:
log_dir (str): the directory to save the output events
window_size (int): the scalars will be median-smoothed by this window size
kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)`
"""
self._window_size = window_size
from torch.utils.tensorboard import SummaryWriter
self._writer = SummaryWriter(log_dir, **kwargs)
def write(self):
storage = get_event_storage()
for k, v in storage.latest_with_smoothing_hint(self._window_size).items():
self._writer.add_scalar(k, v, storage.iter)
# storage.put_{image,histogram} is only meant to be used by
# tensorboard writer. So we access its internal fields directly from here.
if len(storage._vis_data) >= 1:
for img_name, img, step_num in storage._vis_data:
self._writer.add_image(img_name, img, step_num)
# Storage stores all image data and rely on this writer to clear them.
# As a result it assumes only one writer will use its image data.
# An alternative design is to let storage store limited recent
# data (e.g. only the most recent image) that all writers can access.
# In that case a writer may not see all image data if its period is long.
storage.clear_images()
if len(storage._histograms) >= 1:
for params in storage._histograms:
self._writer.add_histogram_raw(**params)
storage.clear_histograms()
def close(self):
if hasattr(self, "_writer"): # doesn't exist when the code fails at import
self._writer.close()
class CommonMetricPrinter(EventWriter):
"""
Print **common** metrics to the terminal, including
iteration time, ETA, memory, all losses, and the learning rate.
To print something different, please implement a similar printer by yourself.
"""
def __init__(self, max_iter):
"""
Args:
max_iter (int): the maximum number of iterations to train.
Used to compute ETA.
"""
self.logger = logging.getLogger(__name__)
self._max_iter = max_iter
self._last_write = None
def write(self):
storage = get_event_storage()
iteration = storage.iter
try:
data_time = storage.history("data_time").avg(20)
except KeyError:
# they may not exist in the first few iterations (due to warmup)
# or when SimpleTrainer is not used
data_time = None
eta_string = None
try:
iter_time = storage.history("time").global_avg()
eta_seconds = storage.history("time").median(1000) * (self._max_iter - iteration)
storage.put_scalar("eta_seconds", eta_seconds, smoothing_hint=False)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
except KeyError:
iter_time = None
# estimate eta on our own - more noisy
if self._last_write is not None:
estimate_iter_time = (time.perf_counter() - self._last_write[1]) / (
iteration - self._last_write[0]
)
eta_seconds = estimate_iter_time * (self._max_iter - iteration)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
self._last_write = (iteration, time.perf_counter())
try:
lr = "{:.6f}".format(storage.history("lr").latest())
except KeyError:
lr = "N/A"
if torch.cuda.is_available():
max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
else:
max_mem_mb = None
# NOTE: max_mem is parsed by grep in "dev/parse_results.sh"
self.logger.info(
" {eta}iter: {iter} {losses} {time}{data_time}lr: {lr} {memory}".format(
eta=f"eta: {eta_string} " if eta_string else "",
iter=iteration,
losses=" ".join(
[
"{}: {:.3f}".format(k, v.median(20))
for k, v in storage.histories().items()
if "loss" in k
]
),
time="time: {:.4f} ".format(iter_time) if iter_time is not None else "",
data_time="data_time: {:.4f} ".format(data_time) if data_time is not None else "",
lr=lr,
memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "",
)
)
class EventStorage:
"""
The user-facing class that provides metric storage functionalities.
In the future we may add support for storing / logging other types of data if needed.
"""
def __init__(self, start_iter=0):
"""
Args:
start_iter (int): the iteration number to start with
"""
self._history = defaultdict(HistoryBuffer)
self._smoothing_hints = {}
self._latest_scalars = {}
self._iter = start_iter
self._current_prefix = ""
self._vis_data = []
self._histograms = []
def put_image(self, img_name, img_tensor):
"""
Add an `img_tensor` associated with `img_name`, to be shown on
tensorboard.
Args:
img_name (str): The name of the image to put into tensorboard.
img_tensor (torch.Tensor or numpy.array): An `uint8` or `float`
Tensor of shape `[channel, height, width]` where `channel` is
3. The image format should be RGB. The elements in img_tensor
can either have values in [0, 1] (float32) or [0, 255] (uint8).
The `img_tensor` will be visualized in tensorboard.
"""
self._vis_data.append((img_name, img_tensor, self._iter))
def put_scalar(self, name, value, smoothing_hint=True):
"""
Add a scalar `value` to the `HistoryBuffer` associated with `name`.
Args:
smoothing_hint (bool): a 'hint' on whether this scalar is noisy and should be
smoothed when logged. The hint will be accessible through
:meth:`EventStorage.smoothing_hints`. A writer may ignore the hint
and apply custom smoothing rule.
It defaults to True because most scalars we save need to be smoothed to
provide any useful signal.
"""
name = self._current_prefix + name
history = self._history[name]
value = float(value)
history.update(value, self._iter)
self._latest_scalars[name] = value
existing_hint = self._smoothing_hints.get(name)
if existing_hint is not None:
assert (
existing_hint == smoothing_hint
), "Scalar {} was put with a different smoothing_hint!".format(name)
else:
self._smoothing_hints[name] = smoothing_hint
def put_scalars(self, *, smoothing_hint=True, **kwargs):
"""
Put multiple scalars from keyword arguments.
Examples:
storage.put_scalars(loss=my_loss, accuracy=my_accuracy, smoothing_hint=True)
"""
for k, v in kwargs.items():
self.put_scalar(k, v, smoothing_hint=smoothing_hint)
def put_histogram(self, hist_name, hist_tensor, bins=1000):
"""
Create a histogram from a tensor.
Args:
hist_name (str): The name of the histogram to put into tensorboard.
hist_tensor (torch.Tensor): A Tensor of arbitrary shape to be converted
into a histogram.
bins (int): Number of histogram bins.
"""
ht_min, ht_max = hist_tensor.min().item(), hist_tensor.max().item()
# Create a histogram with PyTorch
hist_counts = torch.histc(hist_tensor, bins=bins)
hist_edges = torch.linspace(start=ht_min, end=ht_max, steps=bins + 1, dtype=torch.float32)
# Parameter for the add_histogram_raw function of SummaryWriter
hist_params = dict(
tag=hist_name,
min=ht_min,
max=ht_max,
num=len(hist_tensor),
sum=float(hist_tensor.sum()),
sum_squares=float(torch.sum(hist_tensor ** 2)),
bucket_limits=hist_edges[1:].tolist(),
bucket_counts=hist_counts.tolist(),
global_step=self._iter,
)
self._histograms.append(hist_params)
def history(self, name):
"""
Returns:
HistoryBuffer: the scalar history for name
"""
ret = self._history.get(name, None)
if ret is None:
raise KeyError("No history metric available for {}!".format(name))
return ret
def histories(self):
"""
Returns:
dict[name -> HistoryBuffer]: the HistoryBuffer for all scalars
"""
return self._history
def latest(self):
"""
Returns:
dict[name -> number]: the scalars that's added in the current iteration.
"""
return self._latest_scalars
def latest_with_smoothing_hint(self, window_size=20):
"""
Similar to :meth:`latest`, but the returned values
are either the un-smoothed original latest value,
or a median of the given window_size,
depend on whether the smoothing_hint is True.
This provides a default behavior that other writers can use.
"""
result = {}
for k, v in self._latest_scalars.items():
result[k] = self._history[k].median(window_size) if self._smoothing_hints[k] else v
return result
def smoothing_hints(self):
"""
Returns:
dict[name -> bool]: the user-provided hint on whether the scalar
is noisy and needs smoothing.
"""
return self._smoothing_hints
def step(self):
"""
User should call this function at the beginning of each iteration, to
notify the storage of the start of a new iteration.
The storage will then be able to associate the new data with the
correct iteration number.
"""
self._iter += 1
self._latest_scalars = {}
@property
def iter(self):
return self._iter
@property
def iteration(self):
# for backward compatibility
return self._iter
def __enter__(self):
_CURRENT_STORAGE_STACK.append(self)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
assert _CURRENT_STORAGE_STACK[-1] == self
_CURRENT_STORAGE_STACK.pop()
@contextmanager
def name_scope(self, name):
"""
Yields:
A context within which all the events added to this storage
will be prefixed by the name scope.
"""
old_prefix = self._current_prefix
self._current_prefix = name.rstrip("/") + "/"
yield
self._current_prefix = old_prefix
def clear_images(self):
"""
Delete all the stored images for visualization. This should be called
after images are written to tensorboard.
"""
self._vis_data = []
def clear_histograms(self):
"""
Delete all the stored histograms for visualization.
This should be called after histograms are written to tensorboard.
"""
self._histograms = []
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import functools
import logging
import os
import sys
import time
from collections import Counter
from fvcore.common.file_io import PathManager
from tabulate import tabulate
from termcolor import colored
class _ColorfulFormatter(logging.Formatter):
def __init__(self, *args, **kwargs):
self._root_name = kwargs.pop("root_name") + "."
self._abbrev_name = kwargs.pop("abbrev_name", "")
if len(self._abbrev_name):
self._abbrev_name = self._abbrev_name + "."
super(_ColorfulFormatter, self).__init__(*args, **kwargs)
def formatMessage(self, record):
record.name = record.name.replace(self._root_name, self._abbrev_name)
log = super(_ColorfulFormatter, self).formatMessage(record)
if record.levelno == logging.WARNING:
prefix = colored("WARNING", "red", attrs=["blink"])
elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
prefix = colored("ERROR", "red", attrs=["blink", "underline"])
else:
return log
return prefix + " " + log
@functools.lru_cache() # so that calling setup_logger multiple times won't add many handlers
def setup_logger(
output=None, distributed_rank=0, *, color=True, name="detectron2", abbrev_name=None
):
"""
Initialize the detectron2 logger and set its verbosity level to "DEBUG".
Args:
output (str): a file name or a directory to save log. If None, will not save log file.
If ends with ".txt" or ".log", assumed to be a file name.
Otherwise, logs will be saved to `output/log.txt`.
name (str): the root module name of this logger
abbrev_name (str): an abbreviation of the module, to avoid long names in logs.
Set to "" to not log the root module in logs.
By default, will abbreviate "detectron2" to "d2" and leave other
modules unchanged.
Returns:
logging.Logger: a logger
"""
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)
logger.propagate = False
if abbrev_name is None:
abbrev_name = "d2" if name == "detectron2" else name
plain_formatter = logging.Formatter(
"[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S"
)
# stdout logging: master only
if distributed_rank == 0:
ch = logging.StreamHandler(stream=sys.stdout)
ch.setLevel(logging.DEBUG)
if color:
formatter = _ColorfulFormatter(
colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s",
datefmt="%m/%d %H:%M:%S",
root_name=name,
abbrev_name=str(abbrev_name),
)
else:
formatter = plain_formatter
ch.setFormatter(formatter)
logger.addHandler(ch)
# file logging: all workers
if output is not None:
if output.endswith(".txt") or output.endswith(".log"):
filename = output
else:
filename = os.path.join(output, "log.txt")
if distributed_rank > 0:
filename = filename + ".rank{}".format(distributed_rank)
PathManager.mkdirs(os.path.dirname(filename))
fh = logging.StreamHandler(_cached_log_stream(filename))
fh.setLevel(logging.DEBUG)
fh.setFormatter(plain_formatter)
logger.addHandler(fh)
return logger
# cache the opened file object, so that different calls to `setup_logger`
# with the same file name can safely write to the same file.
@functools.lru_cache(maxsize=None)
def _cached_log_stream(filename):
return PathManager.open(filename, "a")
"""
Below are some other convenient logging methods.
They are mainly adopted from
https://github.com/abseil/abseil-py/blob/master/absl/logging/__init__.py
"""
def _find_caller():
"""
Returns:
str: module name of the caller
tuple: a hashable key to be used to identify different callers
"""
frame = sys._getframe(2)
while frame:
code = frame.f_code
if os.path.join("utils", "logger.") not in code.co_filename:
mod_name = frame.f_globals["__name__"]
if mod_name == "__main__":
mod_name = "detectron2"
return mod_name, (code.co_filename, frame.f_lineno, code.co_name)
frame = frame.f_back
_LOG_COUNTER = Counter()
_LOG_TIMER = {}
def log_first_n(lvl, msg, n=1, *, name=None, key="caller"):
"""
Log only for the first n times.
Args:
lvl (int): the logging level
msg (str):
n (int):
name (str): name of the logger to use. Will use the caller's module by default.
key (str or tuple[str]): the string(s) can be one of "caller" or
"message", which defines how to identify duplicated logs.
For example, if called with `n=1, key="caller"`, this function
will only log the first call from the same caller, regardless of
the message content.
If called with `n=1, key="message"`, this function will log the
same content only once, even if they are called from different places.
If called with `n=1, key=("caller", "message")`, this function
will not log only if the same caller has logged the same message before.
"""
if isinstance(key, str):
key = (key,)
assert len(key) > 0
caller_module, caller_key = _find_caller()
hash_key = ()
if "caller" in key:
hash_key = hash_key + caller_key
if "message" in key:
hash_key = hash_key + (msg,)
_LOG_COUNTER[hash_key] += 1
if _LOG_COUNTER[hash_key] <= n:
logging.getLogger(name or caller_module).log(lvl, msg)
def log_every_n(lvl, msg, n=1, *, name=None):
"""
Log once per n times.
Args:
lvl (int): the logging level
msg (str):
n (int):
name (str): name of the logger to use. Will use the caller's module by default.
"""
caller_module, key = _find_caller()
_LOG_COUNTER[key] += 1
if n == 1 or _LOG_COUNTER[key] % n == 1:
logging.getLogger(name or caller_module).log(lvl, msg)
def log_every_n_seconds(lvl, msg, n=1, *, name=None):
"""
Log no more than once per n seconds.
Args:
lvl (int): the logging level
msg (str):
n (int):
name (str): name of the logger to use. Will use the caller's module by default.
"""
caller_module, key = _find_caller()
last_logged = _LOG_TIMER.get(key, None)
current_time = time.time()
if last_logged is None or current_time - last_logged >= n:
logging.getLogger(name or caller_module).log(lvl, msg)
_LOG_TIMER[key] = current_time
def create_small_table(small_dict):
"""
Create a small table using the keys of small_dict as headers. This is only
suitable for small dictionaries.
Args:
small_dict (dict): a result dictionary of only a few items.
Returns:
str: the table as a string.
"""
keys, values = tuple(zip(*small_dict.items()))
table = tabulate(
[values],
headers=keys,
tablefmt="pipe",
floatfmt=".3f",
stralign="center",
numalign="center",
)
return table
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment