"""
Parts of this code are from torchvision and thus licensed under

BSD 3-Clause License

Copyright (c) Soumith Chintala 2016,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

from __future__ import division

import math
from typing import Sequence, TypeVar

import torch
from torch.jit.annotations import List, Tuple
from torch import Tensor
from torchvision.models.detection._utils import BoxCoder


@torch.jit.script
def encode_boxes(reference_boxes: torch.Tensor,
                 proposals: torch.Tensor,
                 weights: torch.Tensor,
                 ) -> torch.Tensor:
    """
    Encode a set of proposals with respect to some reference boxes

    For every axis the target consists of the weighted center offset,
    normalised by the proposal extent, and the weighted log-ratio of the
    reference extent to the proposal extent.

    Args:
        reference_boxes: reference boxes (x1, y1, x2, y2, (z1, z2))
        proposals: boxes to be encoded (x1, y1, x2, y2, (z1, z2))
        weights: weights for dimensions (wx, wy, ww, wh, wz, wd);
            the last two entries are only read if `proposals` has
            six columns

    Returns:
        Tensor: regression targets (dx, dy, dw, dh, (dz, dd)), one row per
            proposal
    """
    # perform some unpacking to make it JIT-fusion friendly
    wx = weights[0]
    wy = weights[1]
    ww = weights[2]
    wh = weights[3]

    proposals_x1 = proposals[:, 0].unsqueeze(1)
    proposals_y1 = proposals[:, 1].unsqueeze(1)
    proposals_x2 = proposals[:, 2].unsqueeze(1)
    proposals_y2 = proposals[:, 3].unsqueeze(1)

    reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
    reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
    reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
    reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)

    # implementation starts here
    ex_widths = proposals_x2 - proposals_x1
    ex_heights = proposals_y2 - proposals_y1
    ex_ctr_x = proposals_x1 + 0.5 * ex_widths
    ex_ctr_y = proposals_y1 + 0.5 * ex_heights

    gt_widths = reference_boxes_x2 - reference_boxes_x1
    gt_heights = reference_boxes_y2 - reference_boxes_y1
    gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
    gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights

    targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = ww * torch.log(gt_widths / ex_widths)
    targets_dh = wh * torch.log(gt_heights / ex_heights)

    if proposals.shape[1] == 6:
        # 3d case: the z axis is stored in the two trailing columns
        wz = weights[4]
        wd = weights[5]

        proposals_z1 = proposals[:, 4].unsqueeze(1)
        proposals_z2 = proposals[:, 5].unsqueeze(1)
        ex_depth = proposals_z2 - proposals_z1
        ex_ctr_z = proposals_z1 + 0.5 * ex_depth

        reference_boxes_z1 = reference_boxes[:, 4].unsqueeze(1)
        reference_boxes_z2 = reference_boxes[:, 5].unsqueeze(1)
        gt_depth = reference_boxes_z2 - reference_boxes_z1
        gt_ctr_z = reference_boxes_z1 + 0.5 * gt_depth

        targets_dz = wz * (gt_ctr_z - ex_ctr_z) / ex_depth
        targets_dd = wd * torch.log(gt_depth / ex_depth)
        targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh,
                             targets_dz, targets_dd), dim=1)
    else:
        targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
    return targets


def decode_single(rel_codes: Tensor, boxes: Tensor,
                  weights: Sequence[float], bbox_xform_clip: float) -> Tensor:
    """
    From a set of original boxes and encoded relative box offsets,
    get the decoded boxes.

    Args:
        rel_codes: encoded boxes [Num_boxes x (dim * 2)]
            (dx, dy, dw, dh, dz, dd)
        boxes: reference boxes (x1, y1, x2, y2, (z1, z2))
        weights: weights the codes are divided by
            (wx, wy, ww, wh, (wz, wd))
        bbox_xform_clip: upper bound applied to the size codes
            (dw, dh, dd) before they are passed to the exponential

    Returns:
        Tensor: decoded boxes (x1, y1, x2, y2, (z1, z2)) with the same
            number of columns as `rel_codes`
    """
    # offset is 4 in case of 2d data and 6 in case of 3d
    offset = boxes.shape[1]
    boxes = boxes.to(rel_codes.dtype)

    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    wx = weights[0]
    wy = weights[1]
    ww = weights[2]
    wh = weights[3]
    # strided slicing picks the same component of every code in a row
    dx = rel_codes[:, 0::offset] / wx
    dy = rel_codes[:, 1::offset] / wy
    dw = rel_codes[:, 2::offset] / ww
    dh = rel_codes[:, 3::offset] / wh

    # Prevent sending too large values into torch.exp()
    dw = torch.clamp(dw, max=bbox_xform_clip)
    dh = torch.clamp(dh, max=bbox_xform_clip)

    pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
    pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
    pred_w = torch.exp(dw) * widths[:, None]
    pred_h = torch.exp(dh) * heights[:, None]

    # all predicted centers share one dtype, so a single constant suffices
    half = torch.tensor(0.5, dtype=pred_ctr_x.dtype)
    pred_boxes1 = pred_ctr_x - half * pred_w
    pred_boxes2 = pred_ctr_y - half * pred_h
    pred_boxes3 = pred_ctr_x + half * pred_w
    pred_boxes4 = pred_ctr_y + half * pred_h

    if offset == 6:
        depths = boxes[:, 5] - boxes[:, 4]
        ctr_z = boxes[:, 4] + 0.5 * depths

        wz = weights[4]
        wd = weights[5]
        dz = rel_codes[:, 4::offset] / wz
        dd = rel_codes[:, 5::offset] / wd
        dd = torch.clamp(dd, max=bbox_xform_clip)

        pred_ctr_z = dz * depths[:, None] + ctr_z[:, None]
        pred_z = torch.exp(dd) * depths[:, None]
        pred_boxes5 = pred_ctr_z - half * pred_z
        pred_boxes6 = pred_ctr_z + half * pred_z
        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4,
                                  pred_boxes5, pred_boxes6), dim=2).flatten(1)
    else:
        pred_boxes = torch.stack((pred_boxes1, pred_boxes2,
                                  pred_boxes3, pred_boxes4), dim=2).flatten(1)
    return pred_boxes


class BoxCoderND(BoxCoder):
    """
    This class encodes and decodes a set of bounding boxes into
    the representation used for training the regressors.
    Compatible with 2d and 3d
    """

    def encode(self,
               reference_boxes: List[Tensor],
               proposals: List[Tensor],
               ) -> List[Tensor]:
        """
        Encode a set of proposals with respect to some reference boxes

        Images without any reference box are skipped during encoding and
        receive all-zero targets shaped like their proposals.

        Args:
            reference_boxes: reference boxes for each image.
                (x1, y1, x2, y2, (z1, z2))
            proposals: proposals for each image
                (x1, y1, x2, y2, (z1, z2))

        Returns:
            List[Tensor]: regression targets for each image
        """
        # filter for images which have a foreground class
        filter_min_one_gt = [rb.numel() > 0 for rb in reference_boxes]
        filtered_ref_boxes = [
            rb for idx, rb in enumerate(reference_boxes) if filter_min_one_gt[idx]]
        filtered_proposals = [
            pr for idx, pr in enumerate(proposals) if filter_min_one_gt[idx]]

        # always bind the name so the merge loop below never depends on
        # a conditionally defined variable
        filtered_encoded = ()
        if any(filter_min_one_gt):
            filtered_encoded = super().encode(filtered_ref_boxes, filtered_proposals)
        remaining = iter(filtered_encoded)

        # fill image with no ground truth
        encoded = []
        for img_idx, gt_present in enumerate(filter_min_one_gt):
            if gt_present:
                encoded.append(next(remaining))
            else:
                # fill with zeros because they do not contribute to the
                # regression loss anyway (all anchors are labeled as background)
                encoded.append(torch.zeros_like(proposals[img_idx]))
        return encoded

    def encode_single(self,
                      reference_boxes: Tensor,
                      proposals: Tensor,
                      ) -> Tensor:
        """
        Encode a set of proposals with respect to some reference boxes

        Args:
            reference_boxes: reference boxes (x1, y1, x2, y2, (z1, z2))
            proposals: boxes to be encoded (x1, y1, x2, y2, (z1, z2))

        Returns:
            Tensor: regression targets (dx, dy, dw, dh, (dz, dd))
        """
        # the weights follow the dtype and device of the reference boxes
        dtype, device = reference_boxes.dtype, reference_boxes.device
        weights = torch.tensor(self.weights, dtype=dtype, device=device)
        targets = encode_boxes(reference_boxes, proposals, weights)
        return targets

    def decode(self, rel_codes: Tensor, boxes: List[Tensor]) -> Tensor:
        """
        Decode boxes

        Args:
            rel_codes: relative offsets to reference boxes
                (dx, dy, dw, dh, (dz, dd))[N, dim * 2]
            boxes: list of reference boxes per image
                (x1, y1, x2, y2, (z1, z2))

        Returns:
            Tensor: decoded boxes [N, dim * 2]; empty if no reference
                boxes were passed
        """
        assert isinstance(boxes, (list, tuple))
        assert isinstance(rel_codes, torch.Tensor)
        boxes_per_image = [b.size(0) for b in boxes]
        concat_boxes = torch.cat(boxes, dim=0)
        spatial_dims = concat_boxes.shape[1]

        box_sum = sum(boxes_per_image)
        if box_sum > 0:
            rel_codes = rel_codes.reshape(box_sum, -1)
        else:
            # reshape(0, -1) is ambiguous for empty tensors and raises,
            # so the number of columns is given explicitly
            rel_codes = rel_codes.reshape(box_sum, spatial_dims)

        pred_boxes = self.decode_single(rel_codes, concat_boxes)
        return pred_boxes.reshape(box_sum, spatial_dims)

    def decode_single(self, rel_codes: torch.Tensor, boxes: torch.Tensor) -> Tensor:
        """
        Decode the relative codes of a single set of reference boxes

        Args:
            rel_codes: encoded boxes [Num_boxes x (dim * 2)]
                (dx, dy, dw, dh, (dz, dd))
            boxes: reference boxes (x1, y1, x2, y2, (z1, z2))

        Returns:
            Tensor: decoded boxes (x1, y1, x2, y2, (z1, z2))
        """
        return decode_single(rel_codes, boxes, self.weights, self.bbox_xform_clip)


CoderType = TypeVar('CoderType', bound=BoxCoderND)