# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from torch import nn
from torchvision.ops import misc as misc_nn_ops
import model.resnet
from model.utils import IntermediateLayerGetter
from model.feature_pyramid_network import FeaturePyramidNetwork, LastLevelMaxPool
class BackboneWithFPN(nn.Module):
"""
Adds a FPN on top of a model.
Internally, it uses model.utils.IntermediateLayerGetter to
extract a submodel that returns the feature maps specified in return_layers.
The same limitations of IntermediateLayerGetter apply here.
Args:
backbone (nn.Module)
return_layers (Dict[name, new_name]): a dict containing the names
of the modules for which the activations will be returned as
the key of the dict, and the value of the dict is the name
of the returned activation (which the user can specify).
in_channels_list (List[int]): number of channels for each feature map
that is returned, in the order they are present in the OrderedDict
out_channels (int): number of channels in the FPN.
Attributes:
out_channels (int): the number of channels in the FPN
"""
def __init__(self, backbone, return_layers, in_channels_list, out_channels, extra_blocks=None, module_name=""):
super(BackboneWithFPN, self).__init__()
if extra_blocks is None:
extra_blocks = LastLevelMaxPool()
self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
self.fpn = FeaturePyramidNetwork(
in_channels_list=in_channels_list,
out_channels=out_channels,
extra_blocks=extra_blocks,
module_name="module.backbone.fpn",
)
self.out_channels = out_channels
def forward(self, x):
x = self.body(x)
x = self.fpn(x)
return x
def resnet_fpn_backbone(
backbone_name,
pretrained,
norm_layer=misc_nn_ops.FrozenBatchNorm2d,
trainable_layers=3,
returned_layers=None,
extra_blocks=None,
**kwargs
):
"""
Constructs a specified ResNet backbone with FPN on top. Freezes the specified number of layers in the backbone.
Examples::
>>> from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
>>> backbone = resnet_fpn_backbone('resnet50', pretrained=True, trainable_layers=3)
>>> # get some dummy image
>>> x = torch.rand(1,3,64,64)
>>> # compute the output
>>> output = backbone(x)
>>> print([(k, v.shape) for k, v in output.items()])
>>> # returns
>>> [('0', torch.Size([1, 256, 16, 16])),
>>> ('1', torch.Size([1, 256, 8, 8])),
>>> ('2', torch.Size([1, 256, 4, 4])),
>>> ('3', torch.Size([1, 256, 2, 2])),
>>> ('pool', torch.Size([1, 256, 1, 1]))]
Args:
backbone_name (string): resnet architecture. Possible values are 'resnet50',
'resnet101', 'resnext50_32x4d', 'resnext101_32x8d'
pretrained (bool): If True, returns a model with backbone pre-trained on ImageNet
norm_layer (torchvision.ops): it is recommended to use the default value. For details visit:
(https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
trainable_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
returned_layers (list of int): The layers of the network to return. Each entry must be in ``[1, 4]``.
By default all layers are returned.
extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
be performed. It is expected to take the fpn features, the original
features and the names of the original features as input, and returns
a new list of feature maps and their corresponding names. By
default a ``LastLevelMaxPool`` is used.
"""
backbone = model.resnet.__dict__[backbone_name](pretrained=pretrained, norm_layer=norm_layer, **kwargs)
# select layers that won't be frozen
assert 0 <= trainable_layers <= 5
layers_to_train = ["layer4", "layer3", "layer2", "layer1", "conv1"][:trainable_layers]
if trainable_layers == 5:
layers_to_train.append("bn1")
for name, parameter in backbone.named_parameters():
if all([not name.startswith(layer) for layer in layers_to_train]):
parameter.requires_grad_(False)
if extra_blocks is None:
extra_blocks = LastLevelMaxPool()
if returned_layers is None:
returned_layers = [1, 2, 3, 4]
assert min(returned_layers) > 0 and max(returned_layers) < 5
return_layers = {f"layer{k}": str(v) for v, k in enumerate(returned_layers)}
in_channels_stage2 = backbone.inplanes // 8
in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
out_channels = 256
return BackboneWithFPN(backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
def _validate_trainable_layers(pretrained, trainable_backbone_layers, max_value, default_value):
# don't freeze any layers if neither the pretrained model nor the pretrained backbone is used
if not pretrained:
if trainable_backbone_layers is not None:
warnings.warn(
"Changing trainable_backbone_layers has not effect if "
"neither pretrained nor pretrained_backbone have been set to True, "
"falling back to trainable_backbone_layers={} so that all layers are trainable".format(max_value))
trainable_backbone_layers = max_value
# by default freeze first blocks
if trainable_backbone_layers is None:
trainable_backbone_layers = default_value
assert 0 <= trainable_backbone_layers <= max_value
return trainable_backbone_layers
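# Minimal sketch of how _validate_trainable_layers() resolves its argument; the
# values below are illustrative and not taken from the training scripts.
def _demo_validate_trainable_layers():
    # pretrained backbone and no explicit request: fall back to the default
    assert _validate_trainable_layers(True, None, max_value=5, default_value=3) == 3
    # no pretrained weights: all layers stay trainable (a warning is emitted
    # because an explicit value was passed)
    assert _validate_trainable_layers(False, 2, max_value=5, default_value=3) == 5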
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import Tensor
from typing import Tuple
import torchvision
from torchvision.extension import _assert_has_ops
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
"""
Performs non-maximum suppression (NMS) on the boxes according
to their intersection-over-union (IoU).
NMS iteratively removes lower scoring boxes which have an
IoU greater than iou_threshold with another (higher scoring)
box.
If multiple boxes have the exact same score and satisfy the IoU
criterion with respect to a reference box, the selected box is
not guaranteed to be the same between CPU and GPU. This is similar
to the behavior of argsort in PyTorch when repeated values are present.
Args:
boxes (Tensor[N, 4]): boxes to perform NMS on. They
are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
``0 <= y1 < y2``.
scores (Tensor[N]): scores for each one of the boxes
iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
Returns:
Tensor: int64 tensor with the indices of the elements that have been kept
by NMS, sorted in decreasing order of scores
"""
_assert_has_ops()
return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
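# Illustrative use of nms(); the boxes and scores are made-up values.
def _demo_nms():
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],     # overlaps heavily with the first box
                          [20., 20., 30., 30.]])  # disjoint box
    scores = torch.tensor([0.9, 0.8, 0.7])
    keep = nms(boxes, scores, iou_threshold=0.5)
    # the lower-scoring overlapping box is suppressed; expected keep == [0, 2]
    return keep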
def batched_nms(
boxes: Tensor,
scores: Tensor,
idxs: Tensor,
iou_threshold: float,
) -> Tensor:
"""
Performs non-maximum suppression in a batched fashion.
Each index value corresponds to a category, and NMS
will not be applied between elements of different categories.
Args:
boxes (Tensor[N, 4]): boxes where NMS will be performed. They
are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
``0 <= y1 < y2``.
scores (Tensor[N]): scores for each one of the boxes
idxs (Tensor[N]): indices of the categories for each one of the boxes.
iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
Returns:
Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
in decreasing order of scores
"""
# Benchmarks that drove the following thresholds are at
# https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
# Ideally for GPU we'd use a higher threshold
if boxes.numel() > 4_000 and not torchvision._is_tracing():
return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
else:
return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
@torch.jit._script_if_tracing
def _batched_nms_coordinate_trick(
boxes: Tensor,
scores: Tensor,
idxs: Tensor,
iou_threshold: float,
) -> Tensor:
# strategy: in order to perform NMS independently per class,
# we add an offset to all the boxes. The offset is dependent
# only on the class idx, and is large enough so that boxes
# from different classes do not overlap
if boxes.numel() == 0:
return torch.empty((0,), dtype=torch.int64, device=boxes.device)
max_coordinate = boxes.max()
offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
boxes_for_nms = boxes + offsets[:, None]
keep = nms(boxes_for_nms, scores, iou_threshold)
return keep
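# Small numeric sketch of the coordinate trick above: boxes of different classes
# are shifted far apart, so a single nms() call never suppresses across classes.
def _demo_batched_nms_coordinate_trick():
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.]])
    scores = torch.tensor([0.9, 0.8])
    idxs = torch.tensor([0, 1])  # nearly identical boxes, but different classes
    # thanks to the per-class offsets both boxes survive despite their high IoU
    keep = _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold=0.5)
    return keep  # expected: tensor([0, 1])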
@torch.jit._script_if_tracing
def _batched_nms_vanilla(
boxes: Tensor,
scores: Tensor,
idxs: Tensor,
iou_threshold: float,
) -> Tensor:
# Based on Detectron2 implementation, just manually call nms() on each class independently
keep_mask = torch.zeros_like(scores, dtype=torch.bool)
for class_id in torch.unique(idxs):
curr_indices = torch.where(idxs == class_id)[0]
curr_keep_indices = nms(boxes[curr_indices], scores[curr_indices], iou_threshold)
keep_mask[curr_indices[curr_keep_indices]] = True
keep_indices = torch.where(keep_mask)[0]
return keep_indices[scores[keep_indices].sort(descending=True)[1]]
def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
"""
Clip boxes so that they lie inside an image of size `size`.
Args:
boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
size (Tuple[height, width]): size of the image
Returns:
Tensor[N, 4]: clipped boxes
"""
dim = boxes.dim()
boxes_x = boxes[..., 0::2]
boxes_y = boxes[..., 1::2]
height, width = size
if torchvision._is_tracing():
boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
else:
boxes_x = boxes_x.clamp(min=0, max=width)
boxes_y = boxes_y.clamp(min=0, max=height)
clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
return clipped_boxes.reshape(boxes.shape)
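# Quick sketch of clip_boxes_to_image(); the values are illustrative.
def _demo_clip_boxes_to_image():
    boxes = torch.tensor([[-5., -5., 50., 120.]])
    clipped = clip_boxes_to_image(boxes, size=(100, 80))  # (height, width)
    # expected: [[0., 0., 50., 100.]] -- x clamped to [0, 80], y clamped to [0, 100]
    return clipped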
def _upcast(t: Tensor) -> Tensor:
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by their
(x1, y1, x2, y2) coordinates.
Args:
boxes (Tensor[N, 4]): boxes for which the area will be computed. They
are expected to be in (x1, y1, x2, y2) format with
``0 <= x1 < x2`` and ``0 <= y1 < y2``.
Returns:
Tensor[N]: the area for each box
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = _upcast(rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
return inter, union
def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
"""
Return intersection-over-union (Jaccard index) between two sets of boxes.
Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
``0 <= x1 < x2`` and ``0 <= y1 < y2``.
Args:
boxes1 (Tensor[N, 4]): first set of boxes
boxes2 (Tensor[M, 4]): second set of boxes
Returns:
Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2
"""
inter, union = _box_inter_union(boxes1, boxes2)
iou = inter / union
return iou
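# Illustrative box_iou() call; the expected value follows directly from the formula above.
def _demo_box_iou():
    boxes1 = torch.tensor([[0., 0., 10., 10.]])
    boxes2 = torch.tensor([[0., 0., 10., 10.],
                           [5., 5., 15., 15.]])
    iou = box_iou(boxes1, boxes2)
    # expected: [[1.0, 25 / 175]] -- identical boxes give 1.0, the shifted box overlaps 5x5
    return iou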
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import torch.nn.functional as F
from torch import nn, Tensor
from typing import Tuple, List, Dict, Optional
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import WEIGHTS_INITIALIZATION
class ExtraFPNBlock(nn.Module):
"""
Base class for the extra block in the FPN.
Args:
results (List[Tensor]): the result of the FPN
x (List[Tensor]): the original feature maps
names (List[str]): the names for each one of the
original feature maps
Returns:
results (List[Tensor]): the extended set of results
of the FPN
names (List[str]): the extended set of names for the results
"""
def forward(
self,
results: List[Tensor],
x: List[Tensor],
names: List[str],
) -> Tuple[List[Tensor], List[str]]:
pass
class FeaturePyramidNetwork(nn.Module):
"""
Module that adds an FPN on top of a set of feature maps. This is based on
`"Feature Pyramid Networks for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
The feature maps are currently supposed to be in increasing depth
order.
The input to the model is expected to be an OrderedDict[Tensor], containing
the feature maps on top of which the FPN will be added.
Args:
in_channels_list (list[int]): number of channels for each feature map that
is passed to the module
out_channels (int): number of channels of the FPN representation
extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
be performed. It is expected to take the fpn features, the original
features and the names of the original features as input, and returns
a new list of feature maps and their corresponding names
Examples::
>>> m = torchvision.ops.FeaturePyramidNetwork([10, 20, 30], 5)
>>> # get some dummy data
>>> x = OrderedDict()
>>> x['feat0'] = torch.rand(1, 10, 64, 64)
>>> x['feat2'] = torch.rand(1, 20, 16, 16)
>>> x['feat3'] = torch.rand(1, 30, 8, 8)
>>> # compute the FPN on top of x
>>> output = m(x)
>>> print([(k, v.shape) for k, v in output.items()])
>>> # returns
>>> [('feat0', torch.Size([1, 5, 64, 64])),
>>> ('feat2', torch.Size([1, 5, 16, 16])),
>>> ('feat3', torch.Size([1, 5, 8, 8]))]
"""
def __init__(
self,
in_channels_list: List[int],
out_channels: int,
extra_blocks: Optional[ExtraFPNBlock] = None,
module_name: Optional[str] = "",
):
super(FeaturePyramidNetwork, self).__init__()
self.inner_blocks = nn.ModuleList()
self.layer_blocks = nn.ModuleList()
for in_channels in in_channels_list:
if in_channels == 0:
raise ValueError("in_channels=0 is currently not supported")
inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
self.inner_blocks.append(inner_block_module)
self.layer_blocks.append(layer_block_module)
# initialize parameters now to avoid modifying the initialization of top_blocks
for name, m in self.named_modules(prefix=module_name):
if isinstance(m, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.kaiming_uniform_(m.weight, a=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.bias"})
nn.init.constant_(m.bias, 0)
if extra_blocks is not None:
assert isinstance(extra_blocks, ExtraFPNBlock)
self.extra_blocks = extra_blocks
def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.inner_blocks[idx](x),
but torchscript doesn't support this yet
"""
num_blocks = len(self.inner_blocks)
if idx < 0:
idx += num_blocks
i = 0
out = x
for module in self.inner_blocks:
if i == idx:
out = module(x)
i += 1
return out
def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.layer_blocks[idx](x),
but torchscript doesn't support this yet
"""
num_blocks = len(self.layer_blocks)
if idx < 0:
idx += num_blocks
i = 0
out = x
for module in self.layer_blocks:
if i == idx:
out = module(x)
i += 1
return out
def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""
Computes the FPN for a set of feature maps.
Args:
x (OrderedDict[Tensor]): feature maps for each feature level.
Returns:
results (OrderedDict[Tensor]): feature maps after FPN layers.
They are ordered from highest resolution first.
"""
# unpack OrderedDict into two lists for easier handling
names = list(x.keys())
x = list(x.values())
last_inner = self.get_result_from_inner_blocks(x[-1], -1)
results = []
results.append(self.get_result_from_layer_blocks(last_inner, -1))
for idx in range(len(x) - 2, -1, -1):
inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
feat_shape = inner_lateral.shape[-2:]
inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
last_inner = inner_lateral + inner_top_down
results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))
if self.extra_blocks is not None:
results, names = self.extra_blocks(results, x, names)
# make it back an OrderedDict
out = OrderedDict([(k, v) for k, v in zip(names, results)])
return out
class LastLevelMaxPool(ExtraFPNBlock):
"""
Applies a max_pool2d on top of the last feature map
"""
def forward(
self,
x: List[Tensor],
y: List[Tensor],
names: List[str],
) -> Tuple[List[Tensor], List[str]]:
names.append("pool")
x.append(F.max_pool2d(x[-1], 1, 2, 0))
return x, names
class LastLevelP6P7(ExtraFPNBlock):
"""
This module is used in RetinaNet to generate extra layers, P6 and P7.
"""
def __init__(self, in_channels: int, out_channels: int, module_name: Optional[str]=""):
super(LastLevelP6P7, self).__init__()
self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
for name, module in self.named_modules(prefix=module_name):
if module in [self.p6, self.p7]:
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.kaiming_uniform_(module.weight, a=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.bias"})
nn.init.constant_(module.bias, 0)
self.use_P5 = in_channels == out_channels
def forward(
self,
p: List[Tensor],
c: List[Tensor],
names: List[str],
) -> Tuple[List[Tensor], List[str]]:
p5, c5 = p[-1], c[-1]
x = p5 if self.use_P5 else c5
p6 = self.p6(x)
p7 = self.p7(F.relu(p6))
p.extend([p6, p7])
names.extend(["p6", "p7"])
return p, names
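# Minimal sketch combining FeaturePyramidNetwork with LastLevelP6P7, mirroring the
# RetinaNet-style configuration; channel counts and spatial sizes are illustrative only.
def _demo_fpn_with_p6p7():
    import torch
    fpn = FeaturePyramidNetwork(
        in_channels_list=[512, 1024, 2048],
        out_channels=256,
        extra_blocks=LastLevelP6P7(256, 256),
    )
    x = OrderedDict()
    x["feat2"] = torch.rand(1, 512, 32, 32)
    x["feat3"] = torch.rand(1, 1024, 16, 16)
    x["feat4"] = torch.rand(1, 2048, 8, 8)
    out = fpn(x)
    # five levels come back: the three inputs plus "p6" and "p7"
    return [(k, v.shape) for k, v in out.items()]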
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
try:
from apex.contrib.focal_loss.focal_loss import FocalLoss
focal_loss_opt = FocalLoss.apply
except ImportError as err:
print("Could not import APEX fused focal loss, it's fine if you do not use --apex-focal-loss")
def sigmoid_focal_loss(
inputs: torch.Tensor,
targets: torch.Tensor,
alpha: float = 0.25,
gamma: float = 2,
reduction: str = "none",
):
"""
Original implementation from https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/focal_loss.py .
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs
(0 for the negative class and 1 for the positive class).
alpha: (optional) Weighting factor in range (0,1) to balance
positive vs negative examples or -1 for ignore. Default = 0.25
gamma: Exponent of the modulating factor (1 - p_t) to
balance easy vs hard examples.
reduction: 'none' | 'mean' | 'sum'
'none': No reduction will be applied to the output.
'mean': The output will be averaged.
'sum': The output will be summed.
Returns:
Loss tensor with the reduction option applied.
"""
p = torch.sigmoid(inputs)
ce_loss = F.binary_cross_entropy_with_logits(
inputs, targets, reduction="none"
)
p_t = p * targets + (1 - p) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
if reduction == "mean":
loss = loss.mean()
elif reduction == "sum":
loss = loss.sum()
return loss
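# Tiny numeric sketch of sigmoid_focal_loss(); the logits and labels are made-up values.
def _demo_sigmoid_focal_loss():
    inputs = torch.tensor([2.0, -1.0, 0.5])   # raw logits
    targets = torch.tensor([1.0, 0.0, 1.0])   # binary labels
    # well-classified examples (the first two) are down-weighted by (1 - p_t) ** gamma
    return sigmoid_focal_loss(inputs, targets, alpha=0.25, gamma=2, reduction="mean")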
# The following focal loss implementation is similar to the previous one, apart from an additional mask operation.
# The mask operation is handy when using CUDA graphs, since it enables fixed tensor dimensions (otherwise,
# a differently sized tensor would be used for each image).
def sigmoid_focal_loss_masked(
inputs: torch.Tensor,
targets: torch.Tensor,
mask: torch.Tensor,
alpha: float = 0.25,
gamma: float = 2,
reduction: str = "none",
):
assert(reduction == "sum")
p = torch.sigmoid(inputs)
ce_loss = F.binary_cross_entropy_with_logits(
inputs, targets, reduction="none"
)
p_t = p * targets + (1 - p) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
loss = loss * mask
loss = loss.sum(dim=[1, 2])
return loss
def sigmoid_focal_loss_masked_fused(
inputs: torch.Tensor,
targets: torch.Tensor,
mask: torch.Tensor,
alpha: float = 0.25,
gamma: float = 2,
label_smoothing: float = 0.0,
reduction: str = "none",
one_ptr: torch.Tensor = None
):
assert(reduction == "sum")
num_classes = inputs.size(2)
inputs_ = inputs.reshape([inputs.size(0), 1, 13343, 9, num_classes])
# -2 tells the kernel to ignore that value
targets_ = torch.where(mask, targets, -2)
targets_ = targets_.reshape([inputs.size(0), 1, 13343, 9])
# TODO: implement within the kernel and not with a loop
loss = []
inputs_list = torch.chunk(inputs_, inputs_.size(0))
targets_list = torch.chunk(targets_, targets_.size(0))
for b in range(inputs_.size(0)):
loss.append(focal_loss_opt(inputs_list[b], targets_list[b], one_ptr, num_classes, alpha, gamma, label_smoothing))
return torch.stack(loss)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import torch
from torch import Tensor
from typing import Callable, List, Optional
class FrozenBatchNorm2d(torch.nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters
are fixed
"""
def __init__(
self,
num_features: int,
eps: float = 1e-5,
n: Optional[int] = None,
):
# n=None for backward-compatibility
if n is not None:
warnings.warn("`n` argument is deprecated and has been renamed `num_features`",
DeprecationWarning)
num_features = n
super(FrozenBatchNorm2d, self).__init__()
self.eps = eps
self.register_buffer("weight", torch.ones(num_features))
self.register_buffer("bias", torch.zeros(num_features))
self.register_buffer("running_mean", torch.zeros(num_features))
self.register_buffer("running_var", torch.ones(num_features))
def _load_from_state_dict(
self,
state_dict: dict,
prefix: str,
local_metadata: dict,
strict: bool,
missing_keys: List[str],
unexpected_keys: List[str],
error_msgs: List[str],
):
num_batches_tracked_key = prefix + 'num_batches_tracked'
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super(FrozenBatchNorm2d, self)._load_from_state_dict(
state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs)
# one-time preprocessing
self.weight = self.weight.reshape(1, -1, 1, 1)
self.bias = self.bias.reshape(1, -1, 1, 1)
self.running_var = self.running_var.reshape(1, -1, 1, 1)
self.running_mean = self.running_mean.reshape(1, -1, 1, 1)
# registering these variables as buffers
self.register_buffer("scale", self.weight * (self.running_var + self.eps).rsqrt())
self.register_buffer("bias_term", self.bias - self.running_mean * self.scale)
def forward(self, x: Tensor) -> Tensor:
return x * self.scale + self.bias_term
def __repr__(self) -> str:
return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})"
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import Tensor
from typing import List, Tuple
class ImageList(object):
"""
Structure that holds a list of images (of possibly
varying sizes) as a single tensor.
This works by padding the images to the same size,
and storing in a field the original sizes of each image
"""
def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]):
"""
Args:
tensors (tensor)
image_sizes (list[tuple[int, int]])
"""
self.tensors = tensors
self.image_sizes = image_sizes
def to(self, device: torch.device) -> 'ImageList':
cast_tensor = self.tensors.to(device)
return ImageList(cast_tensor, self.image_sizes)
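# Minimal sketch of ImageList: two images of different sizes are padded into one
# batch tensor while their original sizes are kept alongside it.
def _demo_image_list():
    img1 = torch.rand(3, 480, 640)
    img2 = torch.rand(3, 600, 512)
    batch = torch.zeros(2, 3, 600, 640)
    batch[0, :, :480, :640].copy_(img1)
    batch[1, :, :600, :512].copy_(img2)
    image_list = ImageList(batch, [(480, 640), (600, 512)])
    return image_list.tensors.shape, image_list.image_sizes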
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import torch
import torch.nn.functional as F
from torch import Tensor, HalfTensor, BoolTensor
from typing import Callable, List, Optional, Tuple
from model.frozen_bn import FrozenBatchNorm2d
# For debugging backprop put the following in the function and uncomment
# import pydevd
# pydevd.settrace(suspend=False, trace_only_current_thread=True)
class bn_relu_wrapper(FrozenBatchNorm2d):
def __init__(self, num_features, eps=1e-5, n=None):
super(bn_relu_wrapper, self).__init__(num_features, eps, n)
def forward(self, x):
return bn_relu_jit.apply(x, self.scale, self.bias_term)
class bn_relu_jit(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
def forward(ctx, input, scale, bias):
bn_relu_out, relu_mask = fwd_bn_relu_jit(input, scale, bias)
ctx.save_for_backward(scale, relu_mask)
return bn_relu_out
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
scale, relu_mask = ctx.saved_tensors
grad_input = bwd_bn_relu_jit(grad_output, scale, relu_mask)
return grad_input, None, None
@torch.jit.script
def fwd_bn_relu_jit(input: HalfTensor, scale: HalfTensor, bias: HalfTensor) -> Tuple[HalfTensor, BoolTensor]:
bn = input * scale + bias
bn_relu = torch.nn.functional.relu(bn)
relu_mask = bn > 0
return bn_relu, relu_mask
@torch.jit.script
def bwd_bn_relu_jit(grad_output: HalfTensor, scale: HalfTensor, relu_mask: BoolTensor) -> HalfTensor:
grad_input = grad_output * scale
grad_input = grad_input * relu_mask
return grad_input
class bn_add_relu_jit(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
def forward(ctx, input1, scale1, bias1, input2):
bn_relu_out, relu_mask = fwd_bn_add_relu_jit(input1, scale1, bias1, input2)
ctx.save_for_backward(scale1, relu_mask)
return bn_relu_out
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
scale, relu_mask = ctx.saved_tensors
grad_input1, grad_input2 = bwd_bn_add_relu_jit(grad_output, scale, relu_mask)
return grad_input1, None, None, grad_input2
@torch.jit.script
def fwd_bn_add_relu_jit(input1: HalfTensor, scale1: HalfTensor, bias1: HalfTensor,
input2: HalfTensor) -> Tuple[HalfTensor, BoolTensor]:
bn = input1 * scale1 + bias1
bn_add = bn + input2
bn_add_relu = torch.nn.functional.relu(bn_add)
relu_mask = bn_add > 0
return bn_add_relu, relu_mask
@torch.jit.script
def bwd_bn_add_relu_jit(grad_output: HalfTensor, scale: HalfTensor,
relu_mask: BoolTensor) -> Tuple[HalfTensor, HalfTensor]:
grad_input2 = grad_output * relu_mask
grad_input1 = grad_input2 * scale
return grad_input1, grad_input2
class bn_bn_add_relu_jit(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
def forward(ctx, input1, scale1, bias1, input2, scale2, bias2):
bn_relu_out, relu_mask = fwd_bn_bn_add_relu_jit(input1, scale1, bias1,
input2, scale2, bias2)
ctx.save_for_backward(scale1, scale2, relu_mask)
return bn_relu_out
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
scale1, scale2, relu_mask = ctx.saved_tensors
grad_input1, grad_input2 = bwd_bn_bn_add_relu_jit(grad_output, scale1, scale2, relu_mask)
return grad_input1, None, None, grad_input2, None, None
@torch.jit.script
def fwd_bn_bn_add_relu_jit(input1: HalfTensor, scale1: HalfTensor, bias1: HalfTensor,
input2: HalfTensor, scale2: HalfTensor, bias2: HalfTensor) -> Tuple[HalfTensor, BoolTensor]:
bn1 = input1 * scale1 + bias1
bn2 = input2 * scale2 + bias2
bn_add = bn1 + bn2
bn_add_relu = torch.nn.functional.relu(bn_add)
relu_mask = bn_add > 0
return bn_add_relu, relu_mask
@torch.jit.script
def bwd_bn_bn_add_relu_jit(grad_output: HalfTensor, scale1: HalfTensor, scale2: HalfTensor,
relu_mask: BoolTensor) -> Tuple[HalfTensor, HalfTensor]:
grad_output_masked = grad_output * relu_mask
grad_input1 = grad_output_masked * scale1
grad_input2 = grad_output_masked * scale2
return grad_input1, grad_input2
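# Quick consistency sketch: the fused bn_relu_jit above should match a plain
# scale/shift followed by ReLU. Half-precision tensors are used, as the fused
# helpers assume.
def _demo_bn_relu_jit():
    x = torch.rand(1, 4, 8, 8).half()
    scale = torch.rand(1, 4, 1, 1).half()
    bias = torch.rand(1, 4, 1, 1).half()
    fused = bn_relu_jit.apply(x, scale, bias)
    reference = F.relu(x * scale + bias)
    return torch.equal(fused, reference)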
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import time
import numpy as np
import pickle
IS_PROFILE = False
IS_WALL_TIME = False
class stats_wrapper:
def __init__(self):
self.records = {}
self.warmup_t = 0
def print_all(self):
print('>>> START STATS PRINT <<<')
for k, v in self.records.items():
samples = np.asarray(v['samples'])
mean = np.mean(samples)
standard_deviation = np.std(samples)
distance_from_mean = abs(samples - mean)
max_deviations = 2
not_outlier = distance_from_mean < max_deviations * standard_deviation
samples_ = samples[not_outlier]
avg = samples_.mean()
var = samples_.var()
print('{}, {}, {}, {}, {}'.format(k, avg * 1000, var * 1000, samples.max() * 1000, samples.min() * 1000))
print('>>> END STATS PRINT <<<')
pickle.dump(self.records, open(b"records.pkl", "wb"))
def create(self, k):
if k not in self.records:
self.records[k] = {'n': self.warmup_t * (-1), 'samples': []}
def add(self, k, v):
self.records[k]['samples'].append(v)
self.records[k]['n'] += 1
class measure_t:
def __init__(self, name, enable=True):
self.name = name
self.t0, self.t1 = 0, 0
self.enable = enable
self.is_running = False
if enable:
stats.create(self.name)
def __enter__(self):
if not self.enable:
return
self.start()
def __exit__(self, type, value, traceback):
if not self.enable:
return
self.stop()
def start(self):
if not self.enable:
return
if IS_PROFILE:
torch.cuda.nvtx.range_push(self.name)
if IS_WALL_TIME:
torch.cuda.synchronize()
self.t0 = time.time()
self.is_running = True
def stop(self):
if not self.enable:
return
if self.is_running:
if IS_PROFILE:
torch.cuda.nvtx.range_pop()
if IS_WALL_TIME:
torch.cuda.synchronize()
self.t1 = time.time()
delta = self.t1 - self.t0
stats.add(self.name, delta)
self.is_running = False
stats = stats_wrapper()
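# Illustrative use of the timing helpers above. With IS_PROFILE/IS_WALL_TIME left
# False the context manager records nothing useful; set IS_WALL_TIME = True (CUDA is
# assumed, since start()/stop() call torch.cuda.synchronize) to collect wall-clock
# samples, then stats.print_all() reports per-region statistics and dumps records.pkl.
def _demo_measure_t():
    with measure_t("dummy_region"):
        torch.ones(1024, 1024).sum()
    stats.print_all()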
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import Tensor
import torch.nn as nn
from torch.hub import load_state_dict_from_url
from typing import Type, Any, Callable, Union, List, Optional
from .jit_fn import bn_relu_jit, bn_add_relu_jit, bn_bn_add_relu_jit, bn_relu_wrapper
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import WEIGHTS_INITIALIZATION
__all__ = ['resnet50', 'resnet101',
'resnext50_32x4d', 'resnext101_32x8d']
model_urls = {
'resnet50': 'https://download.pytorch.org/models/resnet50-0676ba61.pth',
'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-63fe2227.pth',
'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
}
def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class BasicBlock(nn.Module):
expansion: int = 1
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
expansion: int = 4
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class BottleneckJIT(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
expansion: int = 4
def __init__(
self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None
) -> None:
super(BottleneckJIT, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = bn_relu_jit.apply(out, self.bn1.scale, self.bn1.bias_term)
out = self.conv2(out)
out = bn_relu_jit.apply(out, self.bn2.scale, self.bn2.bias_term)
out = self.conv3(out)
if self.downsample is not None:
identity = self.downsample[0](x)
out = bn_bn_add_relu_jit.apply(out, self.bn3.scale, self.bn3.bias_term,
identity, self.downsample[1].scale, self.downsample[1].bias_term)
else:
out = bn_add_relu_jit.apply(out, self.bn3.scale, self.bn3.bias_term, identity)
return out
class ResNet(nn.Module):
def __init__(
self,
block: Type[Union[BasicBlock, Bottleneck]],
layers: List[int],
num_classes: int = 1000,
zero_init_residual: bool = False,
groups: int = 1,
width_per_group: int = 64,
replace_stride_with_dilation: Optional[List[bool]] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
module_name: Optional[str] = "",
**kwargs: Any
) -> None:
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.jit = kwargs['jit']
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=False)
# have to work around this with bn_relu_wrapper, since during execution the forward function is not called
if self.jit:
self.bn1 = bn_relu_wrapper(self.inplanes)
else:
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
dilate=replace_stride_with_dilation[2])
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)
for name, m in self.named_modules(prefix=module_name):
if isinstance(m, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.constant_(m.weight, 1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.bias"})
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for name, m in self.named_modules(prefix=module_name):
if isinstance(m, Bottleneck):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type]
elif isinstance(m, BasicBlock):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{name}.weight"})
nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type]
def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int,
stride: int = 1, dilate: bool = False) -> nn.Sequential:
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = [block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer)]
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def _forward_impl(self, x: Tensor) -> Tensor:
# See note [TorchScript super()]
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x) if not self.jit else x
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
def forward(self, x: Tensor) -> Tensor:
return self._forward_impl(x)
def _resnet(
arch: str,
block: Type[Union[BasicBlock, Bottleneck]],
layers: List[int],
pretrained: bool,
progress: bool,
**kwargs: Any
) -> ResNet:
model = ResNet(block, layers, module_name="module.backbone.body", **kwargs)
if pretrained:
state_dict = load_state_dict_from_url(model_urls[arch],
progress=progress)
model.load_state_dict(state_dict)
return model
def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNet-50 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
**kwargs)
def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNet-101 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
**kwargs)
def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNeXt-50 32x4d model from
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
resnet_block = BottleneckJIT if kwargs['jit'] else Bottleneck
kwargs['groups'] = 32
kwargs['width_per_group'] = 4
return _resnet('resnext50_32x4d', resnet_block, [3, 4, 6, 3],
pretrained, progress, **kwargs)
def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet:
r"""ResNeXt-101 32x8d model from
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs['groups'] = 32
kwargs['width_per_group'] = 8
return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
pretrained, progress, **kwargs)
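# Minimal sketch of building one of the backbones above. Note that ResNet.__init__
# reads kwargs['jit'] directly, so a `jit` keyword must always be supplied.
def _demo_resnet50():
    model = resnet50(pretrained=False, jit=False)
    x = torch.rand(1, 3, 224, 224)
    return model(x).shape  # expected: torch.Size([1, 1000])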
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from collections import OrderedDict
import warnings
import torch
from torch import nn, Tensor
from torch.hub import load_state_dict_from_url
from typing import Dict, List, Tuple, Optional
from model.anchor_utils import AnchorGenerator
from model.transform import GeneralizedRCNNTransform
from model.backbone_utils import resnet_fpn_backbone, _validate_trainable_layers
from model.feature_pyramid_network import LastLevelP6P7
from model.focal_loss import sigmoid_focal_loss, sigmoid_focal_loss_masked, sigmoid_focal_loss_masked_fused
from model.boxes import box_iou, clip_boxes_to_image, batched_nms
from model.utils import Matcher, MatcherBatch, overwrite_eps, BoxCoder
from .frozen_bn import FrozenBatchNorm2d
from torchvision.ops import misc as misc_nn_ops
from mlperf_logger import mllogger
from mlperf_logging.mllog.constants import WEIGHTS_INITIALIZATION
import utils
try:
from apex.contrib.conv_bias_relu import ConvBiasReLU, ConvBias
except ImportError as err:
print("Could not import APEX fused Conv-Bias-ReLU, it's fine if you do not use --apex-head")
__all__ = [
"retinanet_from_backbone",
"retinanet_resnet50_fpn",
"retinanet_resnet101_fpn",
"retinanet_resnext50_32x4d_fpn",
"retinanet_resnext101_32x8d_fpn",
]
class GradClone_(torch.autograd.Function):
@staticmethod
@torch.cuda.amp.custom_fwd
def forward(ctx, x):
return x
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, grad_output):
return grad_output.clone()
GradClone = GradClone_.apply
def _sum(x: List[Tensor]) -> Tensor:
res = x[0]
for i in x[1:]:
res = res + i
return res
def cudnn_fusion_warmup(bs_list):
hw_dim_list = [100, 50, 25, 13, 7]
for bs in bs_list:
for hw in hw_dim_list:
ConvBiasReLU(torch.rand([bs, 256, hw, hw], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([256, 256, 3, 3], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([1, 256, 1, 1], dtype=torch.half).to(memory_format=torch.channels_last).cuda(), 1, 1)
ConvBias(torch.rand([bs, 256, hw, hw], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([2376, 256, 3, 3], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([1, 2376, 1, 1], dtype=torch.half).to(memory_format=torch.channels_last).cuda(), 1, 1)
ConvBias(torch.rand([bs, 256, hw, hw], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([36, 256, 3, 3], dtype=torch.half).to(memory_format=torch.channels_last).cuda(),
torch.rand([1, 36, 1, 1], dtype=torch.half).to(memory_format=torch.channels_last).cuda(), 1, 1)
class RetinaNetHead(nn.Module):
"""
A regression and classification head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
num_classes (int): number of classes to be predicted
"""
def __init__(self, in_channels, num_anchors, num_classes, fusion=False):
super().__init__()
self.classification_head = RetinaNetClassificationHead(in_channels, num_anchors, num_classes, fusion=fusion,
module_name="module.head.classification_head")
self.regression_head = RetinaNetRegressionHead(in_channels, num_anchors, fusion=fusion,
module_name="module.head.regression_head")
def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Dict[str, Tensor]
return {
'classification': self.classification_head.compute_loss(targets, head_outputs, matched_idxs),
'bbox_regression': self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs),
}
def forward(self, x):
return [self.classification_head(x), self.regression_head(x)]
class RetinaNetClassificationHead(nn.Module):
"""
A classification head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
num_classes (int): number of classes to be predicted
"""
def __init__(self, in_channels, num_anchors, num_classes, prior_probability=0.01, fusion=False, module_name=""):
super().__init__()
conv = []
for _ in range(4):
conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
conv.append(nn.ReLU())
self.conv = nn.Sequential(*conv)
for name, layer in self.conv.named_children():
if isinstance(layer, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.weight"})
torch.nn.init.normal_(layer.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.bias"})
torch.nn.init.constant_(layer.bias, 0)
self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.cls_logits.weight"})
torch.nn.init.normal_(self.cls_logits.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.cls_logits.bias"})
torch.nn.init.constant_(self.cls_logits.bias, -math.log((1 - prior_probability) / prior_probability))
self.num_classes = num_classes
self.num_anchors = num_anchors
# This is to fix using det_utils.Matcher.BETWEEN_THRESHOLDS in TorchScript.
# TorchScript doesn't support class attributes.
# https://github.com/pytorch/vision/pull/1697#issuecomment-630255584
self.BETWEEN_THRESHOLDS = Matcher.BETWEEN_THRESHOLDS
self.register_buffer("one", torch.Tensor([1.]))
self.fusion = fusion
# --- original implementation ---
def compute_loss(self, targets, head_outputs, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor]) -> Tensor
losses = []
cls_logits = head_outputs['cls_logits']
for labels_per_image, cls_logits_per_image, matched_idxs_per_image in zip(targets['labels'], cls_logits, matched_idxs):
# determine only the foreground
foreground_idxs_per_image = matched_idxs_per_image >= 0
num_foreground = foreground_idxs_per_image.sum()
# create the target classification
gt_classes_target = torch.zeros_like(cls_logits_per_image)
gt_classes_target[
foreground_idxs_per_image,
labels_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
] = 1.0
# find indices for which anchors should be ignored
valid_idxs_per_image = matched_idxs_per_image != self.BETWEEN_THRESHOLDS
# compute the classification loss
losses.append(sigmoid_focal_loss(
cls_logits_per_image[valid_idxs_per_image],
gt_classes_target[valid_idxs_per_image],
reduction='sum',
) / max(1, num_foreground))
# it doesn't matter which targets['?'] is taken; its length gives the batch size
return _sum(losses) / len(targets['boxes'])
def compute_loss_prologue(self, target_labels, matched_idxs, one_hot):
# determine only the foreground
foreground_idxs_ = matched_idxs >= 0
num_foreground_ = foreground_idxs_.sum(dim=1)
# find indices for which anchors should be ignored
valid_idxs_ = matched_idxs != self.BETWEEN_THRESHOLDS
# TODO: unable to parallelize, try again
for i, (labels_per_image, matched_idxs_per_image, foreground_idxs_per_image) in \
enumerate(zip(target_labels, matched_idxs, foreground_idxs_)):
# create the target classification
if one_hot:
utils.ScratchPad.gt_classes_target[i][
foreground_idxs_per_image,
labels_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
] = 1.0
else:
utils.ScratchPad.gt_classes_target[i][foreground_idxs_per_image] = \
labels_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
return utils.ScratchPad.gt_classes_target, num_foreground_, valid_idxs_
def compute_loss_prologue_padded(self, target_labels, matched_idxs, one_hot, max_boxes):
# buffers are initialized in init_scratchpad
# utils.ScratchPad.gt_classes_target.fill_(0 if one_hot else -1)
# determine only the foreground
foreground_idxs_ = matched_idxs >= 0
num_foreground_ = foreground_idxs_.sum(dim=1)
# find indices for which anchors should be ignored
valid_idxs_ = matched_idxs != self.BETWEEN_THRESHOLDS
if one_hot:
idxs = torch.gather(target_labels, 1, torch.where(foreground_idxs_, matched_idxs, max_boxes))
utils.ScratchPad.gt_classes_target.scatter_(2, idxs[:, :, None], 1)
gt_classes_target = utils.ScratchPad.gt_classes_target[:, :, :-1]
else:
utils.ScratchPad.gt_classes_target = \
torch.gather(target_labels, 1, torch.where(foreground_idxs_, matched_idxs, max_boxes))
gt_classes_target = utils.ScratchPad.gt_classes_target
return gt_classes_target, num_foreground_, valid_idxs_
def compute_loss_core(self, cls_logits, gt_classes_target, valid_idxs, num_foreground, fused_focal_loss=False):
# notice that in the original implementation, the focal loss input dimension may differ
if not fused_focal_loss:
losses = sigmoid_focal_loss_masked(cls_logits, gt_classes_target, valid_idxs[:, :, None], reduction='sum')
else:
losses = sigmoid_focal_loss_masked_fused(cls_logits, gt_classes_target, valid_idxs, reduction='sum',
one_ptr=self.one)
losses = losses / num_foreground
return _sum(losses) / num_foreground.size(0)
def forward(self, x):
# type: (List[Tensor]) -> Tensor
all_cls_logits = []
# since weights are shared, we can cast weights and biases only one time per iteration
if self.fusion:
conv1_w = self.conv[0].weight.half()
conv2_w = self.conv[2].weight.half()
conv3_w = self.conv[4].weight.half()
conv4_w = self.conv[6].weight.half()
conv5_w = self.cls_logits.weight.half()
conv1_b = self.conv[0].bias.reshape(1, -1, 1, 1).half()
conv2_b = self.conv[2].bias.reshape(1, -1, 1, 1).half()
conv3_b = self.conv[4].bias.reshape(1, -1, 1, 1).half()
conv4_b = self.conv[6].bias.reshape(1, -1, 1, 1).half()
conv5_b = self.cls_logits.bias.reshape(1, -1, 1, 1).half()
for features in x:
if not self.fusion:
cls_logits = self.conv(features)
cls_logits = self.cls_logits(cls_logits)
else:
cls_logits = ConvBiasReLU(features, conv1_w, conv1_b, 1, 1)
cls_logits = ConvBiasReLU(cls_logits, conv2_w, conv2_b, 1, 1)
cls_logits = ConvBiasReLU(cls_logits, conv3_w, conv3_b, 1, 1)
cls_logits = ConvBiasReLU(cls_logits, conv4_w, conv4_b, 1, 1)
cls_logits = ConvBias(cls_logits, conv5_w, conv5_b, 1, 1)
# cloning grad in backprop to make it contiguous for fusion code
cls_logits = GradClone(cls_logits)
# Permute classification output from (N, A * K, H, W) to (N, HWA, K).
N, _, H, W = cls_logits.shape
cls_logits = cls_logits.view(N, -1, self.num_classes, H, W)
cls_logits = cls_logits.permute(0, 3, 4, 1, 2)
            cls_logits = cls_logits.reshape(N, -1, self.num_classes)  # Size=(N, HWA, K)
all_cls_logits.append(cls_logits)
return torch.cat(all_cls_logits, dim=1)
class RetinaNetRegressionHead(nn.Module):
"""
A regression head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
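    Example (illustrative sketch; the channel, anchor, and feature-map sizes below are assumed)::
        >>> head = RetinaNetRegressionHead(in_channels=256, num_anchors=9)
        >>> # five feature maps with assumed spatial sizes, e.g. as produced by an FPN
        >>> feats = [torch.rand(2, 256, s, s) for s in (100, 50, 25, 13, 7)]
        >>> out = head(feats)  # -> shape (2, 9 * (100*100 + 50*50 + 25*25 + 13*13 + 7*7), 4)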
"""
__annotations__ = {
'box_coder': BoxCoder,
}
def __init__(self, in_channels, num_anchors, fusion=False, module_name=""):
super().__init__()
conv = []
for _ in range(4):
conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
conv.append(nn.ReLU())
self.conv = nn.Sequential(*conv)
self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.bbox_reg.weight"})
torch.nn.init.normal_(self.bbox_reg.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.bbox_reg.bias"})
torch.nn.init.zeros_(self.bbox_reg.bias)
for name, layer in self.conv.named_children():
if isinstance(layer, nn.Conv2d):
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.weight"})
torch.nn.init.normal_(layer.weight, std=0.01)
mllogger.event(key=WEIGHTS_INITIALIZATION, metadata={"tensor": f"{module_name}.conv.{name}.bias"})
torch.nn.init.zeros_(layer.bias)
self.box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
self.fusion = fusion
# --- original implementation ---
def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Tensor
losses = []
bbox_regression = head_outputs['bbox_regression']
for boxes_per_image, bbox_regression_per_image, anchors_per_image, matched_idxs_per_image in \
zip(targets['boxes'], bbox_regression, anchors, matched_idxs):
# determine only the foreground indices, ignore the rest
foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0]
num_foreground = foreground_idxs_per_image.numel()
# select only the foreground boxes
matched_gt_boxes_per_image = boxes_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :]
anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
# compute the regression targets
target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
# compute the loss
losses.append(torch.nn.functional.l1_loss(
bbox_regression_per_image,
target_regression,
reduction='sum'
) / max(1, num_foreground))
        # it doesn't matter which targets['?'] entry is taken; its length is the batch size
return _sum(losses) / len(targets['boxes'])
def compute_loss_prologue(self, target_boxes, matched_idxs, anchors):
foreground_idxs_mask, num_foreground_, target_regression_ = [], [], []
for boxes_per_image, anchors_per_image, matched_idxs_per_image in zip(target_boxes, anchors, matched_idxs):
foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0]
num_foreground = foreground_idxs_per_image.numel()
foreground_idxs_mask.append(foreground_idxs_per_image)
num_foreground_.append(num_foreground)
# select only the foreground boxes
matched_gt_boxes_per_image = boxes_per_image[matched_idxs_per_image[foreground_idxs_per_image]]
anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
# compute the regression targets
target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
target_regression_.append(target_regression)
return target_regression_, num_foreground_, foreground_idxs_mask
def compute_loss_core(self, bbox_regression, target_regression, foreground_idxs, num_foreground):
losses = []
for bbox_regression_i, target_regression_i, foreground_idxs_i, num_foreground_i in \
zip(bbox_regression, target_regression, foreground_idxs, num_foreground):
bbox_regression_i_ = bbox_regression_i[foreground_idxs_i, :]
losses.append(torch.nn.functional.l1_loss(bbox_regression_i_, target_regression_i, reduction='sum')
/ max(1, num_foreground_i))
return _sum(losses) / num_foreground.size(0)
def compute_loss_prologue_padded(self, target_boxes, matched_idxs, anchors):
# notice the number of boxes is padded in this implementation
# make sure we do not trim bboxes
# assert (matched_idxs.max() < max_boxes)
foreground_idxs_mask = matched_idxs >= 0
num_foreground_ = foreground_idxs_mask.sum(dim=1)
# clamping to avoid -2, -1
matched_idxs_clamped = torch.clamp(matched_idxs, min=0)
        # check that the precomputed batch-index vector matches the current batch size;
        # the advanced indexing below relies on it
assert(utils.ScratchPad.batch_size_vector.size(0) == len(target_boxes))
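        # batch_size_vector is assumed to be a per-image row index (an arange over the batch,
        # broadcast over anchors), so the advanced indexing below picks, for each anchor, the
        # matched (clamped) gt box from its own image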
matched_gt_boxes_ = target_boxes[utils.ScratchPad.batch_size_vector, matched_idxs_clamped]
target_regression_ = self.box_coder.encode_batch(matched_gt_boxes_,
torch.stack(anchors)) * foreground_idxs_mask[:, :, None]
return target_regression_, num_foreground_, foreground_idxs_mask
def compute_loss_core_padded(self, bbox_regression, target_regression, foreground_idxs, num_foreground):
bbox_regression_masked = bbox_regression * foreground_idxs[:, :, None]
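        # target_regression (from the padded prologue) and bbox_regression_masked are both zero outside
        # the foreground anchors, so the L1 norm over dims (1, 2) equals a per-image
        # l1_loss(reduction='sum') over the foreground anchors only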
losses = torch.norm(bbox_regression_masked - target_regression, 1, dim=[1, 2]) / \
torch.max(torch.ones_like(num_foreground), num_foreground)
# The denominator is just the batch size
return _sum(losses) / num_foreground.size(0)
def forward(self, x):
# type: (List[Tensor]) -> Tensor
all_bbox_regression = []
        # since weights are shared, we can cast weights and biases just once per iteration
if self.fusion:
conv1_w = self.conv[0].weight.half()
conv2_w = self.conv[2].weight.half()
conv3_w = self.conv[4].weight.half()
conv4_w = self.conv[6].weight.half()
conv5_w = self.bbox_reg.weight.half()
conv1_b = self.conv[0].bias.reshape(1, -1, 1, 1).half()
conv2_b = self.conv[2].bias.reshape(1, -1, 1, 1).half()
conv3_b = self.conv[4].bias.reshape(1, -1, 1, 1).half()
conv4_b = self.conv[6].bias.reshape(1, -1, 1, 1).half()
conv5_b = self.bbox_reg.bias.reshape(1, -1, 1, 1).half()
for features in x:
if not self.fusion:
bbox_regression = self.conv(features)
bbox_regression = self.bbox_reg(bbox_regression)
else:
bbox_regression = ConvBiasReLU(features, conv1_w, conv1_b, 1, 1)
bbox_regression = ConvBiasReLU(bbox_regression, conv2_w, conv2_b, 1, 1)
bbox_regression = ConvBiasReLU(bbox_regression, conv3_w, conv3_b, 1, 1)
bbox_regression = ConvBiasReLU(bbox_regression, conv4_w, conv4_b, 1, 1)
bbox_regression = ConvBias(bbox_regression, conv5_w, conv5_b, 1, 1)
# cloning grad in backprop to make it contiguous for fusion code
bbox_regression = GradClone(bbox_regression)
# Permute bbox regression output from (N, 4 * A, H, W) to (N, HWA, 4).
N, _, H, W = bbox_regression.shape
bbox_regression = bbox_regression.view(N, -1, 4, H, W)
bbox_regression = bbox_regression.permute(0, 3, 4, 1, 2)
bbox_regression = bbox_regression.reshape(N, -1, 4) # Size=(N, HWA, 4)
all_bbox_regression.append(bbox_regression)
return torch.cat(all_bbox_regression, dim=1)
class RetinaNet(nn.Module):
"""
Implements RetinaNet.
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the class label for each ground-truth box
The model returns a Dict[Tensor] during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the predicted labels for each image
- scores (Tensor[N]): the scores for each prediction
Args:
backbone (nn.Module): the network used to compute the features for the model.
It should contain an out_channels attribute, which indicates the number of output
channels that each feature map has (and it should be the same for all feature maps).
The backbone should return a single Tensor or an OrderedDict[Tensor].
num_classes (int): number of output classes of the model (including the background).
        image_mean (Tuple[float, float, float]): mean values used for input normalization.
            They are generally the mean values of the dataset on which the backbone has been trained.
        image_std (Tuple[float, float, float]): std values used for input normalization.
            They are generally the std values of the dataset on which the backbone has been trained.
anchor_generator (AnchorGenerator): module that generates the anchors for a set of feature
maps.
head (nn.Module): Module run on top of the feature pyramid.
Defaults to a module containing a classification and regression module.
score_thresh (float): Score threshold used for postprocessing the detections.
nms_thresh (float): NMS threshold used for postprocessing the detections.
detections_per_img (int): Number of best detections to keep after NMS.
fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
considered as positive during training.
bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
considered as negative during training.
topk_candidates (int): Number of best detections to keep before NMS.
Example:
>>> import torch
>>> import torchvision
>>> from torchvision.models.detection import RetinaNet
>>> from torchvision.models.detection.anchor_utils import AnchorGenerator
>>> # load a pre-trained model for classification and return
>>> # only the features
>>> backbone = torchvision.models.mobilenet_v2(pretrained=True).features
>>> # RetinaNet needs to know the number of
>>> # output channels in a backbone. For mobilenet_v2, it's 1280
>>> # so we need to add it here
>>> backbone.out_channels = 1280
>>>
>>> # let's make the network generate 5 x 3 anchors per spatial
>>> # location, with 5 different sizes and 3 different aspect
>>> # ratios. We have a Tuple[Tuple[int]] because each feature
>>> # map could potentially have different sizes and
>>> # aspect ratios
>>> anchor_generator = AnchorGenerator(
>>> sizes=((32, 64, 128, 256, 512),),
>>> aspect_ratios=((0.5, 1.0, 2.0),)
>>> )
>>>
>>> # put the pieces together inside a RetinaNet model
>>> model = RetinaNet(backbone,
>>> num_classes=2,
>>> anchor_generator=anchor_generator)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
"""
__annotations__ = {
'box_coder': BoxCoder,
'proposal_matcher': Matcher,
}
def __init__(self, backbone, num_classes, data_layout='channels_first', head_fusion=False,
# transform parameters
image_size=None, image_mean=None, image_std=None,
# Anchor parameters
anchor_generator=None, head=None,
# Detection parameters
proposal_matcher=None,
score_thresh=0.05,
nms_thresh=0.5,
detections_per_img=300,
fg_iou_thresh=0.5, bg_iou_thresh=0.4,
topk_candidates=1000):
super().__init__()
if not hasattr(backbone, "out_channels"):
raise ValueError(
"backbone should contain an attribute out_channels "
"specifying the number of output channels (assumed to be the "
"same for all the levels)")
self.backbone = backbone
self.data_layout = data_layout
assert isinstance(anchor_generator, (AnchorGenerator, type(None)))
if anchor_generator is None:
anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3))) for x in [32, 64, 128, 256, 512])
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
anchor_generator = AnchorGenerator(
anchor_sizes, aspect_ratios
)
self.anchor_generator = anchor_generator
self.anchors = None
if head is None:
head = RetinaNetHead(backbone.out_channels, anchor_generator.num_anchors_per_location()[0], num_classes,
fusion=head_fusion)
self.head = head
if proposal_matcher is None:
proposal_matcher = Matcher(
fg_iou_thresh,
bg_iou_thresh,
allow_low_quality_matches=True,
)
else:
warnings.warn('proposal_matcher_batch is statically assigned to MatcherBatch')
self.proposal_matcher = proposal_matcher
self.proposal_matcher_batch = MatcherBatch(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=True)
self.score_thresh = score_thresh
self.nms_thresh = nms_thresh
self.detections_per_img = detections_per_img
self.topk_candidates = topk_candidates
self.box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
self.anchors = None
if image_size is None:
image_size = [800, 800]
if image_std is None:
image_std = [0.229, 0.224, 0.225]
if image_mean is None:
image_mean = [0.485, 0.456, 0.406]
self.transform = GeneralizedRCNNTransform(image_size=image_size,
image_mean=image_mean, image_std=image_std)
        # used only in torchscript mode
self._has_warned = False
@torch.jit.unused
def eager_outputs(self, losses, detections):
# type: (Dict[str, Tensor], List[Dict[str, Tensor]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
if self.training:
return losses
return detections
# --- original code ---
def get_matched_idxs(self, target_boxes):
matched_idxs = []
for anchors_per_image, boxes_per_image in zip(self.anchors, target_boxes):
if boxes_per_image.numel() == 0:
matched_idxs.append(torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64,
device=anchors_per_image.device))
continue
match_quality_matrix = box_iou(boxes_per_image, anchors_per_image)
matched_idxs.append(self.proposal_matcher(match_quality_matrix))
return torch.stack(matched_idxs)
    # --- parallel implementation ---
    # this implementation is not in use: (1) matching is already done as part of the DALI pipeline; and
    # (2) because of the significant padding of target_boxes, box_iou incurs a significant computational overhead
def get_matched_idxs_padded(self, target_boxes, batch_sz, max_boxes):
target_boxes_ = target_boxes.reshape(-1, 4)
match_quality_matrix = box_iou(target_boxes_, self.anchors[0])
match_quality_matrix = match_quality_matrix.reshape([batch_sz, max_boxes, -1])
matched_idxs = self.proposal_matcher_batch(match_quality_matrix)
return matched_idxs
# --- original code ---
def compute_loss(self, targets, head_outputs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor]) -> Dict[str, Tensor]
matched_idxs = []
for anchors_per_image, boxes_per_image in zip(self.anchors, targets['boxes']):
            # Uncomment to trim targets to MAX_BOXES, so this can be used as a reference
# boxes_per_image = boxes_per_image[0:MAX_BOXES, :]
if boxes_per_image.numel() == 0:
matched_idxs.append(torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64,
device=anchors_per_image.device))
continue
match_quality_matrix = box_iou(boxes_per_image, anchors_per_image)
matched_idxs.append(self.proposal_matcher(match_quality_matrix))
return self.head.compute_loss(targets, head_outputs, self.anchors, matched_idxs)
def update_anchors(self, images, device, features=None, dtype=torch.float16, force=False):
# TODO: should perhaps create once in the relevant constructor
if self.anchors is None or force is True:
if features is None:
# forward_opt uses the default grid size (100, 50, 25, 13, 7)
# images is the image tensor shape
self.anchors = self.anchor_generator.forward_opt(image_shape=images, device=device, dtype=dtype)
else:
# using the old method if the features are passed
self.anchors = self.anchor_generator.forward(images, features)
def eval_postprocess_detections(self, head_outputs, anchors, image_shapes):
# type: (Dict[str, List[Tensor]], List[List[Tensor]], List[Tuple[int, int]]) -> List[Dict[str, Tensor]]
class_logits = head_outputs['cls_logits']
box_regression = head_outputs['bbox_regression']
num_images = len(image_shapes)
detections: List[Dict[str, Tensor]] = []
for index in range(num_images):
box_regression_per_image = [br[index] for br in box_regression]
logits_per_image = [cl[index] for cl in class_logits]
            # anchors are identical for all images, i.e. anchors[i] == anchors[j] for every i != j
anchors_per_image, image_shape = anchors[0], image_shapes[index]
image_boxes = []
image_scores = []
image_labels = []
for box_regression_per_level, logits_per_level, anchors_per_level in \
zip(box_regression_per_image, logits_per_image, anchors_per_image):
num_classes = logits_per_level.shape[-1]
# remove low scoring boxes
scores_per_level = torch.sigmoid(logits_per_level).flatten()
keep_idxs = scores_per_level > self.score_thresh
scores_per_level = scores_per_level[keep_idxs]
topk_idxs = torch.where(keep_idxs)[0]
# keep only topk scoring predictions
num_topk = min(self.topk_candidates, topk_idxs.size(0))
scores_per_level, idxs = scores_per_level.topk(num_topk)
topk_idxs = topk_idxs[idxs]
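                # topk_idxs index the flattened (anchor, class) grid, hence anchor = idx // K and class = idx % K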
anchor_idxs = torch.div(topk_idxs, num_classes, rounding_mode='floor')
labels_per_level = topk_idxs % num_classes
boxes_per_level = self.box_coder.decode_single(box_regression_per_level[anchor_idxs],
anchors_per_level[anchor_idxs])
boxes_per_level = clip_boxes_to_image(boxes_per_level, image_shape)
image_boxes.append(boxes_per_level)
image_scores.append(scores_per_level)
image_labels.append(labels_per_level)
image_boxes = torch.cat(image_boxes, dim=0)
image_scores = torch.cat(image_scores, dim=0)
image_labels = torch.cat(image_labels, dim=0)
# non-maximum suppression
keep = batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
keep = keep[:self.detections_per_img]
detections.append({
'boxes': image_boxes[keep],
'scores': image_scores[keep],
'labels': image_labels[keep],
})
return detections
def eval_postprocess(self, images, features, targets, head_outputs, targets_dict=False):
# recover level sizes
num_anchors_per_level = [x.size(2) * x.size(3) for x in features]
HW = 0
for v in num_anchors_per_level:
HW += v
HWA = head_outputs['cls_logits'].size(1)
A = HWA // HW
num_anchors_per_level = [hw * A for hw in num_anchors_per_level]
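        # e.g. with the default 800x800 setup (levels of 100x100, 50x50, 25x25, 13x13 and 7x7)
        # and A = 9 anchors per location: HW = 13343 and HWA = 120087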
# split outputs per level
split_head_outputs: Dict[str, List[Tensor]] = {}
for k in head_outputs:
split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1))
split_anchors = [list(a.split(num_anchors_per_level)) for a in self.anchors]
# get the original image sizes
original_image_sizes = []
if targets_dict:
original_image_sizes = targets['original_image_size']
else:
for target in targets:
original_image_sizes.append(target['original_image_size'])
# compute the detections
detections = self.eval_postprocess_detections(split_head_outputs, split_anchors,
[(image.size(1), image.size(2)) for image in images])
detections = self.transform.postprocess(detections,
[(image.size(1), image.size(2)) for image in images],
original_image_sizes)
return detections
def validate_input(self, images, targets):
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
if self.training:
assert targets is not None
for boxes in targets["boxes"]:
if isinstance(boxes, torch.Tensor):
if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
raise ValueError("Expected target boxes to be a tensor"
"of shape [N, 4], got {:}.".format(
boxes.shape))
else:
raise ValueError("Expected target boxes to be of type "
"Tensor, got {:}.".format(type(boxes)))
# check for degenerate boxes
if targets is not None:
for target_idx, boxes in enumerate(targets["boxes"]):
degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
if degenerate_boxes.any():
# print the first degenerate box
bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
degen_bb: List[float] = boxes[bb_idx].tolist()
raise ValueError("All bounding boxes should have positive height and width."
" Found invalid box {} for target at index {}."
.format(degen_bb, target_idx))
def forward(self, images: Tensor) -> Tuple[Tensor]:
"""
Args:
images (Tensor): images to be processed
Returns:
            result (Tuple[Tensor]): the output from the model: [0]-[4] the feature pyramid levels
                (100x100, 50x50, 25x25, 13x13, 7x7), [5] the classification head output,
                [6] the regression head output
"""
# get the features from the backbone
features = self.backbone(images)
if isinstance(features, torch.Tensor):
features = OrderedDict([('0', features)])
features = list(features.values())
# compute the retinanet heads outputs using the features
head_outputs = self.head(features)
features.extend(head_outputs)
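        # features now holds the five pyramid maps followed by the classification and regression head outputs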
out = tuple(features)
return out
model_urls = {
'retinanet_resnet50_fpn_coco':
'https://download.pytorch.org/models/retinanet_resnet50_fpn_coco-eeacb38b.pth',
}
def retinanet_resnet50_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None):
"""
Constructs a RetinaNet model with a ResNet-50-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnet50', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size)
if pretrained:
state_dict = load_state_dict_from_url(model_urls['retinanet_resnet50_fpn_coco'],
progress=progress)
model.load_state_dict(state_dict)
overwrite_eps(model, 0.0)
return model
def retinanet_resnext50_32x4d_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None, jit=False, head_fusion=False, frozen_bn_opt=False):
"""
Constructs a RetinaNet model with a resnext50_32x4d-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnext50_32x4d_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnext50_32x4d', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers,
norm_layer=FrozenBatchNorm2d if frozen_bn_opt else misc_nn_ops.FrozenBatchNorm2d,
jit=jit)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size,
head_fusion=head_fusion)
if pretrained:
raise ValueError("Torchvision doesn't have a pretrained retinanet_resnext50_32x4d_fpn model")
return model
def retinanet_resnet101_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None):
"""
Constructs a RetinaNet model with a ResNet-101-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnet101_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnet101', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size)
if pretrained:
raise ValueError("Torchvision doesn't have a pretrained retinanet_resnet101_fpn model")
return model
def retinanet_resnext101_32x8d_fpn(num_classes, image_size, data_layout='channels_first',
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None):
"""
Constructs a RetinaNet model with a resnext101_32x8d-FPN backbone.
Reference: `"Focal Loss for Dense Object Detection" <https://arxiv.org/abs/1708.02002>`_.
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a ``Dict[Tensor]`` during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (``Int64Tensor[N]``): the predicted labels for each detection
- scores (``Tensor[N]``): the scores of each detection
For more details on the output, you may refer to :ref:`instance_seg_output`.
Example::
>>> model = torchvision.models.detection.retinanet_resnext101_32x8d_fpn(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Args:
num_classes (int): number of output classes of the model (including the background)
image_size (list(int, int)): Image size
data_layout (str): model data layout (channels_first or channels_last)
pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
"""
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 5, 3)
if pretrained:
# no need to download the backbone if pretrained is set
pretrained_backbone = False
# skip P2 because it generates too many anchors (according to their paper)
backbone = resnet_fpn_backbone('resnext101_32x8d', pretrained_backbone, returned_layers=[2, 3, 4],
extra_blocks=LastLevelP6P7(256, 256, module_name="module.backbone.fpn.extra_blocks"),
trainable_layers=trainable_backbone_layers)
model = RetinaNet(backbone=backbone, num_classes=num_classes, data_layout=data_layout, image_size=image_size)
if pretrained:
raise ValueError("Torchvision doesn't have a pretrained retinanet_resnext101_32x8d_fpn model")
return model
def retinanet_from_backbone(backbone,
num_classes=91, data_layout='channels_first', image_size=None,
pretrained=False, progress=True, pretrained_backbone=True,
trainable_backbone_layers=None, jit=False, head_fusion=False, frozen_bn_opt=False):
if image_size is None:
image_size = [800, 800]
if backbone == "resnet50":
return retinanet_resnet50_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers)
elif backbone == "resnext50_32x4d":
return retinanet_resnext50_32x4d_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers, jit=jit,
head_fusion=head_fusion, frozen_bn_opt=frozen_bn_opt)
elif backbone == "resnet101":
return retinanet_resnet101_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers)
elif backbone == "resnext101_32x8d":
return retinanet_resnext101_32x8d_fpn(num_classes=num_classes, data_layout=data_layout, image_size=image_size,
pretrained=pretrained, progress=progress,
pretrained_backbone=pretrained_backbone,
trainable_backbone_layers=trainable_backbone_layers)
else:
raise ValueError(f"Unknown backbone {backbone}")
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torchvision
import torch.nn.functional as F
from torch import nn, Tensor
from torchvision.ops import boxes as box_ops
from torchvision.ops import roi_align
from typing import Optional, List, Dict, Tuple
from model.utils import BoxCoder, Matcher
def expand_boxes(boxes, scale):
# type: (Tensor, float) -> Tensor
w_half = (boxes[:, 2] - boxes[:, 0]) * .5
h_half = (boxes[:, 3] - boxes[:, 1]) * .5
x_c = (boxes[:, 2] + boxes[:, 0]) * .5
y_c = (boxes[:, 3] + boxes[:, 1]) * .5
w_half *= scale
h_half *= scale
boxes_exp = torch.zeros_like(boxes)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
def expand_masks(mask, padding):
# type: (Tensor, int) -> Tuple[Tensor, float]
M = mask.shape[-1]
scale = float(M + 2 * padding) / M
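    # e.g. with M = 28 and padding = 1 the scale is 30 / 28, and the boxes are later
    # expanded by the same relative factor so the padded mask still lines up with its box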
padded_mask = F.pad(mask, (padding,) * 4)
return padded_mask, scale
def paste_mask_in_image(mask, box, im_h, im_w):
# type: (Tensor, Tensor, int, int) -> Tensor
TO_REMOVE = 1
w = int(box[2] - box[0] + TO_REMOVE)
h = int(box[3] - box[1] + TO_REMOVE)
w = max(w, 1)
h = max(h, 1)
# Set shape to [batchxCxHxW]
mask = mask.expand((1, 1, -1, -1))
# Resize mask
mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
mask = mask[0][0]
im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
x_0 = max(box[0], 0)
x_1 = min(box[2] + 1, im_w)
y_0 = max(box[1], 0)
y_1 = min(box[3] + 1, im_h)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - box[1]):(y_1 - box[1]), (x_0 - box[0]):(x_1 - box[0])
]
return im_mask
def paste_masks_in_image(masks, boxes, img_shape, padding=1):
# type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
masks, scale = expand_masks(masks, padding=padding)
boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
im_h, im_w = img_shape
res = [
paste_mask_in_image(m[0], b, im_h, im_w)
for m, b in zip(masks, boxes)
]
if len(res) > 0:
ret = torch.stack(res, dim=0)[:, None]
else:
ret = masks.new_empty((0, 1, im_h, im_w))
return ret
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
import torchvision
from torch import nn, Tensor
from typing import List, Tuple, Dict, Optional
from model.image_list import ImageList
from model.roi_heads import paste_masks_in_image
@torch.jit.unused
def _get_shape_onnx(image: Tensor) -> Tensor:
from torch.onnx import operators
return operators.shape_as_tensor(image)[-2:]
@torch.jit.unused
def _fake_cast_onnx(v: Tensor) -> float:
# ONNX requires a tensor but here we fake its type for JIT.
return v
def _resize_image_and_masks(image: Tensor,
target: Optional[Dict[str, Tensor]] = None,
image_size: Optional[Tuple[int, int]] = None,
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
if torchvision._is_tracing():
im_shape = _get_shape_onnx(image)
else:
im_shape = torch.tensor(image.shape[-2:])
image = torch.nn.functional.interpolate(image[None], size=image_size, scale_factor=None, mode='bilinear',
recompute_scale_factor=None, align_corners=False)[0]
if target is None:
return image, target
if "masks" in target:
mask = target["masks"]
mask = torch.nn.functional.interpolate(mask[:, None].float(), size=image_size, scale_factor=None,
recompute_scale_factor=None)[:, 0].byte()
target["masks"] = mask
return image, target
class GeneralizedRCNNTransform(nn.Module):
"""
Performs input / target transformation before feeding the data to a GeneralizedRCNN
model.
    The transformations it performs are:
- input normalization (mean subtraction and std division)
- input / target resizing to match image_size
    It returns an ImageList for the inputs, and a List[Dict[Tensor]] for the targets
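    Example (illustrative; the normalization constants are the ImageNet values used elsewhere in this file)::
        >>> transform = GeneralizedRCNNTransform(image_size=(800, 800),
        >>>                                      image_mean=[0.485, 0.456, 0.406],
        >>>                                      image_std=[0.229, 0.224, 0.225])
        >>> images = [torch.rand(3, 600, 400), torch.rand(3, 500, 700)]
        >>> image_list, _ = transform(images)
        >>> image_list.tensors.shape  # -> torch.Size([2, 3, 800, 800])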
"""
def __init__(self, image_size: Optional[Tuple[int, int]],
image_mean: List[float], image_std: List[float],):
super(GeneralizedRCNNTransform, self).__init__()
self.image_size = image_size
self.image_mean = image_mean
self.image_std = image_std
def forward(self,
images: List[Tensor],
targets: Optional[List[Dict[str, Tensor]]] = None
) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]]:
images = [img for img in images]
if targets is not None:
# make a copy of targets to avoid modifying it in-place
# once torchscript supports dict comprehension
# this can be simplified as follows
# targets = [{k: v for k,v in t.items()} for t in targets]
targets_copy: List[Dict[str, Tensor]] = []
for t in targets:
data: Dict[str, Tensor] = {}
for k, v in t.items():
data[k] = v
targets_copy.append(data)
targets = targets_copy
for i in range(len(images)):
image = images[i]
target_index = targets[i] if targets is not None else None
if image.dim() != 3:
raise ValueError("images is expected to be a list of 3d tensors "
"of shape [C, H, W], got {}".format(image.shape))
image = self.normalize(image)
image, target_index = self.resize(image, target_index)
images[i] = image
if targets is not None and target_index is not None:
targets[i] = target_index
image_sizes = [img.shape[-2:] for img in images]
images = torch.stack(images)
image_sizes_list: List[Tuple[int, int]] = []
for image_size in image_sizes:
assert len(image_size) == 2
image_sizes_list.append((image_size[0], image_size[1]))
image_list = ImageList(images, image_sizes_list)
return image_list, targets
def normalize(self, image: Tensor) -> Tensor:
if not image.is_floating_point():
raise TypeError(
f"Expected input images to be of floating type (in range [0, 1]), "
f"but found type {image.dtype} instead"
)
dtype, device = image.dtype, image.device
mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device)
std = torch.as_tensor(self.image_std, dtype=dtype, device=device)
return (image - mean[:, None, None]) / std[:, None, None]
def torch_choice(self, k: List[int]) -> int:
"""
Implements `random.choice` via torch ops so it can be compiled with
TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
is fixed.
"""
index = int(torch.empty(1).uniform_(0., float(len(k))).item())
return k[index]
def resize(self,
image: Tensor,
target: Optional[Dict[str, Tensor]] = None,
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
h, w = image.shape[-2:]
image, target = _resize_image_and_masks(image, target, self.image_size)
if target is None:
return image, target
bbox = target["boxes"]
bbox = resize_boxes(bbox, (h, w), image.shape[-2:])
target["boxes"] = bbox
if "keypoints" in target:
keypoints = target["keypoints"]
keypoints = resize_keypoints(keypoints, (h, w), image.shape[-2:])
target["keypoints"] = keypoints
return image, target
def postprocess(self,
result: List[Dict[str, Tensor]],
image_shapes: List[Tuple[int, int]],
original_image_sizes: List[Tuple[int, int]]
) -> List[Dict[str, Tensor]]:
if self.training:
return result
for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)):
boxes = pred["boxes"]
boxes = resize_boxes(boxes, im_s, o_im_s)
result[i]["boxes"] = boxes
if "masks" in pred:
masks = pred["masks"]
masks = paste_masks_in_image(masks, boxes, o_im_s)
result[i]["masks"] = masks
if "keypoints" in pred:
keypoints = pred["keypoints"]
keypoints = resize_keypoints(keypoints, im_s, o_im_s)
result[i]["keypoints"] = keypoints
return result
def __repr__(self) -> str:
format_string = self.__class__.__name__ + '('
_indent = '\n '
format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std)
format_string += "{0}Resize(height={1}, width={2}, mode='bilinear')".format(_indent, self.image_size[0],
self.image_size[1])
format_string += '\n)'
return format_string
def resize_keypoints(keypoints: Tensor, original_size: List[int], new_size: List[int]) -> Tensor:
ratios = [
torch.tensor(s, dtype=torch.float32, device=keypoints.device) /
torch.tensor(s_orig, dtype=torch.float32, device=keypoints.device)
for s, s_orig in zip(new_size, original_size)
]
ratio_h, ratio_w = ratios
resized_data = keypoints.clone()
if torch._C._get_tracing_state():
resized_data_0 = resized_data[:, :, 0] * ratio_w
resized_data_1 = resized_data[:, :, 1] * ratio_h
resized_data = torch.stack((resized_data_0, resized_data_1, resized_data[:, :, 2]), dim=2)
else:
resized_data[..., 0] *= ratio_w
resized_data[..., 1] *= ratio_h
return resized_data
def resize_boxes(boxes: Tensor, original_size: List[int], new_size: List[int]) -> Tensor:
ratios = [
torch.tensor(s, dtype=torch.float32, device=boxes.device) /
torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)
for s, s_orig in zip(new_size, original_size)
]
ratio_height, ratio_width = ratios
xmin, ymin, xmax, ymax = boxes.unbind(1)
xmin = xmin * ratio_width
xmax = xmax * ratio_width
ymin = ymin * ratio_height
ymax = ymax * ratio_height
return torch.stack((xmin, ymin, xmax, ymax), dim=1)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
from collections import OrderedDict
from torch import Tensor, nn
from typing import List, Tuple, Dict
from .frozen_bn import FrozenBatchNorm2d
class IntermediateLayerGetter(nn.ModuleDict):
"""
Module wrapper that returns intermediate layers from a model
It has a strong assumption that the modules have been registered
into the model in the same order as they are used.
This means that one should **not** reuse the same nn.Module
twice in the forward if you want this to work.
Additionally, it is only able to query submodules that are directly
assigned to the model. So if `model` is passed, `model.feature1` can
be returned, but not `model.feature1.layer2`.
Args:
model (nn.Module): model on which we will extract the features
return_layers (Dict[name, new_name]): a dict containing the names
of the modules for which the activations will be returned as
the key of the dict, and the value of the dict is the name
of the returned activation (which the user can specify).
Examples::
>>> m = torchvision.models.resnet18(pretrained=True)
        >>> # extract layer1 and layer3, giving as names `feat1` and `feat2`
>>> new_m = torchvision.models._utils.IntermediateLayerGetter(m,
>>> {'layer1': 'feat1', 'layer3': 'feat2'})
>>> out = new_m(torch.rand(1, 3, 224, 224))
>>> print([(k, v.shape) for k, v in out.items()])
>>> [('feat1', torch.Size([1, 64, 56, 56])),
>>> ('feat2', torch.Size([1, 256, 14, 14]))]
"""
_version = 2
__annotations__ = {
"return_layers": Dict[str, str],
}
def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None:
if not set(return_layers).issubset([name for name, _ in model.named_children()]):
raise ValueError("return_layers are not present in model")
orig_return_layers = return_layers
return_layers = {str(k): str(v) for k, v in return_layers.items()}
layers = OrderedDict()
for name, module in model.named_children():
layers[name] = module
if name in return_layers:
del return_layers[name]
if not return_layers:
break
super(IntermediateLayerGetter, self).__init__(layers)
self.return_layers = orig_return_layers
def forward(self, x):
out = OrderedDict()
for name, module in self.items():
x = module(x)
if name in self.return_layers:
out_name = self.return_layers[name]
out[out_name] = x
return out
@torch.jit._script_if_tracing
def encode_boxes(reference_boxes, proposals, weights):
# type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
weights (Tensor[4]): the weights for ``(x, y, w, h)``
"""
# perform some unpacking to make it JIT-fusion friendly
wx = weights[0]
wy = weights[1]
ww = weights[2]
wh = weights[3]
proposals_x1 = proposals[:, 0].unsqueeze(1)
proposals_y1 = proposals[:, 1].unsqueeze(1)
proposals_x2 = proposals[:, 2].unsqueeze(1)
proposals_y2 = proposals[:, 3].unsqueeze(1)
reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)
# implementation starts here
ex_widths = proposals_x2 - proposals_x1
ex_heights = proposals_y2 - proposals_y1
ex_ctr_x = proposals_x1 + 0.5 * ex_widths
ex_ctr_y = proposals_y1 + 0.5 * ex_heights
gt_widths = reference_boxes_x2 - reference_boxes_x1
gt_heights = reference_boxes_y2 - reference_boxes_y1
gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = ww * torch.log(gt_widths / ex_widths)
targets_dh = wh * torch.log(gt_heights / ex_heights)
targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
return targets
# Similar to encode_boxes, but accepts tensors with batch dimension
@torch.jit._script_if_tracing
def encode_boxes_batch(reference_boxes, proposals, weights):
# type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
weights (Tensor[4]): the weights for ``(x, y, w, h)``
"""
# perform some unpacking to make it JIT-fusion friendly
wx = weights[0]
wy = weights[1]
ww = weights[2]
wh = weights[3]
proposals_x1 = proposals[:, :, 0]
proposals_y1 = proposals[:, :, 1]
proposals_x2 = proposals[:, :, 2]
proposals_y2 = proposals[:, :, 3]
reference_boxes_x1 = reference_boxes[:, :, 0]
reference_boxes_y1 = reference_boxes[:, :, 1]
reference_boxes_x2 = reference_boxes[:, :, 2]
reference_boxes_y2 = reference_boxes[:, :, 3]
# implementation starts here
ex_widths = proposals_x2 - proposals_x1
ex_heights = proposals_y2 - proposals_y1
ex_ctr_x = proposals_x1 + 0.5 * ex_widths
ex_ctr_y = proposals_y1 + 0.5 * ex_heights
gt_widths = reference_boxes_x2 - reference_boxes_x1
gt_heights = reference_boxes_y2 - reference_boxes_y1
gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights
targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = ww * torch.log(gt_widths / ex_widths)
targets_dh = wh * torch.log(gt_heights / ex_heights)
targets = torch.cat((targets_dx[:, :, None], targets_dy[:, :, None], targets_dw[:, :, None], targets_dh[:, :, None]), dim=2)
return targets
class BoxCoder(object):
"""
This class encodes and decodes a set of bounding boxes into
the representation used for training the regressors.
"""
def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
# type: (Tuple[float, float, float, float], float) -> None
"""
Args:
weights (4-element tuple)
bbox_xform_clip (float)
"""
self.weights = weights
self.weights_as_tensor = None
self.bbox_xform_clip = bbox_xform_clip
def encode(self, reference_boxes, proposals):
# type: (List[Tensor], List[Tensor]) -> List[Tensor]
boxes_per_image = [len(b) for b in reference_boxes]
reference_boxes = torch.cat(reference_boxes, dim=0)
proposals = torch.cat(proposals, dim=0)
targets = self.encode_single(reference_boxes, proposals)
return targets.split(boxes_per_image, 0)
def encode_single(self, reference_boxes, proposals):
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
"""
dtype = reference_boxes.dtype
device = reference_boxes.device
weights = torch.as_tensor(self.weights, dtype=dtype, device=device)
targets = encode_boxes(reference_boxes, proposals, weights)
return targets
# Similar to encode_single, just a wrapper for a batched input
def encode_batch(self, reference_boxes, proposals):
"""
Encode a set of proposals with respect to some
reference boxes
Args:
reference_boxes (Tensor): reference boxes
proposals (Tensor): boxes to be encoded
"""
dtype = reference_boxes.dtype
device = reference_boxes.device
if self.weights_as_tensor is None:
self.weights_as_tensor = torch.as_tensor(self.weights, dtype=dtype, device=device)
weights = self.weights_as_tensor
targets = encode_boxes_batch(reference_boxes, proposals, weights)
return targets
def decode(self, rel_codes, boxes):
# type: (Tensor, List[Tensor]) -> Tensor
assert isinstance(boxes, (list, tuple))
assert isinstance(rel_codes, torch.Tensor)
boxes_per_image = [b.size(0) for b in boxes]
concat_boxes = torch.cat(boxes, dim=0)
box_sum = 0
for val in boxes_per_image:
box_sum += val
if box_sum > 0:
rel_codes = rel_codes.reshape(box_sum, -1)
pred_boxes = self.decode_single(
rel_codes, concat_boxes
)
if box_sum > 0:
pred_boxes = pred_boxes.reshape(box_sum, -1, 4)
return pred_boxes
def decode_single(self, rel_codes, boxes):
"""
From a set of original boxes and encoded relative box offsets,
get the decoded boxes.
Args:
rel_codes (Tensor): encoded boxes
boxes (Tensor): reference boxes.
"""
boxes = boxes.to(rel_codes.dtype)
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = self.weights
dx = rel_codes[:, 0::4] / wx
dy = rel_codes[:, 1::4] / wy
dw = rel_codes[:, 2::4] / ww
dh = rel_codes[:, 3::4] / wh
# Prevent sending too large values into torch.exp()
dw = torch.clamp(dw, max=self.bbox_xform_clip)
dh = torch.clamp(dh, max=self.bbox_xform_clip)
pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
pred_w = torch.exp(dw) * widths[:, None]
pred_h = torch.exp(dh) * heights[:, None]
# Distance from center to box's corner.
c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h
c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w
pred_boxes1 = pred_ctr_x - c_to_c_w
pred_boxes2 = pred_ctr_y - c_to_c_h
pred_boxes3 = pred_ctr_x + c_to_c_w
pred_boxes4 = pred_ctr_y + c_to_c_h
pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1)
return pred_boxes
class Matcher(object):
"""
This class assigns to each predicted "element" (e.g., a box) a ground-truth
element. Each predicted element will have exactly zero or one matches; each
ground-truth element may be assigned to zero or more predicted elements.
    Matching is based on the MxN match_quality_matrix, which characterizes how well
    each (ground-truth, prediction) pair matches. For example, if the elements are
boxes, the matrix may contain box IoU overlap values.
The matcher returns a tensor of size N containing the index of the ground-truth
element m that matches to prediction n. If there is no match, a negative value
is returned.
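    Example (illustrative; the thresholds match the RetinaNet defaults used above)::
        >>> matcher = Matcher(high_threshold=0.5, low_threshold=0.4, allow_low_quality_matches=True)
        >>> iou = torch.tensor([[0.90, 0.45, 0.10],
        >>>                     [0.20, 0.60, 0.05]])  # 2 ground-truth boxes x 3 predictions
        >>> matcher(iou)  # -> tensor([0, 1, -1])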
"""
BELOW_LOW_THRESHOLD = -1
BETWEEN_THRESHOLDS = -2
__annotations__ = {
'BELOW_LOW_THRESHOLD': int,
'BETWEEN_THRESHOLDS': int,
}
def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
# type: (float, float, bool) -> None
"""
Args:
high_threshold (float): quality values greater than or equal to
this value are candidate matches.
low_threshold (float): a lower quality threshold used to stratify
matches into three levels:
1) matches >= high_threshold
2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold)
3) BELOW_LOW_THRESHOLD matches in [0, low_threshold)
allow_low_quality_matches (bool): if True, produce additional matches
for predictions that have only low-quality match candidates. See
set_low_quality_matches_ for more details.
"""
self.BELOW_LOW_THRESHOLD = -1
self.BETWEEN_THRESHOLDS = -2
assert low_threshold <= high_threshold
self.high_threshold = high_threshold
self.low_threshold = low_threshold
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
"""
Args:
match_quality_matrix (Tensor[float]): an MxN tensor, containing the
pairwise quality between M ground-truth elements and N predicted elements.
Returns:
matches (Tensor[int64]): an N tensor where matches[i] is the index of the
matched gt in [0, M - 1], or a negative value indicating that prediction i
could not be matched.
"""
if match_quality_matrix.numel() == 0:
# empty targets or proposals not supported during training
if match_quality_matrix.shape[0] == 0:
raise ValueError(
"No ground-truth boxes available for one of the images "
"during training")
else:
raise ValueError(
"No proposal boxes available for one of the images "
"during training")
# match_quality_matrix is M (gt) x N (predicted)
# Max over gt elements (dim 0) to find best gt candidate for each prediction
matched_vals, matches = match_quality_matrix.max(dim=0)
if self.allow_low_quality_matches:
all_matches = matches.clone()
else:
all_matches = None
# Assign candidate matches with low quality to negative (unassigned) values
below_low_threshold = matched_vals < self.low_threshold
between_thresholds = (matched_vals >= self.low_threshold) & (
matched_vals < self.high_threshold
)
matches[below_low_threshold] = self.BELOW_LOW_THRESHOLD
matches[between_thresholds] = self.BETWEEN_THRESHOLDS
if self.allow_low_quality_matches:
assert all_matches is not None
self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)
return matches
def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix):
"""
Produce additional matches for predictions that have only low-quality matches.
Specifically, for each ground-truth find the set of predictions that have
maximum overlap with it (including ties); for each prediction in that set, if
it is unmatched, then match it to the ground-truth with which it has the highest
quality value.
"""
# For each gt, find the prediction with which it has highest quality
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find highest quality match available, even if it is low, including ties
gt_pred_pairs_of_highest_quality = torch.where(
match_quality_matrix == highest_quality_foreach_gt[:, None]
)
# Example gt_pred_pairs_of_highest_quality:
# tensor([[ 0, 39796],
# [ 1, 32055],
# [ 1, 32070],
# [ 2, 39190],
# [ 2, 40255],
# [ 3, 40390],
# [ 3, 41455],
# [ 4, 45470],
# [ 5, 45325],
# [ 5, 46390]])
# Each row is a (gt index, prediction index)
# Note how gt items 1, 2, 3, and 5 each have two ties
pred_inds_to_update = gt_pred_pairs_of_highest_quality[1]
matches[pred_inds_to_update] = all_matches[pred_inds_to_update]
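# Illustrative example for Matcher (a sketch, not part of the original file):
# with a 2x3 IoU matrix (2 ground-truth boxes, 3 predictions) and thresholds
# (0.5, 0.4), prediction 2 falls between the thresholds and is marked -2.
# >>> matcher = Matcher(high_threshold=0.5, low_threshold=0.4, allow_low_quality_matches=False)
# >>> iou = torch.tensor([[0.9, 0.3, 0.45],
# ...                     [0.1, 0.6, 0.20]])
# >>> matcher(iou)
# tensor([ 0,  1, -2])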
# Similar to Matcher above, but operates on batched input
# See the Matcher class above for additional comments
class MatcherBatch(object):
BELOW_LOW_THRESHOLD = -1
BETWEEN_THRESHOLDS = -2
__annotations__ = {
'BELOW_LOW_THRESHOLD': int,
'BETWEEN_THRESHOLDS': int,
}
def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
# type: (float, float, bool) -> None
self.BELOW_LOW_THRESHOLD = -1
self.BETWEEN_THRESHOLDS = -2
assert low_threshold <= high_threshold
self.high_threshold = high_threshold
self.low_threshold = low_threshold
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
# TODO: move to preprocessing
if match_quality_matrix.numel() == 0:
# empty targets or proposals not supported during training
if match_quality_matrix.shape[0] == 0:
raise ValueError(
"No ground-truth boxes available for one of the images "
"during training")
else:
raise ValueError(
"No proposal boxes available for one of the images "
"during training")
matched_vals, matches = match_quality_matrix.max(dim=1)
all_matches = matches.clone() if self.allow_low_quality_matches else None
below_low_threshold = matched_vals < self.low_threshold
between_thresholds = (matched_vals >= self.low_threshold) & (matched_vals < self.high_threshold)
matches = torch.where(below_low_threshold, self.BELOW_LOW_THRESHOLD, matches)
matches = torch.where(between_thresholds, self.BETWEEN_THRESHOLDS, matches)
if self.allow_low_quality_matches:
assert all_matches is not None
matches = self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)
return matches
def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix):
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=2)
gt_pred_pairs_of_highest_quality = \
torch.where((match_quality_matrix == highest_quality_foreach_gt[:, :, None]) &
(match_quality_matrix != 0), 1, 0)
gt_pred_pairs_of_highest_quality = gt_pred_pairs_of_highest_quality.sum(dim=1)
matches = torch.where(gt_pred_pairs_of_highest_quality >= 1, all_matches, matches)
return matches
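# Illustrative example for MatcherBatch (a sketch, not part of the original file;
# the batched quality matrix shape [batch, M, N] is inferred from the dim=1/dim=2
# reductions above):
# >>> matcher = MatcherBatch(high_threshold=0.5, low_threshold=0.4, allow_low_quality_matches=False)
# >>> iou = torch.tensor([[[0.9, 0.3, 0.45],
# ...                      [0.1, 0.6, 0.20]]])
# >>> matcher(iou)
# tensor([[ 0,  1, -2]])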
class SSDMatcher(Matcher):
def __init__(self, threshold):
super().__init__(threshold, threshold, allow_low_quality_matches=False)
def __call__(self, match_quality_matrix):
matches = super().__call__(match_quality_matrix)
# For each gt, find the prediction with which it has the highest quality
_, highest_quality_pred_foreach_gt = match_quality_matrix.max(dim=1)
matches[highest_quality_pred_foreach_gt] = torch.arange(highest_quality_pred_foreach_gt.size(0),
dtype=torch.int64,
device=highest_quality_pred_foreach_gt.device)
return matches
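# Illustrative example for SSDMatcher (a sketch, not part of the original file):
# each ground truth is force-matched to its best prediction even when the IoU is
# below the threshold (here gt 1 claims prediction 1 despite an IoU of only 0.4).
# >>> matcher = SSDMatcher(threshold=0.5)
# >>> iou = torch.tensor([[0.9, 0.3],
# ...                     [0.1, 0.4]])
# >>> matcher(iou)
# tensor([0, 1])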
def overwrite_eps(model, eps):
"""
This method overwrites the default eps values of all the
FrozenBatchNorm2d layers of the model with the provided value.
This is necessary to address the BC-breaking change introduced
by the bug-fix at pytorch/vision#2933. The overwrite is applied
only when the pretrained weights are loaded to maintain compatibility
with previous versions.
Args:
model (nn.Module): The model on which we perform the overwrite.
eps (float): The new value of eps.
"""
for module in model.modules():
if isinstance(module, FrozenBatchNorm2d):
module.eps = eps
def retrieve_out_channels(model, size):
"""
This method retrieves the number of output channels of a specific model.
Args:
model (nn.Module): The model for which we estimate the out_channels.
It should return a single Tensor or an OrderedDict[Tensor].
size (Tuple[int, int]): The size (wxh) of the input.
Returns:
out_channels (List[int]): A list of the output channels of the model.
"""
in_training = model.training
model.eval()
with torch.no_grad():
# Use dummy data to retrieve the feature map sizes to avoid hard-coding their values
device = next(model.parameters()).device
tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device)
features = model(tmp_img)
if isinstance(features, torch.Tensor):
features = OrderedDict([('0', features)])
out_channels = [x.size(1) for x in features.values()]
if in_training:
model.train()
return out_channels
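# Illustrative example for retrieve_out_channels (a sketch, not part of the original
# file; assumes a ResNet50+FPN backbone built with this repo's resnet_fpn_backbone
# and its default 256 FPN channels):
# >>> backbone = resnet_fpn_backbone('resnet50', pretrained=False)
# >>> retrieve_out_channels(backbone, size=(320, 320))
# [256, 256, 256, 256, 256]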
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import utils
from engine import preprocessing, init_scratchpad, loss_preprocessing, compute_loss, compute_matched_idxs
import copy
def whole_model_capture(model, optimizer, scaler, dataset, args):
print('CUDA graph capture')
# save original params for later
model_bak = copy.deepcopy(model.state_dict())
optimizer_bak = copy.deepcopy(optimizer.state_dict())
model.train()
# direct pointer to the model
model_ptr = model.module if args.distributed else model
# extracting the device name from some layer
device = model_ptr.backbone.body.conv1.weight.device
# Convert epochs to iterations
# we want to control warmup at the epoch level, but update lr every iteration
start_iter = 0
dataset_len = len(dataset) if dataset is not None else int(args.train_sz / args.batch_size / utils.get_world_size())
warmup_iters = args.warmup_epochs * dataset_len
lr_scheduler = utils.warmup_lr_scheduler(optimizer, start_iter, warmup_iters, args.warmup_factor)
if args.cuda_graphs_syn:
assert (dataset is None)
images, targets = [], {'boxes': [], 'labels': []}
for b in range(args.batch_size):
# These are just arbitrary sizes for model capture
images.append(torch.randint(low=0, high=256, size=[3, 1000, 1000], device=device).float() / 255)
targets['boxes'].append(torch.tensor([[10, 20, 30, 40]], device=device))
targets['labels'].append(torch.tensor([1], device=device))
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
else:
images, targets = [], []
# taking the first batch
for images_, targets_ in dataset:
images = images_
targets = targets_
break
# if not DALI, then we should preprocess the data
if not args.dali:
images = list(image.to(device, non_blocking=True) for image in images)
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
# --- preprocessing
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
# DALI can compute matched_idxs and put them in targets; if it hasn't done so, compute them here
if 'matched_idxs' not in targets:
with torch.cuda.amp.autocast(enabled=args.amp):
targets['matched_idxs'] = compute_matched_idxs(targets['boxes'], model_ptr)
with torch.cuda.amp.autocast(enabled=args.amp):
init_scratchpad(images, targets, args.batch_size, args.num_classes, args.amp,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad,
args.cuda_graphs)
if args.not_graphed_prologues:
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes'],
utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels'],
utils.ScratchPad.target_matched_idxs, model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
static_matched_idxs = torch.zeros_like(targets['matched_idxs'])
static_matched_idxs.copy_(targets['matched_idxs'])
# --- warmup
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
for j in range(11):
if args.apex_adam:
# set_to_none is True by default
optimizer.zero_grad()
else:
optimizer.zero_grad(set_to_none=True)
# lr_scheduler.step()
with torch.cuda.amp.autocast(enabled=args.amp):
if not args.not_graphed_prologues:
# preprocess everything that does not require model forward and backward
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes'],
utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels'],
utils.ScratchPad.target_matched_idxs, model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
# forward
model_output = model(images)
# features = model_output[0:5]
# head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
cls_loss, reg_loss = compute_loss(model_ptr, model_output[5], model_output[6], valid_idxs,
gt_classes_target, num_foreground, target_regression,
foreground_idxs_mask, args.apex_focal_loss, args.reg_head_pad)
losses = cls_loss + reg_loss
assert(not torch.isnan(losses))
# backward
scaler.scale(losses).backward()
# optimizer
scaler.step(optimizer)
scaler.update()
torch.cuda.current_stream().wait_stream(s)
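# Note: the warmup iterations above are run on a side stream so that memory-pool
# allocations, cuDNN algorithm selection, and the AMP grad-scaler state settle
# before capture, following the pattern recommended in the PyTorch CUDA Graphs docs.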
# --- capture
g = torch.cuda.CUDAGraph()
if args.apex_adam:
# set_to_none is True by default
optimizer.zero_grad()
else:
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
# # LR was already copied during warmup
# if args.warmup_epochs > 0:
# lr_scheduler.step()
with torch.cuda.amp.autocast(enabled=args.amp):
if not args.not_graphed_prologues:
# loss_preprocessing is now part of the graph
gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask = \
loss_preprocessing(utils.ScratchPad.target_boxes_padded if args.reg_head_pad else targets['boxes'],
utils.ScratchPad.target_labels_padded if args.cls_head_pad else targets['labels'],
utils.ScratchPad.target_matched_idxs, model_ptr,
args.apex_focal_loss, args.max_boxes, args.cls_head_pad, args.reg_head_pad)
# forward
static_model_output = model(images)
# loss
static_cls_loss, static_reg_loss = compute_loss(model_ptr, static_model_output[5], static_model_output[6],
valid_idxs, gt_classes_target, num_foreground,
target_regression, foreground_idxs_mask,
args.apex_focal_loss, args.reg_head_pad)
static_loss = static_cls_loss + static_reg_loss
# backward
scaler.scale(static_loss).backward()
# scaler.step(optimizer)
# scaler.update()
scaler.step(optimizer)
# set scaler and model back to their default values
scaler.update(65536.0)
model.load_state_dict(model_bak)
optimizer.load_state_dict(optimizer_bak)
if args.not_graphed_prologues:
static_prologues_out = [gt_classes_target, target_regression, num_foreground, valid_idxs, foreground_idxs_mask]
else:
static_prologues_out = None
return g, images, static_loss, static_prologues_out
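# Illustrative replay sketch for whole_model_capture (not part of the original file;
# `new_images` is a hypothetical tensor with the same shape/dtype/device as the
# returned static `images` buffer, which is assumed to be a single batched tensor):
# >>> images.copy_(new_images)   # refill the captured input buffer in place
# >>> g.replay()                 # re-run the captured forward/loss/backward work
# >>> print(static_loss.item())  # the captured loss tensor now holds the new value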
def whole_model_capture_eval(model, dataset, args):
# save original params for later
model_bak = copy.deepcopy(model.state_dict())
# direct pointer to the model
model_ptr = model.module if args.distributed else model
# extracting the device name from some layer
device = model_ptr.backbone.body.conv1.weight.device
if args.cuda_graphs_syn:
assert (dataset is None)
images, targets = [], {'boxes': [], 'labels': []}
for b in range(args.eval_batch_size):
# These are just arbitrary sizes for model capture
images.append(torch.rand([3, 1000, 1000], device=device))
targets['boxes'].append(torch.tensor([[10, 20, 30, 40]], device=device))
targets['labels'].append(torch.tensor([1], device=device))
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
else:
images, targets = [], []
# taking the first batch
for images_, targets_ in dataset:
images = images_
targets = targets_
break
# if not DALI, then we should preprocess the data
if not args.dali:
images = list(image.to(device, non_blocking=True) for image in images)
targets = {k: [dic[k].to(device, non_blocking=True) for dic in targets] for k in targets[0]}
# --- preprocessing
images, targets = preprocessing(images, targets, model_ptr, args.data_layout)
# --- warmup
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
for j in range(11):
with torch.cuda.amp.autocast(enabled=args.amp):
# forward
model_output = model(images)
# features = model_output[0:5]
# head_outputs = {'cls_logits': model_output[5], 'bbox_regression': model_output[6]}
torch.cuda.current_stream().wait_stream(s)
# --- capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
with torch.cuda.amp.autocast(enabled=args.amp):
# forward
static_model_output = model(images)
return g, images, static_model_output
def model_eval_warmup(model, batch_size, iters, args):
model.eval()
# direct pointer to the model
model_ptr = model.module if args.distributed else model
# extracting the device name from some layer
device = model_ptr.backbone.body.conv1.weight.device
for i in range(iters):
with torch.cuda.amp.autocast(enabled=args.amp):
x = torch.rand([batch_size, 3, args.image_size[0], args.image_size[1]], device=device)
model(x)
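# Illustrative call for model_eval_warmup (a sketch, not part of the original file;
# the iteration count is arbitrary):
# >>> model_eval_warmup(model, batch_size=args.eval_batch_size, iters=8, args=args)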
#!/bin/bash
## DL params
export HSA_FORCE_FINE_GRAIN_PCIE=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export BATCHSIZE=16
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco'
## System config params
export DGXNGPU=8
# Set variables
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}
# run benchmark
echo "running benchmark"
PARAMS=(
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
)
# run training
torchrun --nproc_per_node="${DGXNGPU}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} 2>&1 | tee ssd_bs16_epoch6.log
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import transforms as T
class DetectionPresetTrain:
def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123., 117., 104.)):
if data_augmentation == 'hflip':
self.transforms = T.Compose([
T.RandomHorizontalFlip(p=hflip_prob),
T.ToTensor(),
])
elif data_augmentation == 'ssd':
self.transforms = T.Compose([
T.RandomPhotometricDistort(),
T.RandomZoomOut(fill=list(mean)),
T.RandomIoUCrop(),
T.RandomHorizontalFlip(p=hflip_prob),
T.ToTensor(),
])
elif data_augmentation == 'ssdlite':
self.transforms = T.Compose([
T.RandomIoUCrop(),
T.RandomHorizontalFlip(p=hflip_prob),
T.ToTensor(),
])
else:
raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"')
def __call__(self, img, target):
return self.transforms(img, target)
class DetectionPresetEval:
def __init__(self):
self.transforms = T.ToTensor()
def __call__(self, img, target):
return self.transforms(img, target)
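# Illustrative usage for the presets above (a sketch, not part of the original file;
# `img` is assumed to be a PIL image and `target` a dict with 'boxes'/'labels' tensors):
# >>> train_tf = DetectionPresetTrain(data_augmentation='ssd')
# >>> img, target = train_tf(img, target)
# >>> eval_tf = DetectionPresetEval()
# >>> img, target = eval_tf(img, target)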
Cython>=0.29.32
scikit-image>=0.19.3
ujson>=5.5.0
pybind11>=2.10.0
git+https://github.com/NVIDIA/mlperf-common.git
git+https://github.com/mlcommons/logging.git@2.1.0-rc1
pyparsing>=3.0.9
#!/bin/bash
#SBATCH --job-name single_stage_detector
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -euxo pipefail
# Vars without defaults
: "${DGXSYSTEM:?DGXSYSTEM not set}"
: "${CONT:?CONT not set}"
# Vars with defaults
: "${MLPERF_RULESET:=2.1.0}"
: "${NEXP:=5}"
: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}"
: "${CLEAR_CACHES:=1}"
: "${WORK_DIR:=/workspace/ssd}"
: "${CONT_NAME:=single_stage_detector}"
# ci automagically sets this correctly on Selene
: "${LOGDIR:=./results}"
# Scaleout bridge
: "${NVTX_FLAG:=0}"
: "${TIME_TAGS:=0}"
: "${NCCL_TEST:=0}"
: "${SYNTH_DATA:=0}"
: "${EPOCH_PROF:=0}"
: "${DISABLE_CG:=0}"
# API Logging defaults
: "${API_LOGGING:=0}"
: "${API_LOG_DIR:=./api_logs}" # apiLog.sh output dir
LOGBASE="${DATESTAMP}"
SPREFIX="single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}"
if [ ${TIME_TAGS} -gt 0 ]; then
LOGBASE="${SPREFIX}_mllog"
fi
if [ ${NVTX_FLAG} -gt 0 ]; then
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_nsys"
else
LOGBASE="${SPREFIX}_nsys"
fi
if [[ ! -d "${NVMLPERF_NSIGHT_LOCATION}" ]]; then
echo "$NVMLPERF_NSIGHT_LOCATION doesn't exist on this system!" 1>&2
exit 1
fi
fi
if [ ${SYNTH_DATA} -gt 0 ]; then
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_synth"
else
LOGBASE="${SPREFIX}_synth"
fi
fi
if [ ${EPOCH_PROF} -gt 0 ]; then
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_epoch"
else
LOGBASE="${SPREFIX}_epoch"
fi
fi
if [ ${DISABLE_CG} -gt 0 ]; then
EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--cuda-graphs//')
if [[ "$LOGBASE" == *'_'* ]];then
LOGBASE="${LOGBASE}_nocg"
else
LOGBASE="${SPREFIX}_nocg"
fi
fi
# do we need to fetch the data from lustre to /raid/scratch?
if [[ "${LOCALDISK_FROM_SQUASHFS:-}" ]]; then
# LOCALDISK_FROM_SQUASHFS should be the path/name of a squashfs file on /lustre
echo "fetching ${LOCALDISK_FROM_SQUASHFS}"
dd bs=4M if="${LOCALDISK_FROM_SQUASHFS}" of=/raid/scratch/tmp.sqsh oflag=direct
echo "unsquashing /raid/scratch/tmp.sqsh"
time unsquashfs -no-progress -dest /raid/scratch/local-root /raid/scratch/tmp.sqsh
fi
readonly LOG_FILE_BASE="${LOGDIR}/${LOGBASE}"
CONT_MOUNTS="${DATADIR}:/datasets/open-images-v6,${LOGDIR}:/results,${BACKBONE_DIR}:/root/.cache/torch"
if [[ "${NVTX_FLAG}" -gt 0 ]]; then
CONT_MOUNTS="${CONT_MOUNTS},${NVMLPERF_NSIGHT_LOCATION}:/nsight"
fi
# API Logging
if [ "${API_LOGGING}" -eq 1 ]; then
CONT_MOUNTS="${CONT_MOUNTS},${API_LOG_DIR}:/logs"
fi
# Setup directories
( umask 0002; mkdir -p "${LOGDIR}" )
srun --ntasks="${SLURM_JOB_NUM_NODES}" mkdir -p "${LOGDIR}"
# Setup container
echo MELLANOX_VISIBLE_DEVICES="${MELLANOX_VISIBLE_DEVICES:-}"
srun \
--ntasks="${SLURM_JOB_NUM_NODES}" \
--container-image="${CONT}" \
--container-name="${CONT_NAME}" \
true
srun -N1 -n1 --container-name="${CONT_NAME}" ibv_devinfo --list
srun -N1 -n1 --container-name="${CONT_NAME}" nvidia-smi topo -m
echo "NCCL_TEST = ${NCCL_TEST}"
if [[ ${NCCL_TEST} -eq 1 ]]; then
(srun --mpi=pmix --ntasks="$(( SLURM_JOB_NUM_NODES * DGXNGPU ))" --ntasks-per-node="${DGXNGPU}" \
--container-name="${CONT_NAME}" all_reduce_perf_mpi -b 33260119 -e 33260119 -d half -G 1 ) |& tee "${LOGDIR}/${SPREFIX}_nccl.log"
fi
# Run experiments
for _experiment_index in $(seq -w 1 "${NEXP}"); do
(
echo "Beginning trial ${_experiment_index} of ${NEXP}"
echo ":::DLPAL ${CONT} ${SLURM_JOB_ID} ${SLURM_JOB_NUM_NODES} ${SLURM_JOB_NODELIST}"
# Print system info (currently a placeholder no-op)
srun -N1 -n1 --container-name="${CONT_NAME}" python -c ""
# Clear caches
if [ "${CLEAR_CACHES}" -eq 1 ]; then
srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-name="${CONT_NAME}" python -c "
from mlperf_logger import mllogger
mllogger.event(key=mllogger.constants.CACHE_CLEAR, value=True)"
fi
# Run experiment
srun \
--ntasks="$(( SLURM_JOB_NUM_NODES * DGXNGPU ))" \
--ntasks-per-node="${DGXNGPU}" \
--container-name="${CONT_NAME}" \
--container-mounts="${CONT_MOUNTS}" \
--container-workdir=${WORK_DIR} \
./run_and_time.sh
) |& tee "${LOG_FILE_BASE}_${_experiment_index}.log"
# compliance checker
srun --ntasks=1 --nodes=1 --container-name="${CONT_NAME}" \
--container-mounts="$(realpath ${LOGDIR}):/results" \
--container-workdir="/results" \
python3 -m mlperf_logging.compliance_checker --usage training \
--ruleset "${MLPERF_RULESET}" \
--log_output "/results/compliance_${DATESTAMP}.out" \
"/results/${LOGBASE}_${_experiment_index}.log" \
|| true
done
#!/bin/bash
# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# runs benchmark and reports time to convergence
# to use the script:
# run_and_time.sh
set +x
set -e
# Only rank 0 prints (other ranks disable xtrace)
[ "${SLURM_LOCALID-0}" -ne 0 ] && set +x
# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"
# Set variables
[ "${DEBUG}" = "1" ] && set -x
LR=${LR:-0.0001}
WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
BATCHSIZE=${BATCHSIZE:-2}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-10}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/datasets/open-images-v6"}
TORCH_HOME=${TORCH_HOME:-"/torch-home"}
TIME_TAGS=${TIME_TAGS:-0}
NVTX_FLAG=${NVTX_FLAG:-0}
NCCL_TEST=${NCCL_TEST:-0}
EPOCH_PROF=${EPOCH_PROF:-0}
SYNTH_DATA=${SYNTH_DATA:-0}
DISABLE_CG=${DISABLE_CG:-0}
# run benchmark
echo "running benchmark"
if [ ${NVTX_FLAG} -gt 0 ]; then
# FIXME mfrank 2022-May-24: NSYSCMD needs to be an array, not a space-separated string
NSYSCMD=" /nsight/bin/nsys profile --capture-range cudaProfilerApi --capture-range-end stop --sample=none --cpuctxsw=none --trace=cuda,nvtx --force-overwrite true --output /results/single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}_${SLURM_PROCID}_${SYNTH_DATA}_${DISABLE_CG}.nsys-rep "
else
NSYSCMD=""
fi
if [ ${SYNTH_DATA} -gt 0 ]; then
EXTRA_PARAMS+=" --syn-dataset --cuda-graphs-syn "
EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--dali//')
fi
declare -a CMD
if [ -n "${SLURM_LOCALID-}" ]; then
# Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
CMD=( 'bindpcie' '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
else
CMD=( ${NSYSCMD} 'python' '-u' )
fi
else
# Mode 2: Single-node Docker; we've been launched with torchrun
# TODO: Replace below CMD with NSYSCMD..., but make sure NSYSCMD is an array, not a string
CMD=( "python" )
fi
if [ "$LOGGER" = "apiLog.sh" ];
then
LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}"
# TODO(ahmadki): track the apiLog.sh bug and remove the workaround
# there is a bug in apiLog.sh preventing it from collecting
# NCCL logs, the workaround is to log a single rank only
# LOCAL_RANK is set with an enroot hook for Pytorch containers
# SLURM_LOCALID is set by Slurm
# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
readonly node_rank="${SLURM_NODEID:-0}"
readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ];
then
LOGGER=$LOGGER
else
LOGGER=""
fi
fi
PARAMS=(
--lr "${LR}"
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--warmup-epochs "${WARMUP_EPOCHS}"
)
# run training
${LOGGER:-} "${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
set +x
sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"
# report result
result=$(( $end - $start ))
result_name="SINGLE_STAGE_DETECTOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"
#!/bin/bash
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -euxo pipefail
# Vars without defaults
: "${DGXSYSTEM:?DGXSYSTEM not set}"
: "${CONT:?CONT not set}"
# Vars with defaults
: "${NEXP:=5}"
: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}"
: "${CLEAR_CACHES:=1}"
: "${BACKBONE_DIR:=./torch-home}"
: "${CONT_NAME:=single_stage_detector}"
# ci automagically sets this correctly on Selene
: "${DATADIR:=/raid/datasets/openimages/open-images-v6}"
: "${LOGDIR:=$(pwd)/results}"
# Logging
LOG_BASE="ssd_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}"
readonly LOG_FILE_BASE="${LOGDIR}/${LOG_BASE}"
# Other vars
readonly _config_file="./config_${DGXSYSTEM}.sh"
# Mount points
CONT_MOUNTS=(
"--volume=${DATADIR}:/datasets/open-images-v6"
"--volume=${LOGDIR}:/results"
"--volume=${BACKBONE_DIR}:/root/.cache/torch"
)
# MLPerf vars
MLPERF_HOST_OS=$(
source /etc/os-release
source /etc/dgx-release || true
echo "${PRETTY_NAME} / ${DGX_PRETTY_NAME:-???} ${DGX_OTA_VERSION:-${DGX_SWBUILD_VERSION:-???}}"
)
export MLPERF_HOST_OS
# Setup directories
mkdir -p "${LOGDIR}"
# Get list of envvars to pass to docker
mapfile -t _config_env < <(env -i bash -c ". ${_config_file} && compgen -e" | grep -E -v '^(PWD|SHLVL)')
_config_env+=(MLPERF_HOST_OS)
mapfile -t _config_env < <(for v in "${_config_env[@]}"; do echo "--env=$v"; done)
# Cleanup container
cleanup_docker() {
docker container rm -f "${CONT_NAME}" || true
}
cleanup_docker
trap 'set -eux; cleanup_docker' EXIT
# Setup container
if [ -z "${NV_GPU-}" ]; then
readonly _docker_gpu_args="--gpus all"
else
readonly _docker_gpu_args='--gpus="'device=${NV_GPU}'" -e NVIDIA_VISIBLE_DEVICES='"${NV_GPU}"
fi
docker run ${_docker_gpu_args} --rm --init --detach \
--net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \
--ulimit=stack=67108864 --ulimit=memlock=-1 \
--name="${CONT_NAME}" "${_cont_mounts[@]}" \
"${CONT}" sleep infinity
# make sure container has time to finish initialization
sleep 30
docker exec -it "${CONT_NAME}" true
readonly TORCH_RUN="python -m torch.distributed.run --standalone --no_python"
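# --standalone sets up a single-node rendezvous; --no_python makes the launcher
# exec the given program directly (here ./run_and_time.sh) instead of prefixing
# it with the Python interpreter.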
# Run experiments
for _experiment_index in $(seq 1 "${NEXP}"); do
(
echo "Beginning trial ${_experiment_index} of ${NEXP}"
# Clear caches
if [ "${CLEAR_CACHES}" -eq 1 ]; then
sync && sudo /sbin/sysctl vm.drop_caches=3
docker exec -it "${CONT_NAME}" python -c "
from mlperf_logger import mllogger
mllogger.event(key=mllogger.constants.CACHE_CLEAR, value=True)"
fi
# Run experiment
docker exec -it "${_config_env[@]}" "${CONT_NAME}" \
${TORCH_RUN} --nproc_per_node=${DGXNGPU} ./run_and_time.sh
) |& tee "${LOG_FILE_BASE}_${_experiment_index}.log"
done