Unverified commit 08cc9a7f authored by Vasilis Vryniotis, committed by GitHub

Post-paper Detection Optimizations (#5444)

* Use frozen BN only if pre-trained.

* Add LSJ and the ability to train from scratch.

* Fixing formatter

* Adding `--opt` and `--norm-weight-decay` support in Detection.

* Fix error message

* Make ScaleJitter proportional.

* Adding more norm layers in split_normalization_params.

* Add FixedSizeCrop

* Temporary fix for fill values on PIL

* Fix the bug on fill.

* Add RandomShortestSize.

* Skip resize when an augmentation method is used.

* Multiscale in [480, 800]

* Add missing star

* Add new RetinaNet variant.

* Add tests.

* Update expected file for old retina

* Fixing tests

* Add FrozenBN to retinav2

* Fix network initialization issues

* Adding BN support in MaskRCNNHeads and FPN

* Adding support for FasterRCNNHeads

* Introduce norm_layers in backbone utils.

* Bigger RPN head + 2x rcnn v2 models.

* Adding gIoU support to retinanet

* Fix assert

* Add back nesterov momentum

* Rename and extend `FastRCNNConvFCHead` to support arbitrary FCs

* Fix linter
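
Taken together, the changes above introduce three new "v2" detector builders. A minimal construction sketch (assuming a torchvision build that includes this PR; the V2 weight enums below are still empty placeholders, so no pre-trained detection weights exist yet):

import torch
from torchvision.models.detection import (
    fasterrcnn_resnet50_fpn_v2,
    maskrcnn_resnet50_fpn_v2,
    retinanet_resnet50_fpn_v2,
)

# Built without pre-trained detection weights, i.e. the from-scratch setup
# that the LSJ/FixedSizeCrop augmentation work above targets.
for builder in (fasterrcnn_resnet50_fpn_v2, maskrcnn_resnet50_fpn_v2, retinanet_resnet50_fpn_v2):
    model = builder(num_classes=91)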
@@ -195,11 +195,14 @@ script_model_unwrapper = {
"googlenet": lambda x: x.logits,
"inception_v3": lambda x: x.logits,
"fasterrcnn_resnet50_fpn": lambda x: x[1],
"fasterrcnn_resnet50_fpn_v2": lambda x: x[1],
"fasterrcnn_mobilenet_v3_large_fpn": lambda x: x[1],
"fasterrcnn_mobilenet_v3_large_320_fpn": lambda x: x[1],
"maskrcnn_resnet50_fpn": lambda x: x[1],
"maskrcnn_resnet50_fpn_v2": lambda x: x[1],
"keypointrcnn_resnet50_fpn": lambda x: x[1],
"retinanet_resnet50_fpn": lambda x: x[1],
"retinanet_resnet50_fpn_v2": lambda x: x[1],
"ssd300_vgg16": lambda x: x[1],
"ssdlite320_mobilenet_v3_large": lambda x: x[1],
"fcos_resnet50_fpn": lambda x: x[1],
@@ -227,6 +230,7 @@ autocast_flaky_numerics = (
"fcn_resnet101",
"lraspp_mobilenet_v3_large",
"maskrcnn_resnet50_fpn",
"maskrcnn_resnet50_fpn_v2",
)
# The tests for the following quantized models are flaky possibly due to inconsistent
@@ -246,6 +250,13 @@ _model_params = {
"max_size": 224,
"input_shape": (3, 224, 224),
},
"retinanet_resnet50_fpn_v2": {
"num_classes": 20,
"score_thresh": 0.01,
"min_size": 224,
"max_size": 224,
"input_shape": (3, 224, 224),
},
"keypointrcnn_resnet50_fpn": {
"num_classes": 2,
"min_size": 224,
@@ -259,6 +270,12 @@ _model_params = {
"max_size": 224,
"input_shape": (3, 224, 224),
},
"fasterrcnn_resnet50_fpn_v2": {
"num_classes": 20,
"min_size": 224,
"max_size": 224,
"input_shape": (3, 224, 224),
},
"fcos_resnet50_fpn": {
"num_classes": 2,
"score_thresh": 0.05,
@@ -272,6 +289,12 @@ _model_params = {
"max_size": 224,
"input_shape": (3, 224, 224),
},
"maskrcnn_resnet50_fpn_v2": {
"num_classes": 10,
"min_size": 224,
"max_size": 224,
"input_shape": (3, 224, 224),
},
"fasterrcnn_mobilenet_v3_large_fpn": {
"box_score_thresh": 0.02076,
},
@@ -311,6 +334,10 @@ _model_tests_values = {
"max_trainable": 5,
"n_trn_params_per_layer": [36, 46, 65, 78, 88, 89],
},
"retinanet_resnet50_fpn_v2": {
"max_trainable": 5,
"n_trn_params_per_layer": [44, 74, 131, 170, 200, 203],
},
"keypointrcnn_resnet50_fpn": {
"max_trainable": 5,
"n_trn_params_per_layer": [48, 58, 77, 90, 100, 101],
@@ -319,10 +346,18 @@ _model_tests_values = {
"max_trainable": 5,
"n_trn_params_per_layer": [30, 40, 59, 72, 82, 83],
},
"fasterrcnn_resnet50_fpn_v2": {
"max_trainable": 5,
"n_trn_params_per_layer": [50, 80, 137, 176, 206, 209],
},
"maskrcnn_resnet50_fpn": {
"max_trainable": 5,
"n_trn_params_per_layer": [42, 52, 71, 84, 94, 95],
},
"maskrcnn_resnet50_fpn_v2": {
"max_trainable": 5,
"n_trn_params_per_layer": [66, 96, 153, 192, 222, 225],
},
"fasterrcnn_mobilenet_v3_large_fpn": {
"max_trainable": 6,
"n_trn_params_per_layer": [22, 23, 44, 70, 91, 97, 100],
......
import math
from collections import OrderedDict
-from typing import List, Tuple
+from typing import Dict, List, Optional, Tuple
import torch
from torch import Tensor, nn
-from torchvision.ops.misc import FrozenBatchNorm2d
+from torch.nn import functional as F
+from torchvision.ops import FrozenBatchNorm2d, generalized_box_iou_loss
class BalancedPositiveNegativeSampler:
@@ -507,3 +508,26 @@ def _topk_min(input: Tensor, orig_kval: int, axis: int) -> int:
axis_dim_val = torch._shape_as_tensor(input)[axis].unsqueeze(0)
min_kval = torch.min(torch.cat((torch.tensor([orig_kval], dtype=axis_dim_val.dtype), axis_dim_val), 0))
return _fake_cast_onnx(min_kval)
def _box_loss(
type: str,
box_coder: BoxCoder,
anchors_per_image: Tensor,
matched_gt_boxes_per_image: Tensor,
bbox_regression_per_image: Tensor,
cnf: Optional[Dict[str, float]] = None,
) -> Tensor:
torch._assert(type in ["l1", "smooth_l1", "giou"], f"Unsupported loss: {type}")
if type == "l1":
target_regression = box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
return F.l1_loss(bbox_regression_per_image, target_regression, reduction="sum")
elif type == "smooth_l1":
target_regression = box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
beta = cnf["beta"] if cnf is not None and "beta" in cnf else 1.0
return F.smooth_l1_loss(bbox_regression_per_image, target_regression, reduction="sum", beta=beta)
else: # giou
bbox_per_image = box_coder.decode_single(bbox_regression_per_image, anchors_per_image)
eps = cnf["eps"] if cnf is not None and "eps" in cnf else 1e-7
return generalized_box_iou_loss(bbox_per_image, matched_gt_boxes_per_image, reduction="sum", eps=eps)
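
A self-contained sketch of calling the new helper (toy tensors; `_box_loss` is private API living in torchvision.models.detection._utils):

import torch
from torchvision.models.detection import _utils as det_utils

box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
anchors = torch.tensor([[0.0, 0.0, 10.0, 10.0]])   # one anchor box, xyxy
matched_gt = torch.tensor([[1.0, 1.0, 9.0, 9.0]])  # its matched ground-truth box
deltas = torch.zeros(1, 4)                         # dummy regression output

# "l1"/"smooth_l1" encode the targets and compare deltas; "giou" decodes the
# deltas back into boxes and compares boxes directly.
l1 = det_utils._box_loss("l1", box_coder, anchors, matched_gt, deltas)
giou = det_utils._box_loss("giou", box_coder, anchors, matched_gt, deltas, cnf={"eps": 1e-7})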
@@ -25,6 +25,7 @@ class BackboneWithFPN(nn.Module):
in_channels_list (List[int]): number of channels for each feature map
that is returned, in the order they are present in the OrderedDict
out_channels (int): number of channels in the FPN.
norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
Attributes:
out_channels (int): the number of channels in the FPN
"""
@@ -36,6 +37,7 @@ class BackboneWithFPN(nn.Module):
in_channels_list: List[int],
out_channels: int,
extra_blocks: Optional[ExtraFPNBlock] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> None:
super().__init__()
@@ -47,6 +49,7 @@ class BackboneWithFPN(nn.Module):
in_channels_list=in_channels_list,
out_channels=out_channels,
extra_blocks=extra_blocks,
norm_layer=norm_layer,
)
self.out_channels = out_channels
@@ -115,6 +118,7 @@ def _resnet_fpn_extractor(
trainable_layers: int,
returned_layers: Optional[List[int]] = None,
extra_blocks: Optional[ExtraFPNBlock] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> BackboneWithFPN:
# select layers that wont be frozen
@@ -139,7 +143,9 @@
    in_channels_stage2 = backbone.inplanes // 8
    in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
    out_channels = 256
-    return BackboneWithFPN(backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
+    return BackboneWithFPN(
+        backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks, norm_layer=norm_layer
+    )
def _validate_trainable_layers(
@@ -194,6 +200,7 @@ def _mobilenet_extractor(
trainable_layers: int,
returned_layers: Optional[List[int]] = None,
extra_blocks: Optional[ExtraFPNBlock] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
) -> nn.Module:
backbone = backbone.features
# Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
@@ -222,7 +229,9 @@
        return_layers = {f"{stage_indices[k]}": str(v) for v, k in enumerate(returned_layers)}
        in_channels_list = [backbone[stage_indices[i]].out_channels for i in returned_layers]
-        return BackboneWithFPN(backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
+        return BackboneWithFPN(
+            backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks, norm_layer=norm_layer
+        )
else:
m = nn.Sequential(
backbone,
......
-from typing import Any, Optional, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
@@ -24,14 +24,22 @@ from .transform import GeneralizedRCNNTransform
__all__ = [
"FasterRCNN",
"FasterRCNN_ResNet50_FPN_Weights",
"FasterRCNN_ResNet50_FPN_V2_Weights",
"FasterRCNN_MobileNet_V3_Large_FPN_Weights",
"FasterRCNN_MobileNet_V3_Large_320_FPN_Weights",
"fasterrcnn_resnet50_fpn",
"fasterrcnn_resnet50_fpn_v2",
"fasterrcnn_mobilenet_v3_large_fpn",
"fasterrcnn_mobilenet_v3_large_320_fpn",
]
def _default_anchorgen():
anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
return AnchorGenerator(anchor_sizes, aspect_ratios)
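
As a quick sanity check on the factored-out default (a sketch):

gen = _default_anchorgen()
# one size times three aspect ratios per location, on each of the five FPN levels
assert gen.num_anchors_per_location() == [3, 3, 3, 3, 3]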
class FasterRCNN(GeneralizedRCNN):
"""
Implements Faster R-CNN.
@@ -216,9 +224,7 @@ class FasterRCNN(GeneralizedRCNN):
        out_channels = backbone.out_channels
        if rpn_anchor_generator is None:
-            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
-            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
-            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
+            rpn_anchor_generator = _default_anchorgen()
if rpn_head is None:
rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])
@@ -298,6 +304,43 @@ class TwoMLPHead(nn.Module):
return x
class FastRCNNConvFCHead(nn.Sequential):
def __init__(
self,
input_size: Tuple[int, int, int],
conv_layers: List[int],
fc_layers: List[int],
norm_layer: Optional[Callable[..., nn.Module]] = None,
):
"""
Args:
input_size (Tuple[int, int, int]): the input size in CHW format.
conv_layers (list): feature dimensions of each Convolution layer
fc_layers (list): feature dimensions of each FCN layer
norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
"""
in_channels, in_height, in_width = input_size
blocks = []
previous_channels = in_channels
for current_channels in conv_layers:
blocks.append(misc_nn_ops.Conv2dNormActivation(previous_channels, current_channels, norm_layer=norm_layer))
previous_channels = current_channels
blocks.append(nn.Flatten())
previous_channels = previous_channels * in_height * in_width
for current_channels in fc_layers:
blocks.append(nn.Linear(previous_channels, current_channels))
blocks.append(nn.ReLU(inplace=True))
previous_channels = current_channels
super().__init__(*blocks)
for layer in self.modules():
if isinstance(layer, nn.Conv2d):
nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
if layer.bias is not None:
nn.init.zeros_(layer.bias)
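
A shape sketch for the new head; the (256, 7, 7) input matches the FPN RoI pooling that the v2 builders below use:

import torch
from torch import nn

head = FastRCNNConvFCHead((256, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d)
x = torch.rand(2, 256, 7, 7)       # a batch of pooled RoI features
assert head(x).shape == (2, 1024)  # four 3x3 convs, flatten, one FC to 1024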
class FastRCNNPredictor(nn.Module):
"""
Standard classification + bounding box regression layers
@@ -349,6 +392,10 @@ class FasterRCNN_ResNet50_FPN_Weights(WeightsEnum):
DEFAULT = COCO_V1
class FasterRCNN_ResNet50_FPN_V2_Weights(WeightsEnum):
pass
class FasterRCNN_MobileNet_V3_Large_FPN_Weights(WeightsEnum):
COCO_V1 = Weights(
url="https://download.pytorch.org/models/fasterrcnn_mobilenet_v3_large_fpn-fb6a3cc7.pth",
@@ -481,6 +528,66 @@ def fasterrcnn_resnet50_fpn(
return model
def fasterrcnn_resnet50_fpn_v2(
*,
weights: Optional[FasterRCNN_ResNet50_FPN_V2_Weights] = None,
progress: bool = True,
num_classes: Optional[int] = None,
weights_backbone: Optional[ResNet50_Weights] = None,
trainable_backbone_layers: Optional[int] = None,
**kwargs: Any,
) -> FasterRCNN:
"""
Constructs an improved Faster R-CNN model with a ResNet-50-FPN backbone.
Reference: `"Benchmarking Detection Transfer Learning with Vision Transformers"
<https://arxiv.org/abs/2111.11429>`_.
    See :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more details.
Args:
weights (FasterRCNN_ResNet50_FPN_V2_Weights, optional): The pretrained weights for the model
progress (bool): If True, displays a progress bar of the download to stderr
num_classes (int, optional): number of output classes of the model (including the background)
weights_backbone (ResNet50_Weights, optional): The pretrained weights for the backbone
trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
passed (the default) this value is set to 3.
"""
weights = FasterRCNN_ResNet50_FPN_V2_Weights.verify(weights)
weights_backbone = ResNet50_Weights.verify(weights_backbone)
if weights is not None:
weights_backbone = None
num_classes = _ovewrite_value_param(num_classes, len(weights.meta["categories"]))
elif num_classes is None:
num_classes = 91
is_trained = weights is not None or weights_backbone is not None
trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
backbone = resnet50(weights=weights_backbone, progress=progress)
backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers, norm_layer=nn.BatchNorm2d)
rpn_anchor_generator = _default_anchorgen()
rpn_head = RPNHead(backbone.out_channels, rpn_anchor_generator.num_anchors_per_location()[0], conv_depth=2)
box_head = FastRCNNConvFCHead(
(backbone.out_channels, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d
)
model = FasterRCNN(
backbone,
num_classes=num_classes,
rpn_anchor_generator=rpn_anchor_generator,
rpn_head=rpn_head,
box_head=box_head,
**kwargs,
)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
return model
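
A hedged training sketch for the builder above (dummy data, no pre-trained weights):

import torch

model = fasterrcnn_resnet50_fpn_v2(num_classes=91)
model.train()
images = [torch.rand(3, 512, 512)]
targets = [{"boxes": torch.tensor([[10.0, 20.0, 200.0, 220.0]]), "labels": torch.tensor([1])}]
loss_dict = model(images, targets)  # classifier, box_reg, objectness and rpn_box_reg losses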
def _fasterrcnn_mobilenet_v3_large_fpn(
*,
weights: Optional[Union[FasterRCNN_MobileNet_V3_Large_FPN_Weights, FasterRCNN_MobileNet_V3_Large_320_FPN_Weights]],
......
from collections import OrderedDict
-from typing import Any, Optional
+from typing import Any, Callable, Optional
from torch import nn
from torchvision.ops import MultiScaleRoIAlign
@@ -12,13 +12,15 @@ from .._utils import handle_legacy_interface, _ovewrite_value_param
from ..resnet import ResNet50_Weights, resnet50
from ._utils import overwrite_eps
from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
-from .faster_rcnn import FasterRCNN
+from .faster_rcnn import FasterRCNN, FastRCNNConvFCHead, RPNHead, _default_anchorgen
__all__ = [
"MaskRCNN",
"MaskRCNN_ResNet50_FPN_Weights",
"MaskRCNN_ResNet50_FPN_V2_Weights",
"maskrcnn_resnet50_fpn",
"maskrcnn_resnet50_fpn_v2",
]
@@ -264,28 +266,68 @@ class MaskRCNN(FasterRCNN):
class MaskRCNNHeads(nn.Sequential):
-    def __init__(self, in_channels, layers, dilation):
+    _version = 2
+
+    def __init__(self, in_channels, layers, dilation, norm_layer: Optional[Callable[..., nn.Module]] = None):
        """
        Args:
            in_channels (int): number of input channels
            layers (list): feature dimensions of each FCN layer
            dilation (int): dilation rate of kernel
+            norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
        """
-        d = OrderedDict()
+        blocks = []
        next_feature = in_channels
-        for layer_idx, layer_features in enumerate(layers, 1):
-            d[f"mask_fcn{layer_idx}"] = nn.Conv2d(
-                next_feature, layer_features, kernel_size=3, stride=1, padding=dilation, dilation=dilation
-            )
-            d[f"relu{layer_idx}"] = nn.ReLU(inplace=True)
+        for layer_features in layers:
+            blocks.append(
+                misc_nn_ops.Conv2dNormActivation(
+                    next_feature,
+                    layer_features,
+                    kernel_size=3,
+                    stride=1,
+                    padding=dilation,
+                    dilation=dilation,
+                    norm_layer=norm_layer,
+                )
+            )
            next_feature = layer_features
-        super().__init__(d)
-        for name, param in self.named_parameters():
-            if "weight" in name:
-                nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
-            # elif "bias" in name:
-            #     nn.init.constant_(param, 0)
+        super().__init__(*blocks)
+        for layer in self.modules():
+            if isinstance(layer, nn.Conv2d):
+                nn.init.kaiming_normal_(layer.weight, mode="fan_out", nonlinearity="relu")
+                if layer.bias is not None:
+                    nn.init.zeros_(layer.bias)
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if version is None or version < 2:
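            # Example with a hypothetical prefix: the v1 key "mask_head.mask_fcn1.weight"
            # becomes "mask_head.0.0.weight" (block i=0, conv at index 0 inside its
            # Conv2dNormActivation).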
num_blocks = len(self)
for i in range(num_blocks):
for type in ["weight", "bias"]:
old_key = f"{prefix}mask_fcn{i+1}.{type}"
new_key = f"{prefix}{i}.0.{type}"
state_dict[new_key] = state_dict.pop(old_key)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
class MaskRCNNPredictor(nn.Sequential):
@@ -326,6 +368,10 @@ class MaskRCNN_ResNet50_FPN_Weights(WeightsEnum):
DEFAULT = COCO_V1
class MaskRCNN_ResNet50_FPN_V2_Weights(WeightsEnum):
pass
@handle_legacy_interface(
weights=("pretrained", MaskRCNN_ResNet50_FPN_Weights.COCO_V1),
weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
@@ -418,3 +464,65 @@ def maskrcnn_resnet50_fpn(
overwrite_eps(model, 0.0)
return model
def maskrcnn_resnet50_fpn_v2(
*,
weights: Optional[MaskRCNN_ResNet50_FPN_V2_Weights] = None,
progress: bool = True,
num_classes: Optional[int] = None,
weights_backbone: Optional[ResNet50_Weights] = None,
trainable_backbone_layers: Optional[int] = None,
**kwargs: Any,
) -> MaskRCNN:
"""
    Constructs an improved Mask R-CNN model with a ResNet-50-FPN backbone.
    Reference: `"Benchmarking Detection Transfer Learning with Vision Transformers"
    <https://arxiv.org/abs/2111.11429>`_.
    See :func:`~torchvision.models.detection.maskrcnn_resnet50_fpn` for more details.
Args:
weights (MaskRCNN_ResNet50_FPN_V2_Weights, optional): The pretrained weights for the model
progress (bool): If True, displays a progress bar of the download to stderr
num_classes (int, optional): number of output classes of the model (including the background)
weights_backbone (ResNet50_Weights, optional): The pretrained weights for the backbone
trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
passed (the default) this value is set to 3.
"""
weights = MaskRCNN_ResNet50_FPN_V2_Weights.verify(weights)
weights_backbone = ResNet50_Weights.verify(weights_backbone)
if weights is not None:
weights_backbone = None
num_classes = _ovewrite_value_param(num_classes, len(weights.meta["categories"]))
elif num_classes is None:
num_classes = 91
is_trained = weights is not None or weights_backbone is not None
trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
backbone = resnet50(weights=weights_backbone, progress=progress)
backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers, norm_layer=nn.BatchNorm2d)
rpn_anchor_generator = _default_anchorgen()
rpn_head = RPNHead(backbone.out_channels, rpn_anchor_generator.num_anchors_per_location()[0], conv_depth=2)
box_head = FastRCNNConvFCHead(
(backbone.out_channels, 7, 7), [256, 256, 256, 256], [1024], norm_layer=nn.BatchNorm2d
)
mask_head = MaskRCNNHeads(backbone.out_channels, [256, 256, 256, 256], 1, norm_layer=nn.BatchNorm2d)
model = MaskRCNN(
backbone,
num_classes=num_classes,
rpn_anchor_generator=rpn_anchor_generator,
rpn_head=rpn_head,
box_head=box_head,
mask_head=mask_head,
**kwargs,
)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
return model
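
And a hedged training sketch for it (dummy instance masks included):

import torch

model = maskrcnn_resnet50_fpn_v2(num_classes=91)
model.train()
images = [torch.rand(3, 512, 512)]
targets = [
    {
        "boxes": torch.tensor([[10.0, 20.0, 200.0, 220.0]]),
        "labels": torch.tensor([1]),
        "masks": torch.zeros(1, 512, 512, dtype=torch.uint8),  # one binary instance mask
    }
]
loss_dict = model(images, targets)  # adds loss_mask on top of the Faster R-CNN losses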
import math
import warnings
from collections import OrderedDict
-from typing import Any, Dict, List, Tuple, Optional
+from functools import partial
+from typing import Any, Callable, Dict, List, Tuple, Optional
import torch
from torch import nn, Tensor
@@ -17,7 +18,7 @@ from .._meta import _COCO_CATEGORIES
from .._utils import handle_legacy_interface, _ovewrite_value_param
from ..resnet import ResNet50_Weights, resnet50
from . import _utils as det_utils
-from ._utils import overwrite_eps
+from ._utils import overwrite_eps, _box_loss
from .anchor_utils import AnchorGenerator
from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
from .transform import GeneralizedRCNNTransform
@@ -26,7 +27,9 @@ from .transform import GeneralizedRCNNTransform
__all__ = [
"RetinaNet",
"RetinaNet_ResNet50_FPN_Weights",
"RetinaNet_ResNet50_FPN_V2_Weights",
"retinanet_resnet50_fpn",
"retinanet_resnet50_fpn_v2",
]
@@ -37,6 +40,21 @@ def _sum(x: List[Tensor]) -> Tensor:
return res
def _v1_to_v2_weights(state_dict, prefix):
for i in range(4):
for type in ["weight", "bias"]:
old_key = f"{prefix}conv.{2*i}.{type}"
new_key = f"{prefix}conv.{i}.0.{type}"
state_dict[new_key] = state_dict.pop(old_key)
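
To illustrate the remapping on a toy state dict (hypothetical prefix; the old heads kept convs at even indices of a flat nn.Sequential, while the new Conv2dNormActivation blocks nest the conv at index 0):

sd = {f"head.conv.{2 * i}.{t}": None for i in range(4) for t in ["weight", "bias"]}
_v1_to_v2_weights(sd, "head.")
assert all(f"head.conv.{i}.0.weight" in sd for i in range(4))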
def _default_anchorgen():
anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3))) for x in [32, 64, 128, 256, 512])
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
return anchor_generator
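
A quick check of the default anchors (a sketch): three octave-spaced sizes per level, times three aspect ratios:

gen = _default_anchorgen()
assert gen.sizes[0] == (32, 40, 50)  # int(32 * 2 ** (1 / 3)) == 40, int(32 * 2 ** (2 / 3)) == 50
assert gen.num_anchors_per_location() == [9, 9, 9, 9, 9]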
class RetinaNetHead(nn.Module):
"""
A regression and classification head for use in RetinaNet.
@@ -45,12 +63,15 @@ class RetinaNetHead(nn.Module):
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
num_classes (int): number of classes to be predicted
norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
"""
-    def __init__(self, in_channels, num_anchors, num_classes):
+    def __init__(self, in_channels, num_anchors, num_classes, norm_layer: Optional[Callable[..., nn.Module]] = None):
        super().__init__()
-        self.classification_head = RetinaNetClassificationHead(in_channels, num_anchors, num_classes)
-        self.regression_head = RetinaNetRegressionHead(in_channels, num_anchors)
+        self.classification_head = RetinaNetClassificationHead(
+            in_channels, num_anchors, num_classes, norm_layer=norm_layer
+        )
+        self.regression_head = RetinaNetRegressionHead(in_channels, num_anchors, norm_layer=norm_layer)
def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Dict[str, Tensor]
@@ -72,20 +93,30 @@ class RetinaNetClassificationHead(nn.Module):
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
num_classes (int): number of classes to be predicted
norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
"""
-    def __init__(self, in_channels, num_anchors, num_classes, prior_probability=0.01):
+    _version = 2
+
+    def __init__(
+        self,
+        in_channels,
+        num_anchors,
+        num_classes,
+        prior_probability=0.01,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+    ):
super().__init__()
conv = []
for _ in range(4):
-            conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
-            conv.append(nn.ReLU())
+            conv.append(misc_nn_ops.Conv2dNormActivation(in_channels, in_channels, norm_layer=norm_layer))
self.conv = nn.Sequential(*conv)
-        for layer in self.conv.children():
+        for layer in self.conv.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, std=0.01)
-                torch.nn.init.constant_(layer.bias, 0)
+                if layer.bias is not None:
+                    torch.nn.init.constant_(layer.bias, 0)
self.cls_logits = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
@@ -100,6 +131,31 @@ class RetinaNetClassificationHead(nn.Module):
# https://github.com/pytorch/vision/pull/1697#issuecomment-630255584
self.BETWEEN_THRESHOLDS = det_utils.Matcher.BETWEEN_THRESHOLDS
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if version is None or version < 2:
_v1_to_v2_weights(state_dict, prefix)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
def compute_loss(self, targets, head_outputs, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor]) -> Tensor
losses = []
@@ -159,31 +215,60 @@ class RetinaNetRegressionHead(nn.Module):
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
"""
_version = 2
__annotations__ = {
"box_coder": det_utils.BoxCoder,
}
-    def __init__(self, in_channels, num_anchors):
+    def __init__(self, in_channels, num_anchors, norm_layer: Optional[Callable[..., nn.Module]] = None):
super().__init__()
conv = []
for _ in range(4):
-            conv.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
-            conv.append(nn.ReLU())
+            conv.append(misc_nn_ops.Conv2dNormActivation(in_channels, in_channels, norm_layer=norm_layer))
self.conv = nn.Sequential(*conv)
self.bbox_reg = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
torch.nn.init.normal_(self.bbox_reg.weight, std=0.01)
torch.nn.init.zeros_(self.bbox_reg.bias)
-        for layer in self.conv.children():
+        for layer in self.conv.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, std=0.01)
-                torch.nn.init.zeros_(layer.bias)
+                if layer.bias is not None:
+                    torch.nn.init.zeros_(layer.bias)
self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
self._loss_type = "l1"
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if version is None or version < 2:
_v1_to_v2_weights(state_dict, prefix)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
def compute_loss(self, targets, head_outputs, anchors, matched_idxs):
# type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Tensor
@@ -203,12 +288,15 @@ class RetinaNetRegressionHead(nn.Module):
bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :]
anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
-                # compute the regression targets
-                target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
                # compute the loss
                losses.append(
-                    torch.nn.functional.l1_loss(bbox_regression_per_image, target_regression, reduction="sum")
+                    _box_loss(
+                        self._loss_type,
+                        self.box_coder,
+                        anchors_per_image,
+                        matched_gt_boxes_per_image,
+                        bbox_regression_per_image,
+                    )
/ max(1, num_foreground)
)
@@ -361,9 +449,7 @@ class RetinaNet(nn.Module):
)
if anchor_generator is None:
-            anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3))) for x in [32, 64, 128, 256, 512])
-            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
-            anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
+            anchor_generator = _default_anchorgen()
self.anchor_generator = anchor_generator
if head is None:
@@ -604,6 +690,10 @@ class RetinaNet_ResNet50_FPN_Weights(WeightsEnum):
DEFAULT = COCO_V1
class RetinaNet_ResNet50_FPN_V2_Weights(WeightsEnum):
pass
@handle_legacy_interface(
weights=("pretrained", RetinaNet_ResNet50_FPN_Weights.COCO_V1),
weights_backbone=("pretrained_backbone", ResNet50_Weights.IMAGENET1K_V1),
@@ -690,3 +780,61 @@ def retinanet_resnet50_fpn(
overwrite_eps(model, 0.0)
return model
def retinanet_resnet50_fpn_v2(
*,
weights: Optional[RetinaNet_ResNet50_FPN_V2_Weights] = None,
progress: bool = True,
num_classes: Optional[int] = None,
weights_backbone: Optional[ResNet50_Weights] = None,
trainable_backbone_layers: Optional[int] = None,
**kwargs: Any,
) -> RetinaNet:
"""
Constructs an improved RetinaNet model with a ResNet-50-FPN backbone.
Reference: `"Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection"
<https://arxiv.org/abs/1912.02424>`_.
    See :func:`~torchvision.models.detection.retinanet_resnet50_fpn` for more details.
Args:
weights (RetinaNet_ResNet50_FPN_V2_Weights, optional): The pretrained weights for the model
progress (bool): If True, displays a progress bar of the download to stderr
num_classes (int, optional): number of output classes of the model (including the background)
weights_backbone (ResNet50_Weights, optional): The pretrained weights for the backbone
trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
passed (the default) this value is set to 3.
"""
weights = RetinaNet_ResNet50_FPN_V2_Weights.verify(weights)
weights_backbone = ResNet50_Weights.verify(weights_backbone)
if weights is not None:
weights_backbone = None
num_classes = _ovewrite_value_param(num_classes, len(weights.meta["categories"]))
elif num_classes is None:
num_classes = 91
is_trained = weights is not None or weights_backbone is not None
trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
backbone = resnet50(weights=weights_backbone, progress=progress)
backbone = _resnet_fpn_extractor(
backbone, trainable_backbone_layers, returned_layers=[2, 3, 4], extra_blocks=LastLevelP6P7(2048, 256)
)
anchor_generator = _default_anchorgen()
head = RetinaNetHead(
backbone.out_channels,
anchor_generator.num_anchors_per_location()[0],
num_classes,
norm_layer=partial(nn.GroupNorm, 32),
)
head.regression_head._loss_type = "giou"
model = RetinaNet(backbone, num_classes, anchor_generator=anchor_generator, head=head, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
return model
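
A hedged inference sketch for the new variant:

import torch

model = retinanet_resnet50_fpn_v2(num_classes=91)
model.eval()
with torch.no_grad():
    preds = model([torch.rand(3, 480, 640)])
# per-image dicts with "boxes" (N, 4), "scores" (N,) and "labels" (N,)
print(preds[0]["boxes"].shape)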
@@ -3,6 +3,7 @@ from typing import List, Optional, Dict, Tuple
import torch
from torch import nn, Tensor
from torch.nn import functional as F
from torchvision.ops import Conv2dNormActivation
from torchvision.ops import boxes as box_ops
from . import _utils as det_utils
@@ -19,23 +20,59 @@ class RPNHead(nn.Module):
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
conv_depth (int, optional): number of convolutions
"""
-    def __init__(self, in_channels: int, num_anchors: int) -> None:
+    _version = 2
+
+    def __init__(self, in_channels: int, num_anchors: int, conv_depth=1) -> None:
        super().__init__()
-        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+        convs = []
+        for _ in range(conv_depth):
+            convs.append(Conv2dNormActivation(in_channels, in_channels, kernel_size=3, norm_layer=None))
+        self.conv = nn.Sequential(*convs)
self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
-        for layer in self.children():
+        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, std=0.01)  # type: ignore[arg-type]
-                torch.nn.init.constant_(layer.bias, 0)  # type: ignore[arg-type]
+                if layer.bias is not None:
+                    torch.nn.init.constant_(layer.bias, 0)  # type: ignore[arg-type]
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if version is None or version < 2:
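            # Example with a hypothetical prefix: the v1 key "head.conv.weight" becomes
            # "head.conv.0.0.weight" (first Conv2dNormActivation block, conv at index 0).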
for type in ["weight", "bias"]:
old_key = f"{prefix}conv.{type}"
new_key = f"{prefix}conv.0.0.{type}"
state_dict[new_key] = state_dict.pop(old_key)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
def forward(self, x: List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]:
logits = []
bbox_reg = []
for feature in x:
-            t = F.relu(self.conv(feature))
+            t = self.conv(feature)
logits.append(self.cls_logits(t))
bbox_reg.append(self.bbox_pred(t))
return logits, bbox_reg
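
A shape sketch for the reworked head; conv_depth=2 is what the v2 builders pass:

import torch

head = RPNHead(in_channels=256, num_anchors=3, conv_depth=2)
logits, deltas = head([torch.rand(1, 256, 32, 32)])  # a single FPN level
assert logits[0].shape == (1, 3, 32, 32)             # one objectness score per anchor
assert deltas[0].shape == (1, 12, 32, 32)            # four box deltas per anchor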
......
from collections import OrderedDict
-from typing import Tuple, List, Dict, Optional
+from typing import Tuple, List, Dict, Callable, Optional
import torch.nn.functional as F
from torch import nn, Tensor
from ..ops.misc import Conv2dNormActivation
from ..utils import _log_api_usage_once
@@ -51,6 +52,7 @@ class FeaturePyramidNetwork(nn.Module):
be performed. It is expected to take the fpn features, the original
features and the names of the original features as input, and returns
a new list of feature maps and their corresponding names
norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
Examples::
@@ -70,11 +72,14 @@ class FeaturePyramidNetwork(nn.Module):
"""
_version = 2
def __init__(
self,
in_channels_list: List[int],
out_channels: int,
extra_blocks: Optional[ExtraFPNBlock] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None,
):
super().__init__()
_log_api_usage_once(self)
@@ -83,8 +88,12 @@ class FeaturePyramidNetwork(nn.Module):
for in_channels in in_channels_list:
if in_channels == 0:
raise ValueError("in_channels=0 is currently not supported")
-            inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
-            layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
+            inner_block_module = Conv2dNormActivation(
+                in_channels, out_channels, kernel_size=1, padding=0, norm_layer=norm_layer, activation_layer=None
+            )
+            layer_block_module = Conv2dNormActivation(
+                out_channels, out_channels, kernel_size=3, norm_layer=norm_layer, activation_layer=None
+            )
self.inner_blocks.append(inner_block_module)
self.layer_blocks.append(layer_block_module)
@@ -92,6 +101,7 @@ class FeaturePyramidNetwork(nn.Module):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight, a=1)
-                nn.init.constant_(m.bias, 0)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
if extra_blocks is not None:
@@ -99,6 +109,37 @@ class FeaturePyramidNetwork(nn.Module):
raise TypeError(f"extra_blocks should be of type ExtraFPNBlock not {type(extra_blocks)}")
self.extra_blocks = extra_blocks
def _load_from_state_dict(
self,
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
version = local_metadata.get("version", None)
if version is None or version < 2:
num_blocks = len(self.inner_blocks)
for block in ["inner_blocks", "layer_blocks"]:
for i in range(num_blocks):
for type in ["weight", "bias"]:
old_key = f"{prefix}{block}.{i}.{type}"
new_key = f"{prefix}{block}.{i}.0.{type}"
state_dict[new_key] = state_dict.pop(old_key)
super()._load_from_state_dict(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
)
def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
"""
This is equivalent to self.inner_blocks[idx](x),
......
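
Finally, a hedged sketch of the new norm_layer hook on the FPN itself (GroupNorm is an arbitrary choice here, purely for illustration):

from collections import OrderedDict
from functools import partial

import torch
from torch import nn
from torchvision.ops import FeaturePyramidNetwork

fpn = FeaturePyramidNetwork([256, 512], out_channels=256, norm_layer=partial(nn.GroupNorm, 32))
x = OrderedDict([("c4", torch.rand(1, 256, 32, 32)), ("c5", torch.rand(1, 512, 16, 16))])
out = fpn(x)  # same keys out, every map with 256 channels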