Unverified Commit e35793a1 authored by Vasilis Vryniotis, committed by GitHub

Cherrypicking cleanups for SSD and SSDlite. (#3818)

parent 6374cff2
@@ -426,8 +426,8 @@ Faster R-CNN ResNet-50 FPN 37.0 - -
 Faster R-CNN MobileNetV3-Large FPN      32.8     -         -
 Faster R-CNN MobileNetV3-Large 320 FPN  22.8     -         -
 RetinaNet ResNet-50 FPN                 36.4     -         -
-SSD VGG16                               25.1     -         -
-SSDlite MobileNetV3-Large               21.3     -         -
+SSD300 VGG16                            25.1     -         -
+SSDlite320 MobileNetV3-Large            21.3     -         -
 Mask R-CNN ResNet-50 FPN                37.9     34.6      -
 ====================================== ======= ======== ===========
@@ -486,8 +486,8 @@ Faster R-CNN ResNet-50 FPN 0.2288 0.0590
 Faster R-CNN MobileNetV3-Large FPN      0.1020              0.0415             1.0
 Faster R-CNN MobileNetV3-Large 320 FPN  0.0978              0.0376             0.6
 RetinaNet ResNet-50 FPN                 0.2514              0.0939             4.1
-SSD VGG16                               0.2093              0.0744             1.5
-SSDlite MobileNetV3-Large               0.1773              0.0906             1.5
+SSD300 VGG16                            0.2093              0.0744             1.5
+SSDlite320 MobileNetV3-Large            0.1773              0.0906             1.5
 Mask R-CNN ResNet-50 FPN                0.2728              0.0903             5.4
 Keypoint R-CNN ResNet-50 FPN            0.3789              0.1242             6.8
 ====================================== =================== ================== ===========
@@ -502,19 +502,19 @@ Faster R-CNN
 RetinaNet
------------
+---------
 
 .. autofunction:: torchvision.models.detection.retinanet_resnet50_fpn
 
 
 SSD
------------
+---
 
 .. autofunction:: torchvision.models.detection.ssd300_vgg16
 
 
 SSDlite
------------
+-------
 
 .. autofunction:: torchvision.models.detection.ssdlite320_mobilenet_v3_large
...
@@ -48,7 +48,7 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
     --lr-steps 16 22 --aspect-ratio-group-factor 3 --lr 0.01
 ```
 
-### SSD VGG16
+### SSD300 VGG16
 ```
 python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
     --dataset coco --model ssd300_vgg16 --epochs 120\
@@ -56,7 +56,7 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
     --weight-decay 0.0005 --data-augmentation ssd
 ```
 
-### SSDlite MobileNetV3-Large
+### SSDlite320 MobileNetV3-Large
 ```
 python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
     --dataset coco --model ssdlite320_mobilenet_v3_large --epochs 660\
...
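For context only (not part of the diff): the two recipes above train the models exposed as `ssd300_vgg16` and `ssdlite320_mobilenet_v3_large`. The following is a minimal inference sketch, assuming the released COCO weights are available for download; it only illustrates the expected input/output format of these detectors.
```
import torch
import torchvision

# Load the two detectors touched by this PR with their COCO-pretrained weights.
ssd = torchvision.models.detection.ssd300_vgg16(pretrained=True).eval()
ssdlite = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True).eval()

# Detection models take a list of CHW float tensors in [0, 1]; the internal
# transform resizes them to the fixed 300x300 / 320x320 input sizes.
images = [torch.rand(3, 480, 640)]

with torch.no_grad():
    for name, model in [("ssd300_vgg16", ssd), ("ssdlite320_mobilenet_v3_large", ssdlite)]:
        preds = model(images)  # list of dicts with 'boxes', 'labels', 'scores'
        print(name, preds[0]["boxes"].shape)
```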
@@ -410,7 +410,7 @@ class SSD(nn.Module):
 
 
 class SSDFeatureExtractorVGG(nn.Module):
-    def __init__(self, backbone: nn.Module, highres: bool, rescaling: bool):
+    def __init__(self, backbone: nn.Module, highres: bool):
         super().__init__()
         _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d))
 
@@ -476,13 +476,8 @@ class SSDFeatureExtractorVGG(nn.Module):
             fc,
         ))
         self.extra = extra
-        self.rescaling = rescaling
 
     def forward(self, x: Tensor) -> Dict[str, Tensor]:
-        # Undo the 0-1 scaling of toTensor. Necessary for some backbones.
-        if self.rescaling:
-            x *= 255
-
         # L2 regularization + Rescaling of 1st block's feature map
         x = self.features(x)
         rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x)
@@ -496,8 +491,7 @@ class SSDFeatureExtractorVGG(nn.Module):
         return OrderedDict([(str(i), v) for i, v in enumerate(output)])
 
 
-def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained: bool, trainable_layers: int,
-                   rescaling: bool):
+def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained: bool, trainable_layers: int):
    if backbone_name in backbone_urls:
         # Use custom backbones more appropriate for SSD
         arch = backbone_name.split('_')[0]
@@ -521,19 +515,19 @@ def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained
         for parameter in b.parameters():
             parameter.requires_grad_(False)
 
-    return SSDFeatureExtractorVGG(backbone, highres, rescaling)
+    return SSDFeatureExtractorVGG(backbone, highres)
 
 
 def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91,
                  pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any):
     """
-    Constructs an SSD model with a VGG16 backbone. See `SSD` for more details.
+    Constructs an SSD model with input size 300x300 and a VGG16 backbone. See `SSD` for more details.
 
     Example:
 
         >>> model = torchvision.models.detection.ssd300_vgg16(pretrained=True)
         >>> model.eval()
-        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
+        >>> x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
         >>> predictions = model(x)
 
     Args:
@@ -544,6 +538,9 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i
         trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from final block.
             Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
     """
+    if "size" in kwargs:
+        warnings.warn("The size of the model is already fixed; ignoring the argument.")
+
     trainable_backbone_layers = _validate_trainable_layers(
         pretrained or pretrained_backbone, trainable_backbone_layers, 5, 5)
@@ -551,12 +548,18 @@ def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: i
         # no need to download the backbone if pretrained is set
         pretrained_backbone = False
 
-    backbone = _vgg_extractor("vgg16_features", False, progress, pretrained_backbone, trainable_backbone_layers, True)
+    backbone = _vgg_extractor("vgg16_features", False, progress, pretrained_backbone, trainable_backbone_layers)
     anchor_generator = DefaultBoxGenerator([[2], [2, 3], [2, 3], [2, 3], [2], [2]],
                                            scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05],
                                            steps=[8, 16, 32, 64, 100, 300])
-    model = SSD(backbone, anchor_generator, (300, 300), num_classes,
-                image_mean=[0.48235, 0.45882, 0.40784], image_std=[1., 1., 1.], **kwargs)
+
+    defaults = {
+        # Rescale the input in a way compatible to the backbone
+        "image_mean": [0.48235, 0.45882, 0.40784],
+        "image_std": [1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0],  # undo the 0-1 scaling of toTensor
+    }
+    kwargs = {**defaults, **kwargs}
+    model = SSD(backbone, anchor_generator, (300, 300), num_classes, **kwargs)
     if pretrained:
         weights_name = 'ssd300_vgg16_coco'
         if model_urls.get(weights_name, None) is None:
...
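The normalization change above is worth spelling out. Instead of keeping a `rescaling` flag and multiplying the input by 255 inside `SSDFeatureExtractorVGG.forward`, the PR folds that factor into the transform's `image_std` of 1/255, so `(x - mean) / std` directly produces the 0-255, mean-subtracted input the VGG weights expect. A minimal sketch of the arithmetic (not code from the repository):
```
import torch

# Sketch: folding the rescaling into the transform's std is numerically the
# same as the removed in-backbone `x *= 255` path.
mean = torch.tensor([0.48235, 0.45882, 0.40784]).view(3, 1, 1)
std = torch.tensor([1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0]).view(3, 1, 1)

x = torch.rand(3, 300, 300)       # a ToTensor-style image in [0, 1]

new_path = (x - mean) / std       # what the transform now computes
old_path = (x - mean) * 255.0     # old: transform subtracted the mean (std=1), backbone multiplied by 255

print(torch.allclose(new_path, old_path, atol=1e-3))  # True
```
A practical consequence visible in the diff is that the defaults are merged via `kwargs = {**defaults, **kwargs}`, so callers can still override `image_mean`/`image_std` without the backbone silently rescaling its input.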
 import torch
+import warnings
 
 from collections import OrderedDict
 from functools import partial
@@ -94,8 +95,7 @@ class SSDLiteRegressionHead(SSDScoringHead):
 
 
 class SSDLiteFeatureExtractorMobileNet(nn.Module):
-    def __init__(self, backbone: nn.Module, c4_pos: int, norm_layer: Callable[..., nn.Module], rescaling: bool,
-                 **kwargs: Any):
+    def __init__(self, backbone: nn.Module, c4_pos: int, norm_layer: Callable[..., nn.Module], **kwargs: Any):
         super().__init__()
         # non-public config parameters
         min_depth = kwargs.pop('_min_depth', 16)
@@ -117,13 +117,8 @@ class SSDLiteFeatureExtractorMobileNet(nn.Module):
         _normal_init(extra)
         self.extra = extra
-        self.rescaling = rescaling
 
     def forward(self, x: Tensor) -> Dict[str, Tensor]:
-        # Rescale from [0, 1] to [-1, -1]
-        if self.rescaling:
-            x = 2.0 * x - 1.0
-
         # Get feature maps from backbone and extra. Can't be refactored due to JIT limitations.
         output = []
         for block in self.features:
@@ -138,7 +133,7 @@ class SSDLiteFeatureExtractorMobileNet(nn.Module):
 
 
 def _mobilenet_extractor(backbone_name: str, progress: bool, pretrained: bool, trainable_layers: int,
-                         norm_layer: Callable[..., nn.Module], rescaling: bool, **kwargs: Any):
+                         norm_layer: Callable[..., nn.Module], **kwargs: Any):
     backbone = mobilenet.__dict__[backbone_name](pretrained=pretrained, progress=progress,
                                                  norm_layer=norm_layer, **kwargs).features
     if not pretrained:
@@ -158,7 +153,7 @@ def _mobilenet_extractor(backbone_name: str, progress: bool, pretrained: bool, t
         for parameter in b.parameters():
             parameter.requires_grad_(False)
 
-    return SSDLiteFeatureExtractorMobileNet(backbone, stage_indices[-2], norm_layer, rescaling, **kwargs)
+    return SSDLiteFeatureExtractorMobileNet(backbone, stage_indices[-2], norm_layer, **kwargs)
 
 
 def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = True, num_classes: int = 91,
@@ -166,7 +161,7 @@ def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = Tru
                                   norm_layer: Optional[Callable[..., nn.Module]] = None,
                                   **kwargs: Any):
     """
-    Constructs an SSDlite model with a MobileNetV3 Large backbone. See `SSD` for more details.
+    Constructs an SSDlite model with input size 320x320 and a MobileNetV3 Large backbone. See `SSD` for more details.
 
     Example:
@@ -186,20 +181,23 @@ def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = Tru
             Valid values are between 0 and 6, with 6 meaning all backbone layers are trainable.
         norm_layer (callable, optional): Module specifying the normalization layer to use.
     """
+    if "size" in kwargs:
+        warnings.warn("The size of the model is already fixed; ignoring the argument.")
+
     trainable_backbone_layers = _validate_trainable_layers(
         pretrained or pretrained_backbone, trainable_backbone_layers, 6, 6)
 
     if pretrained:
         pretrained_backbone = False
 
-    # Enable [-1, 1] rescaling and reduced tail if no pretrained backbone is selected
-    rescaling = reduce_tail = not pretrained_backbone
+    # Enable reduced tail if no pretrained backbone is selected
+    reduce_tail = not pretrained_backbone
 
     if norm_layer is None:
         norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)
 
     backbone = _mobilenet_extractor("mobilenet_v3_large", progress, pretrained_backbone, trainable_backbone_layers,
-                                    norm_layer, rescaling, _reduced_tail=reduce_tail, _width_mult=1.0)
+                                    norm_layer, _reduced_tail=reduce_tail, _width_mult=1.0)
 
     size = (320, 320)
     anchor_generator = DefaultBoxGenerator([[2, 3] for _ in range(6)], min_ratio=0.2, max_ratio=0.95)
@@ -212,8 +210,10 @@ def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = Tru
         "nms_thresh": 0.55,
         "detections_per_img": 300,
         "topk_candidates": 300,
-        "image_mean": [0., 0., 0.],
-        "image_std": [1., 1., 1.],
+        # Rescale the input in a way compatible to the backbone:
+        # The following mean/std rescale the data from [0, 1] to [-1, 1]
+        "image_mean": [0.5, 0.5, 0.5],
+        "image_std": [0.5, 0.5, 0.5],
     }
     kwargs = {**defaults, **kwargs}
     model = SSD(backbone, anchor_generator, size, num_classes,
...
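The SSDlite change is analogous: the [-1, 1] rescaling that used to happen conditionally inside `SSDLiteFeatureExtractorMobileNet.forward` is now expressed through the default `image_mean`/`image_std` of 0.5, since `(x - 0.5) / 0.5 == 2 * x - 1`. A small sketch of the equivalence (not code from the repository):
```
import torch

# Sketch: normalising with mean = std = 0.5 maps [0, 1] inputs to [-1, 1],
# exactly what the removed `x = 2.0 * x - 1.0` line did.
x = torch.rand(3, 320, 320)      # a ToTensor-style image in [0, 1]

new_path = (x - 0.5) / 0.5       # handled by the model's transform via the new defaults
old_path = 2.0 * x - 1.0         # old: done inside the feature extractor when rescaling=True

print(torch.allclose(new_path, old_path))              # True
print(float(new_path.min()), float(new_path.max()))    # stays within [-1, 1]
```
One behavioural difference visible in the diff: the old rescaling was applied only when no pretrained backbone was selected, whereas the new defaults apply it unconditionally unless the caller overrides `image_mean`/`image_std` through kwargs.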