Commit cc26cd81 authored by panning

merge v0.16.0

parents f78f29f5 fbb4cc54
......@@ -31,9 +31,9 @@ class MaskRCNN(FasterRCNN):
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes.
The behavior of the model changes depending if it is in training or evaluation mode.
The behavior of the model changes depending on if it is in training or evaluation mode.
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
During training, the model expects both the input tensors and targets (list of dictionary),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
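As a quick illustration of the input/target contract described above, here is a minimal, hypothetical training call; the toy image sizes and the single box/mask per image are made up for the example, and the ``masks`` key comes from the portion of the docstring elided in this hunk:

import torch
from torchvision.models.detection import maskrcnn_resnet50_fpn

model = maskrcnn_resnet50_fpn(weights=None, weights_backbone=None, num_classes=2)  # background + 1 class
model.train()
images = [torch.rand(3, 300, 400), torch.rand(3, 480, 640)]  # values in 0-1, sizes may differ
targets = []
for img in images:
    h, w = img.shape[-2:]
    mask = torch.zeros((1, h, w), dtype=torch.uint8)
    mask[0, 20:150, 10:100] = 1  # binary mask matching the box below
    targets.append(
        {
            "boxes": torch.tensor([[10.0, 20.0, 100.0, 150.0]]),  # [x1, y1, x2, y2]
            "labels": torch.tensor([1], dtype=torch.int64),
            "masks": mask,
        }
    )
loss_dict = model(images, targets)  # training mode returns a dict of losses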
......@@ -56,7 +56,7 @@ class MaskRCNN(FasterRCNN):
Args:
backbone (nn.Module): the network used to compute the features for the model.
It should contain a out_channels attribute, which indicates the number of output
It should contain an out_channels attribute, which indicates the number of output
channels that each feature map has (and it should be the same for all feature maps).
The backbone should return a single Tensor or an OrderedDict[Tensor].
num_classes (int): number of output classes of the model (including the background).
......@@ -123,7 +123,7 @@ class MaskRCNN(FasterRCNN):
>>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
>>> # MaskRCNN needs to know the number of
>>> # output channels in a backbone. For mobilenet_v2, it's 1280
>>> # so we need to add it here
>>> # so we need to add it here,
>>> backbone.out_channels = 1280
>>>
>>> # let's make the RPN generate 5 x 3 anchors per spatial
......@@ -370,6 +370,8 @@ class MaskRCNN_ResNet50_FPN_Weights(WeightsEnum):
"mask_map": 34.6,
}
},
"_ops": 134.38,
"_file_size": 169.84,
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
)
......@@ -390,6 +392,8 @@ class MaskRCNN_ResNet50_FPN_V2_Weights(WeightsEnum):
"mask_map": 41.8,
}
},
"_ops": 333.577,
"_file_size": 177.219,
"_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
},
)
......@@ -418,9 +422,9 @@ def maskrcnn_resnet50_fpn(
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
The behavior of the model changes depending if it is in training or evaluation mode.
The behavior of the model changes depending on if it is in training or evaluation mode.
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
During training, the model expects both the input tensors and targets (list of dictionary),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
......@@ -497,7 +501,7 @@ def maskrcnn_resnet50_fpn(
model = MaskRCNN(backbone, num_classes=num_classes, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
if weights == MaskRCNN_ResNet50_FPN_Weights.COCO_V1:
overwrite_eps(model, 0.0)
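The ``check_hash=True`` added throughout this merge is forwarded down to ``torch.hub.load_state_dict_from_url``, which verifies the hash fragment embedded in the weight file name; roughly, the download step behaves like this sketch (illustrative only, not the exact builder internals):

import torch
from torchvision.models.detection import MaskRCNN_ResNet50_FPN_Weights

weights = MaskRCNN_ResNet50_FPN_Weights.COCO_V1
# approximately what weights.get_state_dict(progress=True, check_hash=True) does:
state_dict = torch.hub.load_state_dict_from_url(weights.url, progress=True, check_hash=True)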
......@@ -578,17 +582,6 @@ def maskrcnn_resnet50_fpn_v2(
)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
model_urls = _ModelURLs(
{
"maskrcnn_resnet50_fpn_coco": MaskRCNN_ResNet50_FPN_Weights.COCO_V1.url,
}
)
......@@ -327,9 +327,9 @@ class RetinaNet(nn.Module):
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes.
The behavior of the model changes depending if it is in training or evaluation mode.
The behavior of the model changes depending on if it is in training or evaluation mode.
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
During training, the model expects both the input tensors and targets (list of dictionary),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
......@@ -382,7 +382,7 @@ class RetinaNet(nn.Module):
>>> # only the features
>>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
>>> # RetinaNet needs to know the number of
>>> # output channels in a backbone. For mobilenet_v2, it's 1280
>>> # output channels in a backbone. For mobilenet_v2, it's 1280,
>>> # so we need to add it here
>>> backbone.out_channels = 1280
>>>
......@@ -690,6 +690,8 @@ class RetinaNet_ResNet50_FPN_Weights(WeightsEnum):
"box_map": 36.4,
}
},
"_ops": 151.54,
"_file_size": 130.267,
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
)
......@@ -709,6 +711,8 @@ class RetinaNet_ResNet50_FPN_V2_Weights(WeightsEnum):
"box_map": 41.5,
}
},
"_ops": 152.238,
"_file_size": 146.037,
"_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
},
)
......@@ -739,9 +743,9 @@ def retinanet_resnet50_fpn(
The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
image, and should be in ``0-1`` range. Different images can have different sizes.
The behavior of the model changes depending if it is in training or evaluation mode.
The behavior of the model changes depending on if it is in training or evaluation mode.
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
During training, the model expects both the input tensors and targets (list of dictionary),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
......@@ -811,7 +815,7 @@ def retinanet_resnet50_fpn(
model = RetinaNet(backbone, num_classes, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
if weights == RetinaNet_ResNet50_FPN_Weights.COCO_V1:
overwrite_eps(model, 0.0)
......@@ -890,17 +894,6 @@ def retinanet_resnet50_fpn_v2(
model = RetinaNet(backbone, num_classes, anchor_generator=anchor_generator, head=head, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
model_urls = _ModelURLs(
{
"retinanet_resnet50_fpn_coco": RetinaNet_ResNet50_FPN_Weights.COCO_V1.url,
}
)
......@@ -315,7 +315,7 @@ def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched
valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
valid = torch.where(valid)[0]
# torch.mean (in binary_cross_entropy_with_logits) does'nt
# torch.mean (in binary_cross_entropy_with_logits) doesn't
# accept empty tensors, so handle it separately
if keypoint_targets.numel() == 0 or len(valid) == 0:
return keypoint_logits.sum() * 0
......@@ -746,7 +746,7 @@ class RoIHeads(nn.Module):
if not t["boxes"].dtype in floating_point_types:
raise TypeError(f"target boxes must of float type, instead got {t['boxes'].dtype}")
if not t["labels"].dtype == torch.int64:
raise TypeError("target labels must of int64 type, instead got {t['labels'].dtype}")
raise TypeError(f"target labels must of int64 type, instead got {t['labels'].dtype}")
if self.has_keypoint():
if not t["keypoints"].dtype == torch.float32:
raise TypeError(f"target keypoints must of float type, instead got {t['keypoints'].dtype}")
......@@ -787,7 +787,7 @@ class RoIHeads(nn.Module):
mask_proposals = [p["boxes"] for p in result]
if self.training:
if matched_idxs is None:
raise ValueError("if in trainning, matched_idxs should not be None")
raise ValueError("if in training, matched_idxs should not be None")
# during training, only focus on positive boxes
num_images = len(proposals)
......
......@@ -39,6 +39,8 @@ class SSD300_VGG16_Weights(WeightsEnum):
"box_map": 25.1,
}
},
"_ops": 34.858,
"_file_size": 135.988,
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
)
......@@ -126,12 +128,12 @@ class SSD(nn.Module):
Implements SSD architecture from `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes but they will be resized
image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
to a fixed size before passing it to the backbone.
The behavior of the model changes depending if it is in training or evaluation mode.
The behavior of the model changes depending on if it is in training or evaluation mode.
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
During training, the model expects both the input tensors and targets (list of dictionary),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
......@@ -554,7 +556,7 @@ def _vgg_extractor(backbone: VGG, highres: bool, trainable_layers: int):
stage_indices = [0] + [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)][:-1]
num_stages = len(stage_indices)
# find the index of the layer from which we wont freeze
# find the index of the layer from which we won't freeze
torch._assert(
0 <= trainable_layers <= num_stages,
f"trainable_layers should be in the range [0, {num_stages}]. Instead got {trainable_layers}",
......@@ -588,12 +590,12 @@ def ssd300_vgg16(
.. betastatus:: detection module
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes but they will be resized
image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
to a fixed size before passing it to the backbone.
The behavior of the model changes depending if it is in training or evaluation mode.
The behavior of the model changes depending on if it is in training or evaluation mode.
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
During training, the model expects both the input tensors and targets (list of dictionary),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
......@@ -675,28 +677,6 @@ def ssd300_vgg16(
model = SSD(backbone, anchor_generator, (300, 300), num_classes, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
model_urls = _ModelURLs(
{
"ssd300_vgg16_coco": SSD300_VGG16_Weights.COCO_V1.url,
}
)
backbone_urls = _ModelURLs(
{
# We port the features of a VGG16 backbone trained by amdegroot because unlike the one on TorchVision, it uses
# the same input standardization method as the paper.
# Ref: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth
# Only the `features` weights have proper values, those on the `classifier` module are filled with nans.
"vgg16_features": VGG16_Weights.IMAGENET1K_FEATURES.url,
}
)
......@@ -172,7 +172,7 @@ def _mobilenet_extractor(
stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
num_stages = len(stage_indices)
# find the index of the layer from which we wont freeze
# find the index of the layer from which we won't freeze
if not 0 <= trainable_layers <= num_stages:
raise ValueError("trainable_layers should be in the range [0, {num_stages}], instead got {trainable_layers}")
freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
......@@ -198,6 +198,8 @@ class SSDLite320_MobileNet_V3_Large_Weights(WeightsEnum):
"box_map": 21.3,
}
},
"_ops": 0.583,
"_file_size": 13.418,
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
)
......@@ -324,17 +326,6 @@ def ssdlite320_mobilenet_v3_large(
)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
model_urls = _ModelURLs(
{
"ssdlite320_mobilenet_v3_large_coco": SSDLite320_MobileNet_V3_Large_Weights.COCO_V1.url,
}
)
......@@ -24,8 +24,8 @@ def _fake_cast_onnx(v: Tensor) -> float:
def _resize_image_and_masks(
image: Tensor,
self_min_size: float,
self_max_size: float,
self_min_size: int,
self_max_size: int,
target: Optional[Dict[str, Tensor]] = None,
fixed_size: Optional[Tuple[int, int]] = None,
) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
......@@ -40,14 +40,24 @@ def _resize_image_and_masks(
if fixed_size is not None:
size = [fixed_size[1], fixed_size[0]]
else:
min_size = torch.min(im_shape).to(dtype=torch.float32)
max_size = torch.max(im_shape).to(dtype=torch.float32)
scale = torch.min(self_min_size / min_size, self_max_size / max_size)
if torch.jit.is_scripting() or torchvision._is_tracing():
min_size = torch.min(im_shape).to(dtype=torch.float32)
max_size = torch.max(im_shape).to(dtype=torch.float32)
self_min_size_f = float(self_min_size)
self_max_size_f = float(self_max_size)
scale = torch.min(self_min_size_f / min_size, self_max_size_f / max_size)
if torchvision._is_tracing():
scale_factor = _fake_cast_onnx(scale)
else:
scale_factor = scale.item()
if torchvision._is_tracing():
scale_factor = _fake_cast_onnx(scale)
else:
scale_factor = scale.item()
# Do it the normal way
min_size = min(im_shape)
max_size = max(im_shape)
scale_factor = min(self_min_size / min_size, self_max_size / max_size)
recompute_scale_factor = True
image = torch.nn.functional.interpolate(
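Both branches above compute the same scale; a plain-Python sketch of the selection logic (the helper name is ours, for illustration only):

def _pick_scale_factor(im_h: int, im_w: int, self_min_size: int, self_max_size: int) -> float:
    # scale the shorter side up to self_min_size, unless that would push the
    # longer side past self_max_size, in which case cap the scale there instead
    min_size = min(im_h, im_w)
    max_size = max(im_h, im_w)
    return min(self_min_size / min_size, self_max_size / max_size)

# e.g. with an 800/1333 budget, a 480x640 image is scaled by 800/480,
# while a very wide 480x2000 image is limited by 1333/2000 instead
assert abs(_pick_scale_factor(480, 640, 800, 1333) - 800 / 480) < 1e-6
assert abs(_pick_scale_factor(480, 2000, 800, 1333) - 1333 / 2000) < 1e-6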
......@@ -76,7 +86,7 @@ class GeneralizedRCNNTransform(nn.Module):
Performs input / target transformation before feeding the data to a GeneralizedRCNN
model.
The transformations it perform are:
The transformations it performs are:
- input normalization (mean subtraction and std division)
- input / target resizing to match min_size / max_size
......@@ -158,9 +168,8 @@ class GeneralizedRCNNTransform(nn.Module):
def torch_choice(self, k: List[int]) -> int:
"""
Implements `random.choice` via torch ops so it can be compiled with
TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
is fixed.
Implements `random.choice` via torch ops, so it can be compiled with
TorchScript and we use PyTorch's RNG (not native RNG)
"""
index = int(torch.empty(1).uniform_(0.0, float(len(k))).item())
return k[index]
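The docstring's point is that the size choice goes through PyTorch's RNG, so it respects ``torch.manual_seed``; a small illustrative check (the candidate sizes below are made up):

import torch

torch.manual_seed(0)
k = [480, 512, 544, 576, 608]
index = int(torch.empty(1).uniform_(0.0, float(len(k))).item())
print(k[index])  # reproducible for a fixed seed, unlike random.choice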
......@@ -174,11 +183,10 @@ class GeneralizedRCNNTransform(nn.Module):
if self.training:
if self._skip_resize:
return image, target
size = float(self.torch_choice(self.min_size))
size = self.torch_choice(self.min_size)
else:
# FIXME assume for now that testing uses the largest scale
size = float(self.min_size[-1])
image, target = _resize_image_and_masks(image, size, float(self.max_size), target, self.fixed_size)
size = self.min_size[-1]
image, target = _resize_image_and_masks(image, size, self.max_size, target, self.fixed_size)
if target is None:
return image, target
......
import copy
import math
import warnings
from dataclasses import dataclass
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
......@@ -239,7 +238,6 @@ class EfficientNet(nn.Module):
num_classes: int = 1000,
norm_layer: Optional[Callable[..., nn.Module]] = None,
last_channel: Optional[int] = None,
**kwargs: Any,
) -> None:
"""
EfficientNet V1 and V2 main class
......@@ -263,16 +261,6 @@ class EfficientNet(nn.Module):
):
raise TypeError("The inverted_residual_setting should be List[MBConvConfig]")
if "block" in kwargs:
warnings.warn(
"The parameter 'block' is deprecated since 0.13 and will be removed 0.15. "
"Please pass this information on 'MBConvConfig.block' instead."
)
if kwargs["block"] is not None:
for s in inverted_residual_setting:
if isinstance(s, MBConvConfig):
s.block = kwargs["block"]
if norm_layer is None:
norm_layer = nn.BatchNorm2d
......@@ -369,7 +357,7 @@ def _efficientnet(
model = EfficientNet(inverted_residual_setting, dropout, last_channel=last_channel, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
......@@ -464,6 +452,8 @@ class EfficientNet_B0_Weights(WeightsEnum):
"acc@5": 93.532,
}
},
"_ops": 0.386,
"_file_size": 20.451,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -473,7 +463,7 @@ class EfficientNet_B0_Weights(WeightsEnum):
class EfficientNet_B1_Weights(WeightsEnum):
IMAGENET1K_V1 = Weights(
# Weights ported from https://github.com/rwightman/pytorch-image-models/
url="https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth",
url="https://download.pytorch.org/models/efficientnet_b1_rwightman-bac287d4.pth",
transforms=partial(
ImageClassification, crop_size=240, resize_size=256, interpolation=InterpolationMode.BICUBIC
),
......@@ -486,6 +476,8 @@ class EfficientNet_B1_Weights(WeightsEnum):
"acc@5": 94.186,
}
},
"_ops": 0.687,
"_file_size": 30.134,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -504,6 +496,8 @@ class EfficientNet_B1_Weights(WeightsEnum):
"acc@5": 94.934,
}
},
"_ops": 0.687,
"_file_size": 30.136,
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
`new training recipe
......@@ -530,6 +524,8 @@ class EfficientNet_B2_Weights(WeightsEnum):
"acc@5": 95.310,
}
},
"_ops": 1.088,
"_file_size": 35.174,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -552,6 +548,8 @@ class EfficientNet_B3_Weights(WeightsEnum):
"acc@5": 96.054,
}
},
"_ops": 1.827,
"_file_size": 47.184,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -574,6 +572,8 @@ class EfficientNet_B4_Weights(WeightsEnum):
"acc@5": 96.594,
}
},
"_ops": 4.394,
"_file_size": 74.489,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -596,6 +596,8 @@ class EfficientNet_B5_Weights(WeightsEnum):
"acc@5": 96.628,
}
},
"_ops": 10.266,
"_file_size": 116.864,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -618,6 +620,8 @@ class EfficientNet_B6_Weights(WeightsEnum):
"acc@5": 96.916,
}
},
"_ops": 19.068,
"_file_size": 165.362,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -640,6 +644,8 @@ class EfficientNet_B7_Weights(WeightsEnum):
"acc@5": 96.908,
}
},
"_ops": 37.746,
"_file_size": 254.675,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -664,6 +670,8 @@ class EfficientNet_V2_S_Weights(WeightsEnum):
"acc@5": 96.878,
}
},
"_ops": 8.366,
"_file_size": 82.704,
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
`new training recipe
......@@ -692,6 +700,8 @@ class EfficientNet_V2_M_Weights(WeightsEnum):
"acc@5": 97.156,
}
},
"_ops": 24.582,
"_file_size": 208.01,
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
`new training recipe
......@@ -723,6 +733,8 @@ class EfficientNet_V2_L_Weights(WeightsEnum):
"acc@5": 97.788,
}
},
"_ops": 56.08,
"_file_size": 454.573,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -755,7 +767,9 @@ def efficientnet_b0(
weights = EfficientNet_B0_Weights.verify(weights)
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b0", width_mult=1.0, depth_mult=1.0)
return _efficientnet(inverted_residual_setting, 0.2, last_channel, weights, progress, **kwargs)
return _efficientnet(
inverted_residual_setting, kwargs.pop("dropout", 0.2), last_channel, weights, progress, **kwargs
)
@register_model()
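With ``kwargs.pop("dropout", ...)`` the classifier dropout becomes caller-overridable while the old value stays the default; a quick sketch of the usage this enables (``weights=None`` keeps the example download-free):

from torchvision.models import efficientnet_b0

model = efficientnet_b0(weights=None)                            # unchanged default: dropout=0.2 for B0
model_more_dropout = efficientnet_b0(weights=None, dropout=0.5)  # now accepted without a duplicate-argument error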
......@@ -784,7 +798,9 @@ def efficientnet_b1(
weights = EfficientNet_B1_Weights.verify(weights)
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b1", width_mult=1.0, depth_mult=1.1)
return _efficientnet(inverted_residual_setting, 0.2, last_channel, weights, progress, **kwargs)
return _efficientnet(
inverted_residual_setting, kwargs.pop("dropout", 0.2), last_channel, weights, progress, **kwargs
)
@register_model()
......@@ -813,7 +829,9 @@ def efficientnet_b2(
weights = EfficientNet_B2_Weights.verify(weights)
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b2", width_mult=1.1, depth_mult=1.2)
return _efficientnet(inverted_residual_setting, 0.3, last_channel, weights, progress, **kwargs)
return _efficientnet(
inverted_residual_setting, kwargs.pop("dropout", 0.3), last_channel, weights, progress, **kwargs
)
@register_model()
......@@ -842,7 +860,14 @@ def efficientnet_b3(
weights = EfficientNet_B3_Weights.verify(weights)
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b3", width_mult=1.2, depth_mult=1.4)
return _efficientnet(inverted_residual_setting, 0.3, last_channel, weights, progress, **kwargs)
return _efficientnet(
inverted_residual_setting,
kwargs.pop("dropout", 0.3),
last_channel,
weights,
progress,
**kwargs,
)
@register_model()
......@@ -871,7 +896,14 @@ def efficientnet_b4(
weights = EfficientNet_B4_Weights.verify(weights)
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b4", width_mult=1.4, depth_mult=1.8)
return _efficientnet(inverted_residual_setting, 0.4, last_channel, weights, progress, **kwargs)
return _efficientnet(
inverted_residual_setting,
kwargs.pop("dropout", 0.4),
last_channel,
weights,
progress,
**kwargs,
)
@register_model()
......@@ -902,7 +934,7 @@ def efficientnet_b5(
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b5", width_mult=1.6, depth_mult=2.2)
return _efficientnet(
inverted_residual_setting,
0.4,
kwargs.pop("dropout", 0.4),
last_channel,
weights,
progress,
......@@ -939,7 +971,7 @@ def efficientnet_b6(
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b6", width_mult=1.8, depth_mult=2.6)
return _efficientnet(
inverted_residual_setting,
0.5,
kwargs.pop("dropout", 0.5),
last_channel,
weights,
progress,
......@@ -976,7 +1008,7 @@ def efficientnet_b7(
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_b7", width_mult=2.0, depth_mult=3.1)
return _efficientnet(
inverted_residual_setting,
0.5,
kwargs.pop("dropout", 0.5),
last_channel,
weights,
progress,
......@@ -1014,7 +1046,7 @@ def efficientnet_v2_s(
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_s")
return _efficientnet(
inverted_residual_setting,
0.2,
kwargs.pop("dropout", 0.2),
last_channel,
weights,
progress,
......@@ -1052,7 +1084,7 @@ def efficientnet_v2_m(
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_m")
return _efficientnet(
inverted_residual_setting,
0.3,
kwargs.pop("dropout", 0.3),
last_channel,
weights,
progress,
......@@ -1090,28 +1122,10 @@ def efficientnet_v2_l(
inverted_residual_setting, last_channel = _efficientnet_conf("efficientnet_v2_l")
return _efficientnet(
inverted_residual_setting,
0.4,
kwargs.pop("dropout", 0.4),
last_channel,
weights,
progress,
norm_layer=partial(nn.BatchNorm2d, eps=1e-03),
**kwargs,
)
# The dictionary below is internal implementation detail and will be removed in v0.15
from ._utils import _ModelURLs
model_urls = _ModelURLs(
{
"efficientnet_b0": EfficientNet_B0_Weights.IMAGENET1K_V1.url,
"efficientnet_b1": EfficientNet_B1_Weights.IMAGENET1K_V1.url,
"efficientnet_b2": EfficientNet_B2_Weights.IMAGENET1K_V1.url,
"efficientnet_b3": EfficientNet_B3_Weights.IMAGENET1K_V1.url,
"efficientnet_b4": EfficientNet_B4_Weights.IMAGENET1K_V1.url,
"efficientnet_b5": EfficientNet_B5_Weights.IMAGENET1K_V1.url,
"efficientnet_b6": EfficientNet_B6_Weights.IMAGENET1K_V1.url,
"efficientnet_b7": EfficientNet_B7_Weights.IMAGENET1K_V1.url,
}
)
......@@ -18,7 +18,7 @@ __all__ = ["create_feature_extractor", "get_graph_node_names"]
class LeafModuleAwareTracer(fx.Tracer):
"""
An fx.Tracer that allows the user to specify a set of leaf modules, ie.
An fx.Tracer that allows the user to specify a set of leaf modules, i.e.
modules that are not to be traced through. The resulting graph ends up
having single nodes referencing calls to the leaf modules' forward methods.
"""
......@@ -103,7 +103,7 @@ class NodePathTracer(LeafModuleAwareTracer):
if node.op != "call_module":
# In this case module_qualname from torch.fx doesn't go all the
# way to the leaf function/op so we need to append it
# way to the leaf function/op, so we need to append it
if len(node_qualname) > 0:
# Only append '.' if we are deeper than the top level module
node_qualname += "."
......@@ -136,7 +136,7 @@ class NodePathTracer(LeafModuleAwareTracer):
def _is_subseq(x, y):
"""Check if y is a subseqence of x
"""Check if y is a subsequence of x
https://stackoverflow.com/a/24017747/4391249
"""
iter_x = iter(x)
......@@ -228,7 +228,7 @@ def get_graph_node_names(
tracer_kwargs (dict, optional): a dictionary of keyword arguments for
``NodePathTracer`` (they are eventually passed onto
`torch.fx.Tracer <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer>`_).
By default it will be set to wrap and make leaf nodes all torchvision ops:
By default, it will be set to wrap and make leaf nodes all torchvision ops:
{"autowrap_modules": (math, torchvision.ops,),"leaf_modules": _get_leaf_modules_for_ops(),}
WARNING: In case the user provides tracer_kwargs, above default arguments will be appended to the user
provided dictionary.
......@@ -391,7 +391,7 @@ def create_feature_extractor(
tracer_kwargs (dict, optional): a dictionary of keyword arguments for
``NodePathTracer`` (which passes them onto it's parent class
`torch.fx.Tracer <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer>`_).
By default it will be set to wrap and make leaf nodes all torchvision ops:
By default, it will be set to wrap and make leaf nodes all torchvision ops:
{"autowrap_modules": (math, torchvision.ops,),"leaf_modules": _get_leaf_modules_for_ops(),}
WARNING: In case the user provides tracer_kwargs, above default arguments will be appended to the user
provided dictionary.
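For orientation, a minimal sketch of the two helpers documented above; the ResNet-50 node names are just an example, and ``get_graph_node_names`` can be used to list the valid ones:

import torch
import torchvision
from torchvision.models.feature_extraction import create_feature_extractor, get_graph_node_names

model = torchvision.models.resnet50(weights=None)
train_nodes, eval_nodes = get_graph_node_names(model)  # inspect the available node names
extractor = create_feature_extractor(model, return_nodes={"layer2": "feat2", "layer4": "feat4"})
out = extractor(torch.rand(1, 3, 224, 224))
print({k: tuple(v.shape) for k, v in out.items()})  # feat2: (1, 512, 28, 28), feat4: (1, 2048, 7, 7)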
......@@ -544,7 +544,7 @@ def create_feature_extractor(
graph_module.graph.eliminate_dead_code()
graph_module.recompile()
# Keep track of the tracer and graph so we can choose the main one
# Keep track of the tracer and graph, so we can choose the main one
tracers[mode] = tracer
graphs[mode] = graph
......
......@@ -290,6 +290,8 @@ class GoogLeNet_Weights(WeightsEnum):
"acc@5": 89.530,
}
},
"_ops": 1.498,
"_file_size": 49.731,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -330,7 +332,7 @@ def googlenet(*, weights: Optional[GoogLeNet_Weights] = None, progress: bool = T
model = GoogLeNet(**kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
if not original_aux_logits:
model.aux_logits = False
model.aux1 = None # type: ignore[assignment]
......@@ -341,15 +343,3 @@ def googlenet(*, weights: Optional[GoogLeNet_Weights] = None, progress: bool = T
)
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from ._utils import _ModelURLs
model_urls = _ModelURLs(
{
# GoogLeNet ported from TensorFlow
"googlenet": GoogLeNet_Weights.IMAGENET1K_V1.url,
}
)
......@@ -48,7 +48,7 @@ class Inception3(nn.Module):
)
init_weights = True
if len(inception_blocks) != 7:
raise ValueError(f"lenght of inception_blocks should be 7 instead of {len(inception_blocks)}")
raise ValueError(f"length of inception_blocks should be 7 instead of {len(inception_blocks)}")
conv_block = inception_blocks[0]
inception_a = inception_blocks[1]
inception_b = inception_blocks[2]
......@@ -422,6 +422,8 @@ class Inception_V3_Weights(WeightsEnum):
"acc@5": 93.450,
}
},
"_ops": 5.713,
"_file_size": 103.903,
"_docs": """These weights are ported from the original paper.""",
},
)
......@@ -468,21 +470,9 @@ def inception_v3(*, weights: Optional[Inception_V3_Weights] = None, progress: bo
model = Inception3(**kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
if not original_aux_logits:
model.aux_logits = False
model.AuxLogits = None
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from ._utils import _ModelURLs
model_urls = _ModelURLs(
{
# Inception v3 ported from TensorFlow
"inception_v3_google": Inception_V3_Weights.IMAGENET1K_V1.url,
}
)
import math
from collections import OrderedDict
from functools import partial
from typing import Any, Callable, List, Optional, OrderedDict, Sequence, Tuple
from typing import Any, Callable, List, Optional, Sequence, Tuple
import numpy as np
import torch
......@@ -300,7 +301,7 @@ class PartitionAttentionLayer(nn.Module):
self,
in_channels: int,
head_dim: int,
# partitioning parameteres
# partitioning parameters
partition_size: int,
partition_type: str,
# grid size needs to be known at initialization time
......@@ -426,7 +427,7 @@ class MaxVitLayer(nn.Module):
) -> None:
super().__init__()
layers: OrderedDict[str, Any] = OrderedDict() # type: ignore
layers: OrderedDict = OrderedDict()
# convolutional layer
layers["MBconv"] = MBConv(
......@@ -762,7 +763,7 @@ def _maxvit(
)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
......@@ -785,6 +786,8 @@ class MaxVit_T_Weights(WeightsEnum):
"acc@5": 96.722,
}
},
"_ops": 5.558,
"_file_size": 118.769,
"_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
},
)
......
......@@ -88,14 +88,14 @@ def _round_to_multiple_of(val: float, divisor: int, round_up_bias: float = 0.9)
def _get_depths(alpha: float) -> List[int]:
"""Scales tensor depths as in reference MobileNet code, prefers rouding up
"""Scales tensor depths as in reference MobileNet code, prefers rounding up
rather than down."""
depths = [32, 16, 24, 40, 80, 96, 192, 320]
return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]
class MNASNet(torch.nn.Module):
"""MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This
"""MNASNet, as described in https://arxiv.org/abs/1807.11626. This
implements the B1 variant of the model.
>>> model = MNASNet(1.0, num_classes=1000)
>>> x = torch.rand(1, 3, 224, 224)
......@@ -231,6 +231,8 @@ class MNASNet0_5_Weights(WeightsEnum):
"acc@5": 87.490,
}
},
"_ops": 0.104,
"_file_size": 8.591,
"_docs": """These weights reproduce closely the results of the paper.""",
},
)
......@@ -251,6 +253,8 @@ class MNASNet0_75_Weights(WeightsEnum):
"acc@5": 90.496,
}
},
"_ops": 0.215,
"_file_size": 12.303,
"_docs": """
These weights were trained from scratch by using TorchVision's `new training recipe
<https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
......@@ -273,6 +277,8 @@ class MNASNet1_0_Weights(WeightsEnum):
"acc@5": 91.510,
}
},
"_ops": 0.314,
"_file_size": 16.915,
"_docs": """These weights reproduce closely the results of the paper.""",
},
)
......@@ -293,6 +299,8 @@ class MNASNet1_3_Weights(WeightsEnum):
"acc@5": 93.522,
}
},
"_ops": 0.526,
"_file_size": 24.246,
"_docs": """
These weights were trained from scratch by using TorchVision's `new training recipe
<https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
......@@ -309,7 +317,7 @@ def _mnasnet(alpha: float, weights: Optional[WeightsEnum], progress: bool, **kwa
model = MNASNet(alpha, **kwargs)
if weights:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
......@@ -319,7 +327,7 @@ def _mnasnet(alpha: float, weights: Optional[WeightsEnum], progress: bool, **kwa
def mnasnet0_5(*, weights: Optional[MNASNet0_5_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
"""MNASNet with depth multiplier of 0.5 from
`MnasNet: Platform-Aware Neural Architecture Search for Mobile
<https://arxiv.org/pdf/1807.11626.pdf>`_ paper.
<https://arxiv.org/abs/1807.11626>`_ paper.
Args:
weights (:class:`~torchvision.models.MNASNet0_5_Weights`, optional): The
......@@ -347,7 +355,7 @@ def mnasnet0_5(*, weights: Optional[MNASNet0_5_Weights] = None, progress: bool =
def mnasnet0_75(*, weights: Optional[MNASNet0_75_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
"""MNASNet with depth multiplier of 0.75 from
`MnasNet: Platform-Aware Neural Architecture Search for Mobile
<https://arxiv.org/pdf/1807.11626.pdf>`_ paper.
<https://arxiv.org/abs/1807.11626>`_ paper.
Args:
weights (:class:`~torchvision.models.MNASNet0_75_Weights`, optional): The
......@@ -375,7 +383,7 @@ def mnasnet0_75(*, weights: Optional[MNASNet0_75_Weights] = None, progress: bool
def mnasnet1_0(*, weights: Optional[MNASNet1_0_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
"""MNASNet with depth multiplier of 1.0 from
`MnasNet: Platform-Aware Neural Architecture Search for Mobile
<https://arxiv.org/pdf/1807.11626.pdf>`_ paper.
<https://arxiv.org/abs/1807.11626>`_ paper.
Args:
weights (:class:`~torchvision.models.MNASNet1_0_Weights`, optional): The
......@@ -403,7 +411,7 @@ def mnasnet1_0(*, weights: Optional[MNASNet1_0_Weights] = None, progress: bool =
def mnasnet1_3(*, weights: Optional[MNASNet1_3_Weights] = None, progress: bool = True, **kwargs: Any) -> MNASNet:
"""MNASNet with depth multiplier of 1.3 from
`MnasNet: Platform-Aware Neural Architecture Search for Mobile
<https://arxiv.org/pdf/1807.11626.pdf>`_ paper.
<https://arxiv.org/abs/1807.11626>`_ paper.
Args:
weights (:class:`~torchvision.models.MNASNet1_3_Weights`, optional): The
......
......@@ -23,7 +23,7 @@ class InvertedResidual(nn.Module):
super().__init__()
self.stride = stride
if stride not in [1, 2]:
raise ValueError(f"stride should be 1 or 2 insted of {stride}")
raise ValueError(f"stride should be 1 or 2 instead of {stride}")
if norm_layer is None:
norm_layer = nn.BatchNorm2d
......@@ -194,6 +194,8 @@ class MobileNet_V2_Weights(WeightsEnum):
"acc@5": 90.286,
}
},
"_ops": 0.301,
"_file_size": 13.555,
"_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
},
)
......@@ -209,6 +211,8 @@ class MobileNet_V2_Weights(WeightsEnum):
"acc@5": 90.822,
}
},
"_ops": 0.301,
"_file_size": 13.598,
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
`new training recipe
......@@ -251,17 +255,6 @@ def mobilenet_v2(
model = MobileNetV2(**kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from ._utils import _ModelURLs
model_urls = _ModelURLs(
{
"mobilenet_v2": MobileNet_V2_Weights.IMAGENET1K_V1.url,
}
)
......@@ -282,7 +282,7 @@ def _mobilenet_v3(
model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
......@@ -307,6 +307,8 @@ class MobileNet_V3_Large_Weights(WeightsEnum):
"acc@5": 91.340,
}
},
"_ops": 0.217,
"_file_size": 21.114,
"_docs": """These weights were trained from scratch by using a simple training recipe.""",
},
)
......@@ -323,6 +325,8 @@ class MobileNet_V3_Large_Weights(WeightsEnum):
"acc@5": 92.566,
}
},
"_ops": 0.217,
"_file_size": 21.107,
"_docs": """
These weights improve marginally upon the results of the original paper by using a modified version of
TorchVision's `new training recipe
......@@ -347,6 +351,8 @@ class MobileNet_V3_Small_Weights(WeightsEnum):
"acc@5": 87.402,
}
},
"_ops": 0.057,
"_file_size": 9.829,
"_docs": """
These weights improve upon the results of the original paper by using a simple training recipe.
""",
......@@ -372,7 +378,7 @@ def mobilenet_v3_large(
weights are used.
progress (bool, optional): If True, displays a progress bar of the
download to stderr. Default is True.
**kwargs: parameters passed to the ``torchvision.models.resnet.MobileNetV3``
**kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
base class. Please refer to the `source code
<https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
for more details about this class.
......@@ -403,7 +409,7 @@ def mobilenet_v3_small(
weights are used.
progress (bool, optional): If True, displays a progress bar of the
download to stderr. Default is True.
**kwargs: parameters passed to the ``torchvision.models.resnet.MobileNetV3``
**kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
base class. Please refer to the `source code
<https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
for more details about this class.
......@@ -415,15 +421,3 @@ def mobilenet_v3_small(
inverted_residual_setting, last_channel = _mobilenet_v3_conf("mobilenet_v3_small", **kwargs)
return _mobilenet_v3(inverted_residual_setting, last_channel, weights, progress, **kwargs)
# The dictionary below is internal implementation detail and will be removed in v0.15
from ._utils import _ModelURLs
model_urls = _ModelURLs(
{
"mobilenet_v3_large": MobileNet_V3_Large_Weights.IMAGENET1K_V1.url,
"mobilenet_v3_small": MobileNet_V3_Small_Weights.IMAGENET1K_V1.url,
}
)
......@@ -35,7 +35,7 @@ class ResidualBlock(nn.Module):
# But in the RAFT training reference, the BatchNorm2d layers are only activated for the first dataset,
# and frozen for the rest of the training process (i.e. set as eval()). The bias term is thus still useful
# for the rest of the datasets. Technically, we could remove the bias for other norm layers like Instance norm
# because these aren't frozen, but we don't bother (also, we woudn't be able to load the original weights).
# because these aren't frozen, but we don't bother (also, we wouldn't be able to load the original weights).
self.convnormrelu1 = Conv2dNormActivation(
in_channels, out_channels, norm_layer=norm_layer, kernel_size=3, stride=stride, bias=True
)
......@@ -318,7 +318,7 @@ class MaskPredictor(nn.Module):
def __init__(self, *, in_channels, hidden_size, multiplier=0.25):
super().__init__()
self.convrelu = Conv2dNormActivation(in_channels, hidden_size, norm_layer=None, kernel_size=3)
# 8 * 8 * 9 because the predicted flow is downsampled by 8, from the downsampling of the initial FeatureEncoder
# 8 * 8 * 9 because the predicted flow is downsampled by 8, from the downsampling of the initial FeatureEncoder,
# and we interpolate with all 9 surrounding neighbors. See paper and appendix B.
self.conv = nn.Conv2d(hidden_size, 8 * 8 * 9, 1, padding=0)
......@@ -369,6 +369,19 @@ class CorrBlock(nn.Module):
raise ValueError(
f"Input feature maps should have the same shape, instead got {fmap1.shape} (fmap1.shape) != {fmap2.shape} (fmap2.shape)"
)
# Explaining min_fmap_size below: the fmaps are down-sampled (num_levels - 1) times by a factor of 2.
# The last corr_volume must have at least 2 values (hence the 2* factor), otherwise grid_sample() would
# produce nans in its output.
min_fmap_size = 2 * (2 ** (self.num_levels - 1))
if any(fmap_size < min_fmap_size for fmap_size in fmap1.shape[-2:]):
raise ValueError(
"Feature maps are too small to be down-sampled by the correlation pyramid. "
f"H and W of feature maps should be at least {min_fmap_size}; got: {fmap1.shape[-2:]}. "
"Remember that input images to the model are downsampled by 8, so that means their "
f"dimensions should be at least 8 * {min_fmap_size} = {8 * min_fmap_size}."
)
corr_volume = self._compute_corr_volume(fmap1, fmap2)
batch_size, h, w, num_channels, _, _ = corr_volume.shape # _, _ = h, w
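To make the new check concrete: with the usual four pyramid levels (``num_levels = 4`` is our assumption here; adjust for other configurations), the thresholds work out as in this sketch:

num_levels = 4  # assumed correlation-pyramid depth
min_fmap_size = 2 * (2 ** (num_levels - 1))  # = 16: the last level must keep >= 2 values per dim
min_image_size = 8 * min_fmap_size           # = 128: inputs are downsampled by 8 before the pyramid
print(min_fmap_size, min_image_size)         # 16 128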
......@@ -430,7 +443,7 @@ class RAFT(nn.Module):
Its input is ``image1``. As in the original implementation, its output will be split into 2 parts:
- one part will be used as the actual "context", passed to the recurrent unit of the ``update_block``
- one part will be used to initialize the hidden state of the of the recurrent unit of
- one part will be used to initialize the hidden state of the recurrent unit of
the ``update_block``
These 2 parts are split according to the ``hidden_state_size`` of the ``update_block``, so the output
......@@ -474,7 +487,7 @@ class RAFT(nn.Module):
if (h, w) != image2.shape[-2:]:
raise ValueError(f"input images should have the same shape, instead got ({h}, {w}) != {image2.shape[-2:]}")
if not (h % 8 == 0) and (w % 8 == 0):
raise ValueError(f"input image H and W should be divisible by 8, insted got {h} (h) and {w} (w)")
raise ValueError(f"input image H and W should be divisible by 8, instead got {h} (h) and {w} (w)")
fmaps = self.feature_encoder(torch.cat([image1, image2], dim=0))
fmap1, fmap2 = torch.chunk(fmaps, chunks=2, dim=0)
......@@ -552,6 +565,8 @@ class Raft_Large_Weights(WeightsEnum):
"Sintel-Train-Finalpass": {"epe": 2.7894},
"Kitti-Train": {"per_image_epe": 5.0172, "fl_all": 17.4506},
},
"_ops": 211.007,
"_file_size": 20.129,
"_docs": """These weights were ported from the original paper. They
are trained on :class:`~torchvision.datasets.FlyingChairs` +
:class:`~torchvision.datasets.FlyingThings3D`.""",
......@@ -570,6 +585,8 @@ class Raft_Large_Weights(WeightsEnum):
"Sintel-Train-Finalpass": {"epe": 2.7161},
"Kitti-Train": {"per_image_epe": 4.5118, "fl_all": 16.0679},
},
"_ops": 211.007,
"_file_size": 20.129,
"_docs": """These weights were trained from scratch on
:class:`~torchvision.datasets.FlyingChairs` +
:class:`~torchvision.datasets.FlyingThings3D`.""",
......@@ -588,6 +605,8 @@ class Raft_Large_Weights(WeightsEnum):
"Sintel-Test-Cleanpass": {"epe": 1.94},
"Sintel-Test-Finalpass": {"epe": 3.18},
},
"_ops": 211.007,
"_file_size": 20.129,
"_docs": """
These weights were ported from the original paper. They are
trained on :class:`~torchvision.datasets.FlyingChairs` +
......@@ -612,6 +631,8 @@ class Raft_Large_Weights(WeightsEnum):
"Sintel-Test-Cleanpass": {"epe": 1.819},
"Sintel-Test-Finalpass": {"epe": 3.067},
},
"_ops": 211.007,
"_file_size": 20.129,
"_docs": """
These weights were trained from scratch. They are
pre-trained on :class:`~torchvision.datasets.FlyingChairs` +
......@@ -636,6 +657,8 @@ class Raft_Large_Weights(WeightsEnum):
"_metrics": {
"Kitti-Test": {"fl_all": 5.10},
},
"_ops": 211.007,
"_file_size": 20.129,
"_docs": """
These weights were ported from the original paper. They are
pre-trained on :class:`~torchvision.datasets.FlyingChairs` +
......@@ -657,6 +680,8 @@ class Raft_Large_Weights(WeightsEnum):
"_metrics": {
"Kitti-Test": {"fl_all": 5.19},
},
"_ops": 211.007,
"_file_size": 20.129,
"_docs": """
These weights were trained from scratch. They are
pre-trained on :class:`~torchvision.datasets.FlyingChairs` +
......@@ -698,6 +723,8 @@ class Raft_Small_Weights(WeightsEnum):
"Sintel-Train-Finalpass": {"epe": 3.2790},
"Kitti-Train": {"per_image_epe": 7.6557, "fl_all": 25.2801},
},
"_ops": 47.655,
"_file_size": 3.821,
"_docs": """These weights were ported from the original paper. They
are trained on :class:`~torchvision.datasets.FlyingChairs` +
:class:`~torchvision.datasets.FlyingThings3D`.""",
......@@ -715,6 +742,8 @@ class Raft_Small_Weights(WeightsEnum):
"Sintel-Train-Finalpass": {"epe": 3.2831},
"Kitti-Train": {"per_image_epe": 7.5978, "fl_all": 25.2369},
},
"_ops": 47.655,
"_file_size": 3.821,
"_docs": """These weights were trained from scratch on
:class:`~torchvision.datasets.FlyingChairs` +
:class:`~torchvision.datasets.FlyingThings3D`.""",
......@@ -802,7 +831,7 @@ def _raft(
)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
......
......@@ -108,7 +108,7 @@ class QuantizableGoogLeNet(GoogLeNet):
class GoogLeNet_QuantizedWeights(WeightsEnum):
IMAGENET1K_FBGEMM_V1 = Weights(
url="https://download.pytorch.org/models/quantized/googlenet_fbgemm-c00238cf.pth",
url="https://download.pytorch.org/models/quantized/googlenet_fbgemm-c81f6644.pth",
transforms=partial(ImageClassification, crop_size=224),
meta={
"num_params": 6624904,
......@@ -123,6 +123,8 @@ class GoogLeNet_QuantizedWeights(WeightsEnum):
"acc@5": 89.404,
}
},
"_ops": 1.498,
"_file_size": 12.618,
"_docs": """
These weights were produced by doing Post Training Quantization (eager mode) on top of the unquantized
weights listed below.
......@@ -195,7 +197,7 @@ def googlenet(
quantize_model(model, backend)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
if not original_aux_logits:
model.aux_logits = False
model.aux1 = None # type: ignore[assignment]
......@@ -206,16 +208,3 @@ def googlenet(
)
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
from ..googlenet import model_urls # noqa: F401
quant_model_urls = _ModelURLs(
{
# fp32 GoogLeNet ported from TensorFlow, with weights quantized in PyTorch
"googlenet_fbgemm": GoogLeNet_QuantizedWeights.IMAGENET1K_FBGEMM_V1.url,
}
)
......@@ -168,7 +168,7 @@ class QuantizableInception3(inception_module.Inception3):
class Inception_V3_QuantizedWeights(WeightsEnum):
IMAGENET1K_FBGEMM_V1 = Weights(
url="https://download.pytorch.org/models/quantized/inception_v3_google_fbgemm-71447a44.pth",
url="https://download.pytorch.org/models/quantized/inception_v3_google_fbgemm-a2837893.pth",
transforms=partial(ImageClassification, crop_size=299, resize_size=342),
meta={
"num_params": 27161264,
......@@ -183,6 +183,8 @@ class Inception_V3_QuantizedWeights(WeightsEnum):
"acc@5": 93.354,
}
},
"_ops": 5.713,
"_file_size": 23.146,
"_docs": """
These weights were produced by doing Post Training Quantization (eager mode) on top of the unquantized
weights listed below.
......@@ -263,22 +265,9 @@ def inception_v3(
if quantize and not original_aux_logits:
model.aux_logits = False
model.AuxLogits = None
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
if not quantize and not original_aux_logits:
model.aux_logits = False
model.AuxLogits = None
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
from ..inception import model_urls # noqa: F401
quant_model_urls = _ModelURLs(
{
# fp32 weights ported from TensorFlow, quantized in PyTorch
"inception_v3_google_fbgemm": Inception_V3_QuantizedWeights.IMAGENET1K_FBGEMM_V1.url,
}
)
......@@ -80,6 +80,8 @@ class MobileNet_V2_QuantizedWeights(WeightsEnum):
"acc@5": 90.150,
}
},
"_ops": 0.301,
"_file_size": 3.423,
"_docs": """
These weights were produced by doing Quantization Aware Training (eager mode) on top of the unquantized
weights listed below.
......@@ -147,18 +149,6 @@ def mobilenet_v2(
quantize_model(model, backend)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
from ..mobilenetv2 import model_urls # noqa: F401
quant_model_urls = _ModelURLs(
{
"mobilenet_v2_qnnpack": MobileNet_V2_QuantizedWeights.IMAGENET1K_QNNPACK_V1.url,
}
)
......@@ -149,7 +149,7 @@ def _mobilenet_v3_model(
torch.ao.quantization.prepare_qat(model, inplace=True)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
if quantize:
torch.ao.quantization.convert(model, inplace=True)
......@@ -175,6 +175,8 @@ class MobileNet_V3_Large_QuantizedWeights(WeightsEnum):
"acc@5": 90.858,
}
},
"_ops": 0.217,
"_file_size": 21.554,
"_docs": """
These weights were produced by doing Quantization Aware Training (eager mode) on top of the unquantized
weights listed below.
......@@ -233,15 +235,3 @@ def mobilenet_v3_large(
inverted_residual_setting, last_channel = _mobilenet_v3_conf("mobilenet_v3_large", **kwargs)
return _mobilenet_v3_model(inverted_residual_setting, last_channel, weights, progress, quantize, **kwargs)
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
from ..mobilenetv3 import model_urls # noqa: F401
quant_model_urls = _ModelURLs(
{
"mobilenet_v3_large_qnnpack": MobileNet_V3_Large_QuantizedWeights.IMAGENET1K_QNNPACK_V1.url,
}
)
......@@ -144,7 +144,7 @@ def _resnet(
quantize_model(model, backend)
if weights is not None:
model.load_state_dict(weights.get_state_dict(progress=progress))
model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))
return model
......@@ -175,6 +175,8 @@ class ResNet18_QuantizedWeights(WeightsEnum):
"acc@5": 88.882,
}
},
"_ops": 1.814,
"_file_size": 11.238,
},
)
DEFAULT = IMAGENET1K_FBGEMM_V1
......@@ -194,6 +196,8 @@ class ResNet50_QuantizedWeights(WeightsEnum):
"acc@5": 92.814,
}
},
"_ops": 4.089,
"_file_size": 24.759,
},
)
IMAGENET1K_FBGEMM_V2 = Weights(
......@@ -209,6 +213,8 @@ class ResNet50_QuantizedWeights(WeightsEnum):
"acc@5": 94.976,
}
},
"_ops": 4.089,
"_file_size": 24.953,
},
)
DEFAULT = IMAGENET1K_FBGEMM_V2
......@@ -228,6 +234,8 @@ class ResNeXt101_32X8D_QuantizedWeights(WeightsEnum):
"acc@5": 94.480,
}
},
"_ops": 16.414,
"_file_size": 86.034,
},
)
IMAGENET1K_FBGEMM_V2 = Weights(
......@@ -243,6 +251,8 @@ class ResNeXt101_32X8D_QuantizedWeights(WeightsEnum):
"acc@5": 96.132,
}
},
"_ops": 16.414,
"_file_size": 86.645,
},
)
DEFAULT = IMAGENET1K_FBGEMM_V2
......@@ -263,6 +273,8 @@ class ResNeXt101_64X4D_QuantizedWeights(WeightsEnum):
"acc@5": 96.326,
}
},
"_ops": 15.46,
"_file_size": 81.556,
},
)
DEFAULT = IMAGENET1K_FBGEMM_V1
......@@ -470,17 +482,3 @@ def resnext101_64x4d(
_ovewrite_named_param(kwargs, "groups", 64)
_ovewrite_named_param(kwargs, "width_per_group", 4)
return _resnet(QuantizableBottleneck, [3, 4, 23, 3], weights, progress, quantize, **kwargs)
# The dictionary below is internal implementation detail and will be removed in v0.15
from .._utils import _ModelURLs
from ..resnet import model_urls # noqa: F401
quant_model_urls = _ModelURLs(
{
"resnet18_fbgemm": ResNet18_QuantizedWeights.IMAGENET1K_FBGEMM_V1.url,
"resnet50_fbgemm": ResNet50_QuantizedWeights.IMAGENET1K_FBGEMM_V1.url,
"resnext101_32x8d_fbgemm": ResNeXt101_32X8D_QuantizedWeights.IMAGENET1K_FBGEMM_V1.url,
}
)