Unverified commit 7dc5e5bd, authored by Philip Meier and committed by GitHub

Fix typos and grammar errors (#7065)

* fix typos throughout the code base

* fix grammar

* revert formatting changes to gallery

* revert 'an uXX'

* remove 'number of the best'
parent ed2a0adb
@@ -78,7 +78,7 @@ class SVHN(VisionDataset):
  loaded_mat = sio.loadmat(os.path.join(self.root, self.filename))
  self.data = loaded_mat["X"]
- # loading from the .mat file gives an np array of type np.uint8
+ # loading from the .mat file gives an np.ndarray of type np.uint8
  # converting to np.int64, so that we have a LongTensor after
  # the conversion from the numpy array
  # the squeeze is needed to obtain a 1D tensor
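
The comment above refers to SVHN's label handling. A minimal, self-contained sketch of that conversion, assuming the .mat file stores labels under the key "y" as a uint8 array of shape (N, 1):

    import numpy as np
    import torch

    # stand-in for loaded_mat["y"]: np.uint8 labels of shape (N, 1) as returned by scipy.io.loadmat
    raw_labels = np.array([[1], [2], [10]], dtype=np.uint8)

    labels = torch.from_numpy(raw_labels.astype(np.int64).squeeze())
    print(labels, labels.dtype)  # tensor([ 1,  2, 10]) torch.int64, i.e. a 1D LongTensor
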
@@ -93,7 +93,7 @@ class UCF101(VisionDataset):
  output_format=output_format,
  )
  # we bookkeep the full version of video clips because we want to be able
- # to return the meta data of full version rather than the subset version of
+ # to return the metadata of full version rather than the subset version of
  # video clips
  self.full_video_clips = video_clips
  self.indices = self._select_fold(video_list, annotation_path, fold, train)
@@ -49,7 +49,7 @@ class _VideoTimestampsDataset:
  Dataset used to parallelize the reading of the timestamps
  of a list of videos, given their paths in the filesystem.
- Used in VideoClips and defined at top level so it can be
+ Used in VideoClips and defined at top level, so it can be
  pickled when forking.
  """
@@ -137,8 +137,7 @@ def _read_video_from_file(
  audio_timebase: Fraction = default_timebase,
  ) -> Tuple[torch.Tensor, torch.Tensor, VideoMetaData]:
  """
- Reads a video from a file, returning both the video frames as well as
- the audio frames
+ Reads a video from a file, returning both the video frames and the audio frames
  Args:
  filename (str): path to the video file
@@ -281,8 +280,7 @@ def _read_video_from_memory(
  audio_timebase_denominator: int = 1,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
  """
- Reads a video from memory, returning both the video frames as well as
- the audio frames
+ Reads a video from memory, returning both the video frames and the audio frames
  This function is torchscriptable.
  Args:
@@ -337,7 +335,7 @@ def _read_video_from_memory(
  if not isinstance(video_data, torch.Tensor):
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  video_data = torch.frombuffer(video_data, dtype=torch.uint8)
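
The idiom touched in this and the following hunks turns a read-only bytes buffer into a uint8 tensor while silencing torch.frombuffer's "not writable" warning. A minimal standalone sketch of the same pattern (not the library code itself):

    import warnings

    import torch

    video_data = b"\x00\x01\x02\x03"  # stand-in for an in-memory video file

    if not isinstance(video_data, torch.Tensor):
        with warnings.catch_warnings():
            # bytes objects are read-only, so torch.frombuffer warns; the tensor is never written to
            warnings.filterwarnings("ignore", message="The given buffer is not writable")
            video_data = torch.frombuffer(video_data, dtype=torch.uint8)

    print(video_data)  # tensor([0, 1, 2, 3], dtype=torch.uint8)
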
@@ -382,7 +380,7 @@ def _read_video_timestamps_from_memory(
  """
  if not isinstance(video_data, torch.Tensor):
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  video_data = torch.frombuffer(video_data, dtype=torch.uint8)
  result = torch.ops.video_reader.read_video_from_memory(
@@ -423,7 +421,7 @@ def _probe_video_from_memory(
  """
  if not isinstance(video_data, torch.Tensor):
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  video_data = torch.frombuffer(video_data, dtype=torch.uint8)
  result = torch.ops.video_reader.probe_video_from_memory(video_data)
@@ -50,7 +50,7 @@ def read_file(path: str) -> torch.Tensor:
  def write_file(filename: str, data: torch.Tensor) -> None:
  """
- Writes the contents of a uint8 tensor with one dimension to a
+ Writes the contents of an uint8 tensor with one dimension to a
  file.
  Args:
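
As a usage note for the docstring above, read_file and write_file round-trip raw file bytes through a one-dimensional uint8 tensor (file names below are placeholders):

    import torch
    from torchvision.io import read_file, write_file

    data = read_file("input.jpg")  # 1D torch.uint8 tensor holding the raw file bytes
    assert data.dtype == torch.uint8 and data.ndim == 1
    write_file("copy.jpg", data)   # writes the bytes back out unchanged
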
@@ -241,8 +241,7 @@ def read_video(
  output_format: str = "THWC",
  ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
  """
- Reads a video from a file, returning both the video frames as well as
- the audio frames
+ Reads a video from a file, returning both the video frames and the audio frames
  Args:
  filename (str): path to the video file
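
A short usage sketch for read_video as documented above (the file name is a placeholder):

    from torchvision.io import read_video

    # returns the video frames, the audio frames, and a metadata dict (e.g. frame rates)
    video, audio, info = read_video("clip.mp4", pts_unit="sec", output_format="THWC")
    print(video.shape, audio.shape, info)  # e.g. torch.Size([T, H, W, C]) ... {'video_fps': ...}
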
@@ -92,13 +92,13 @@ class VideoReader:
  Each stream descriptor consists of two parts: stream type (e.g. 'video') and
  a unique stream id (which are determined by the video encoding).
  In this way, if the video contaner contains multiple
- streams of the same type, users can acces the one they want.
+ streams of the same type, users can access the one they want.
  If only stream type is passed, the decoder auto-detects first stream of that type.
  Args:
  src (string, bytes object, or tensor): The media source.
  If string-type, it must be a file path supported by FFMPEG.
- If bytes shoud be an in memory representatin of a file supported by FFMPEG.
+ If bytes should be an in memory representatin of a file supported by FFMPEG.
  If Tensor, it is interpreted internally as byte buffer.
  It must be one-dimensional, of type ``torch.uint8``.
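
A sketch of the stream-descriptor usage described above, assuming an FFMPEG-readable file:

    from torchvision.io import VideoReader

    reader = VideoReader("clip.mp4", "video:0")  # stream type plus stream id; "video" alone picks the first video stream
    for frame in reader:
        # each item is a dict holding the decoded frame and its presentation timestamp
        print(frame["data"].shape, frame["pts"])
        break
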
@@ -145,7 +145,7 @@ class VideoReader:
  src = io.BytesIO(src)
  else:
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  src = torch.frombuffer(src, dtype=torch.uint8)
  elif isinstance(src, torch.Tensor):
@@ -280,12 +280,12 @@ class VideoReader:
  Each descriptor consists of two parts: stream type (e.g. 'video') and
  a unique stream id (which are determined by video encoding).
  In this way, if the video contaner contains multiple
- streams of the same type, users can acces the one they want.
+ streams of the same type, users can access the one they want.
  If only stream type is passed, the decoder auto-detects first stream
  of that type and returns it.
  Returns:
- (bool): True on succes, False otherwise
+ (bool): True on success, False otherwise
  """
  if self.backend == "cuda":
  warnings.warn("GPU decoding only works with video stream.")
@@ -191,7 +191,7 @@ def handle_legacy_interface(**weights: Tuple[str, Union[Optional[W], Callable[[D
  # used to be a pretrained parameter.
  pretrained_positional = weights_arg is not sentinel
  if pretrained_positional:
- # We put the pretrained argument under its legacy name in the keyword argument dictionary to have a
+ # We put the pretrained argument under its legacy name in the keyword argument dictionary to have
  # unified access to the value if the default value is a callable.
  kwargs[pretrained_param] = pretrained_arg = kwargs.pop(weights_param)
  else:
@@ -403,9 +403,9 @@ class Matcher:
  it is unmatched, then match it to the ground-truth with which it has the highest
  quality value.
  """
- # For each gt, find the prediction with which it has highest quality
+ # For each gt, find the prediction with which it has the highest quality
  highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
- # Find highest quality match available, even if it is low, including ties
+ # Find the highest quality match available, even if it is low, including ties
  gt_pred_pairs_of_highest_quality = torch.where(match_quality_matrix == highest_quality_foreach_gt[:, None])
  # Example gt_pred_pairs_of_highest_quality:
  # tensor([[ 0, 39796],
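
The two comments above describe the low-quality-match recovery step. A toy illustration of how torch.where recovers every (gt, prediction) pair that attains the per-gt maximum, ties included (values made up):

    import torch

    match_quality_matrix = torch.tensor([[0.1, 0.9, 0.9],
                                         [0.3, 0.2, 0.3]])
    highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
    gt_idx, pred_idx = torch.where(match_quality_matrix == highest_quality_foreach_gt[:, None])
    print(gt_idx.tolist(), pred_idx.tolist())  # [0, 0, 1, 1] [1, 2, 0, 2]: gt 0 ties on predictions 1 and 2
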
@@ -501,14 +501,14 @@ def _topk_min(input: Tensor, orig_kval: int, axis: int) -> int:
  if K exceeds the number of elements along that axis. Previously, python's min() function was
  used to determine whether to use the provided k-value or the specified dim axis value.
- However in cases where the model is being exported in tracing mode, python min() is
+ However, in cases where the model is being exported in tracing mode, python min() is
  static causing the model to be traced incorrectly and eventually fail at the topk node.
  In order to avoid this situation, in tracing mode, torch.min() is used instead.
  Args:
- input (Tensor): The orignal input tensor.
+ input (Tensor): The original input tensor.
  orig_kval (int): The provided k-value.
- axis(int): Axis along which we retreive the input size.
+ axis(int): Axis along which we retrieve the input size.
  Returns:
  min_kval (int): Appropriately selected k-value.
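
A minimal sketch of the clamping logic the docstring describes, using torch.min so the size comparison stays dynamic under tracing (an illustration of the idea, not the exact library code):

    import torch
    from torch import Tensor

    def topk_min_sketch(input: Tensor, orig_kval: int, axis: int) -> int:
        # torch.min keeps the comparison in the traced graph, unlike python's static min()
        return int(torch.min(torch.tensor(orig_kval), torch.tensor(input.size(axis))))

    scores = torch.rand(5)
    k = topk_min_sketch(scores, orig_kval=100, axis=0)
    values, idxs = scores.topk(k)  # k is clamped to 5, so topk does not fail
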
@@ -145,7 +145,7 @@ class DefaultBoxGenerator(nn.Module):
  of the scales of each feature map. It is used only if the ``scales`` parameter is not provided.
  scales (List[float]], optional): The scales of the default boxes. If not provided it will be estimated using
  the ``min_ratio`` and ``max_ratio`` parameters.
- steps (List[int]], optional): It's a hyper-parameter that affects the tiling of defalt boxes. If not provided
+ steps (List[int]], optional): It's a hyper-parameter that affects the tiling of default boxes. If not provided
  it will be estimated from the data.
  clip (bool): Whether the standardized values of default boxes should be clipped between 0 and 1. The clipping
  is applied while the boxes are encoded in format ``(cx, cy, w, h)``.
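
For reference, a construction sketch with the parameters documented above (the aspect ratios, scales and steps are illustrative values in the spirit of an SSD300-style configuration):

    from torchvision.models.detection.anchor_utils import DefaultBoxGenerator

    anchor_generator = DefaultBoxGenerator(
        aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]],
        scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05],  # if omitted, estimated from min_ratio/max_ratio
        steps=[8, 16, 32, 64, 100, 300],                    # if omitted, estimated from the data
        clip=True,
    )
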
@@ -102,12 +102,12 @@ def resnet_fpn_backbone(
  trainable_layers (int): number of trainable (not frozen) layers starting from final block.
  Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
  returned_layers (list of int): The layers of the network to return. Each entry must be in ``[1, 4]``.
- By default all layers are returned.
+ By default, all layers are returned.
  extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
  be performed. It is expected to take the fpn features, the original
  features and the names of the original features as input, and returns
  a new list of feature maps and their corresponding names. By
- default a ``LastLevelMaxPool`` is used.
+ default, a ``LastLevelMaxPool`` is used.
  """
  backbone = resnet.__dict__[backbone_name](weights=weights, norm_layer=norm_layer)
  return _resnet_fpn_extractor(backbone, trainable_layers, returned_layers, extra_blocks)
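
A usage sketch for the arguments documented above (weights=None keeps the example download-free; argument names as in recent torchvision releases):

    from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

    backbone = resnet_fpn_backbone(
        backbone_name="resnet50",
        weights=None,
        trainable_layers=3,          # only layer2, layer3 and layer4 stay trainable
        returned_layers=[2, 3, 4],   # skip the layer1 feature map
    )
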
@@ -121,7 +121,7 @@ def _resnet_fpn_extractor(
  norm_layer: Optional[Callable[..., nn.Module]] = None,
  ) -> BackboneWithFPN:
- # select layers that wont be frozen
+ # select layers that won't be frozen
  if trainable_layers < 0 or trainable_layers > 5:
  raise ValueError(f"Trainable layers should be in the range [0,5], got {trainable_layers}")
  layers_to_train = ["layer4", "layer3", "layer2", "layer1", "conv1"][:trainable_layers]
@@ -208,7 +208,7 @@ def _mobilenet_extractor(
  stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
  num_stages = len(stage_indices)
- # find the index of the layer from which we wont freeze
+ # find the index of the layer from which we won't freeze
  if trainable_layers < 0 or trainable_layers > num_stages:
  raise ValueError(f"Trainable layers should be in the range [0,{num_stages}], got {trainable_layers} ")
  freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
@@ -47,9 +47,9 @@ class FasterRCNN(GeneralizedRCNN):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
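
For the training-mode contract described in this and the later detection-model docstrings, a minimal sketch of one image's target dictionary (values are made up; these models also expect a labels entry alongside boxes):

    import torch

    targets = [
        {
            "boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),  # FloatTensor[N, 4] in [x1, y1, x2, y2]
            "labels": torch.tensor([3]),                          # Int64Tensor[N], one class index per box
        }
    ]
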
@@ -68,7 +68,7 @@ class FasterRCNN(GeneralizedRCNN):
  Args:
  backbone (nn.Module): the network used to compute the features for the model.
- It should contain a out_channels attribute, which indicates the number of output
+ It should contain an out_channels attribute, which indicates the number of output
  channels that each feature map has (and it should be the same for all feature maps).
  The backbone should return a single Tensor or and OrderedDict[Tensor].
  num_classes (int): number of output classes of the model (including the background).
@@ -128,7 +128,7 @@ class FasterRCNN(GeneralizedRCNN):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # FasterRCNN needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -483,9 +483,9 @@ def fasterrcnn_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and a targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -714,7 +714,7 @@ def fasterrcnn_mobilenet_v3_large_320_fpn(
  **kwargs: Any,
  ) -> FasterRCNN:
  """
- Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tunned for mobile use cases.
+ Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tuned for mobile use cases.
  .. betastatus:: detection module
@@ -70,7 +70,7 @@ class FCOSHead(nn.Module):
  else:
  gt_classes_targets = targets_per_image["labels"][matched_idxs_per_image.clip(min=0)]
  gt_boxes_targets = targets_per_image["boxes"][matched_idxs_per_image.clip(min=0)]
- gt_classes_targets[matched_idxs_per_image < 0] = -1  # backgroud
+ gt_classes_targets[matched_idxs_per_image < 0] = -1  # background
  all_gt_classes_targets.append(gt_classes_targets)
  all_gt_boxes_targets.append(gt_boxes_targets)
@@ -274,9 +274,9 @@ class FCOS(nn.Module):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -329,7 +329,7 @@ class FCOS(nn.Module):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # FCOS needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -695,9 +695,9 @@ def fcos_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -29,9 +29,9 @@ class KeypointRCNN(FasterRCNN):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -55,7 +55,7 @@ class KeypointRCNN(FasterRCNN):
  Args:
  backbone (nn.Module): the network used to compute the features for the model.
- It should contain a out_channels attribute, which indicates the number of output
+ It should contain an out_channels attribute, which indicates the number of output
  channels that each feature map has (and it should be the same for all feature maps).
  The backbone should return a single Tensor or and OrderedDict[Tensor].
  num_classes (int): number of output classes of the model (including the background).
@@ -121,7 +121,7 @@ class KeypointRCNN(FasterRCNN):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # KeypointRCNN needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -387,9 +387,9 @@ def keypointrcnn_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -31,9 +31,9 @@ class MaskRCNN(FasterRCNN):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -56,7 +56,7 @@ class MaskRCNN(FasterRCNN):
  Args:
  backbone (nn.Module): the network used to compute the features for the model.
- It should contain a out_channels attribute, which indicates the number of output
+ It should contain an out_channels attribute, which indicates the number of output
  channels that each feature map has (and it should be the same for all feature maps).
  The backbone should return a single Tensor or and OrderedDict[Tensor].
  num_classes (int): number of output classes of the model (including the background).
@@ -123,7 +123,7 @@ class MaskRCNN(FasterRCNN):
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # MaskRCNN needs to know the number of
  >>> # output channels in a backbone. For mobilenet_v2, it's 1280
- >>> # so we need to add it here
+ >>> # so we need to add it here,
  >>> backbone.out_channels = 1280
  >>>
  >>> # let's make the RPN generate 5 x 3 anchors per spatial
@@ -422,9 +422,9 @@ def maskrcnn_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -327,9 +327,9 @@ class RetinaNet(nn.Module):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -382,7 +382,7 @@ class RetinaNet(nn.Module):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # RetinaNet needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -743,9 +743,9 @@ def retinanet_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -315,7 +315,7 @@ def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched
  valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
  valid = torch.where(valid)[0]
- # torch.mean (in binary_cross_entropy_with_logits) does'nt
+ # torch.mean (in binary_cross_entropy_with_logits) doesn't
  # accept empty tensors, so handle it sepaartely
  if keypoint_targets.numel() == 0 or len(valid) == 0:
  return keypoint_logits.sum() * 0
@@ -128,12 +128,12 @@ class SSD(nn.Module):
  Implements SSD architecture from `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
- image, and should be in 0-1 range. Different images can have different sizes but they will be resized
+ image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
  to a fixed size before passing it to the backbone.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -556,7 +556,7 @@ def _vgg_extractor(backbone: VGG, highres: bool, trainable_layers: int):
  stage_indices = [0] + [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)][:-1]
  num_stages = len(stage_indices)
- # find the index of the layer from which we wont freeze
+ # find the index of the layer from which we won't freeze
  torch._assert(
  0 <= trainable_layers <= num_stages,
  f"trainable_layers should be in the range [0, {num_stages}]. Instead got {trainable_layers}",
@@ -590,12 +590,12 @@ def ssd300_vgg16(
  .. betastatus:: detection module
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
- image, and should be in 0-1 range. Different images can have different sizes but they will be resized
+ image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
  to a fixed size before passing it to the backbone.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -172,7 +172,7 @@ def _mobilenet_extractor(
  stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
  num_stages = len(stage_indices)
- # find the index of the layer from which we wont freeze
+ # find the index of the layer from which we won't freeze
  if not 0 <= trainable_layers <= num_stages:
  raise ValueError("trainable_layers should be in the range [0, {num_stages}], instead got {trainable_layers}")
  freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
@@ -76,7 +76,7 @@ class GeneralizedRCNNTransform(nn.Module):
  Performs input / target transformation before feeding the data to a GeneralizedRCNN
  model.
- The transformations it perform are:
+ The transformations it performs are:
  - input normalization (mean subtraction and std division)
  - input / target resizing to match min_size / max_size
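
A construction sketch of the transform described above (the mean/std values are the usual ImageNet statistics, the sizes mirror common detection defaults, and the constructor signature is assumed from recent torchvision releases):

    from torchvision.models.detection.transform import GeneralizedRCNNTransform

    transform = GeneralizedRCNNTransform(
        min_size=800,
        max_size=1333,
        image_mean=[0.485, 0.456, 0.406],  # mean subtraction
        image_std=[0.229, 0.224, 0.225],   # std division
    )
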
@@ -158,7 +158,7 @@ class GeneralizedRCNNTransform(nn.Module):
  def torch_choice(self, k: List[int]) -> int:
  """
- Implements `random.choice` via torch ops so it can be compiled with
+ Implements `random.choice` via torch ops, so it can be compiled with
  TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
  is fixed.
  """
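
A standalone sketch of the torch_choice idea: drawing a random index with torch ops only, so the method stays TorchScript-compatible (an illustration of the technique, not necessarily the exact library implementation):

    from typing import List

    import torch

    def torch_choice(k: List[int]) -> int:
        # uniform draw over [0, len(k)) using torch ops instead of random.choice
        index = int(torch.empty(1).uniform_(0.0, float(len(k))).item())
        return k[index]

    print(torch_choice([480, 512, 544]))  # prints one of the candidate sizes
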