Unverified Commit 7dc5e5bd authored by Philip Meier, committed by GitHub

Fix typos and grammar errors (#7065)

* fix typos throughout the code base

* fix grammar

* revert formatting changes to gallery

* revert 'an uXX'

* remove 'number of the best'
parent ed2a0adb
......@@ -18,7 +18,7 @@ __all__ = ["create_feature_extractor", "get_graph_node_names"]
class LeafModuleAwareTracer(fx.Tracer):
"""
An fx.Tracer that allows the user to specify a set of leaf modules, ie.
An fx.Tracer that allows the user to specify a set of leaf modules, i.e.
modules that are not to be traced through. The resulting graph ends up
having single nodes referencing calls to the leaf modules' forward methods.
"""
......@@ -103,7 +103,7 @@ class NodePathTracer(LeafModuleAwareTracer):
if node.op != "call_module":
# In this case module_qualname from torch.fx doesn't go all the
# way to the leaf function/op so we need to append it
# way to the leaf function/op, so we need to append it
if len(node_qualname) > 0:
# Only append '.' if we are deeper than the top level module
node_qualname += "."
......@@ -136,7 +136,7 @@ class NodePathTracer(LeafModuleAwareTracer):
def _is_subseq(x, y):
"""Check if y is a subseqence of x
"""Check if y is a subsequence of x
https://stackoverflow.com/a/24017747/4391249
"""
iter_x = iter(x)
......@@ -228,7 +228,7 @@ def get_graph_node_names(
tracer_kwargs (dict, optional): a dictionary of keyword arguments for
``NodePathTracer`` (they are eventually passed onto
`torch.fx.Tracer <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer>`_).
By default it will be set to wrap and make leaf nodes all torchvision ops:
By default, it will be set to wrap and make leaf nodes all torchvision ops:
{"autowrap_modules": (math, torchvision.ops,),"leaf_modules": _get_leaf_modules_for_ops(),}
WARNING: In case the user provides tracer_kwargs, above default arguments will be appended to the user
provided dictionary.
......@@ -391,7 +391,7 @@ def create_feature_extractor(
tracer_kwargs (dict, optional): a dictionary of keyword arguments for
``NodePathTracer`` (which passes them onto it's parent class
`torch.fx.Tracer <https://pytorch.org/docs/stable/fx.html#torch.fx.Tracer>`_).
By default it will be set to wrap and make leaf nodes all torchvision ops:
By default, it will be set to wrap and make leaf nodes all torchvision ops:
{"autowrap_modules": (math, torchvision.ops,),"leaf_modules": _get_leaf_modules_for_ops(),}
WARNING: In case the user provides tracer_kwargs, above default arguments will be appended to the user
provided dictionary.
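(Illustrative aside, not part of the diff.) A minimal usage sketch of `get_graph_node_names` / `create_feature_extractor`, whose `tracer_kwargs` docstrings are touched above; it assumes a standard torchvision install, and the chosen node names and the extra tracer keyword are only examples:

```python
import torch
import torchvision
from torchvision.models.feature_extraction import create_feature_extractor, get_graph_node_names

model = torchvision.models.resnet50(weights=None)
# List the node names that can be requested (train and eval graphs may differ)
train_nodes, eval_nodes = get_graph_node_names(model)

# Extract intermediate features; user-provided tracer_kwargs are merged with the defaults described above
extractor = create_feature_extractor(
    model,
    return_nodes={"layer2": "feat2", "layer4": "feat4"},
    tracer_kwargs={"autowrap_functions": ()},  # illustrative extra torch.fx.Tracer option
)
features = extractor(torch.rand(1, 3, 224, 224))
print({name: tensor.shape for name, tensor in features.items()})
```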
......@@ -544,7 +544,7 @@ def create_feature_extractor(
graph_module.graph.eliminate_dead_code()
graph_module.recompile()
# Keep track of the tracer and graph so we can choose the main one
# Keep track of the tracer and graph, so we can choose the main one
tracers[mode] = tracer
graphs[mode] = graph
......
......@@ -48,7 +48,7 @@ class Inception3(nn.Module):
)
init_weights = True
if len(inception_blocks) != 7:
raise ValueError(f"lenght of inception_blocks should be 7 instead of {len(inception_blocks)}")
raise ValueError(f"length of inception_blocks should be 7 instead of {len(inception_blocks)}")
conv_block = inception_blocks[0]
inception_a = inception_blocks[1]
inception_b = inception_blocks[2]
......
......@@ -301,7 +301,7 @@ class PartitionAttentionLayer(nn.Module):
self,
in_channels: int,
head_dim: int,
# partitioning parameteres
# partitioning parameters
partition_size: int,
partition_type: str,
# grid size needs to be known at initialization time
......
......@@ -88,7 +88,7 @@ def _round_to_multiple_of(val: float, divisor: int, round_up_bias: float = 0.9)
def _get_depths(alpha: float) -> List[int]:
"""Scales tensor depths as in reference MobileNet code, prefers rouding up
"""Scales tensor depths as in reference MobileNet code, prefers rounding up
rather than down."""
depths = [32, 16, 24, 40, 80, 96, 192, 320]
return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]
......
......@@ -23,7 +23,7 @@ class InvertedResidual(nn.Module):
super().__init__()
self.stride = stride
if stride not in [1, 2]:
raise ValueError(f"stride should be 1 or 2 insted of {stride}")
raise ValueError(f"stride should be 1 or 2 instead of {stride}")
if norm_layer is None:
norm_layer = nn.BatchNorm2d
......
......@@ -35,7 +35,7 @@ class ResidualBlock(nn.Module):
# But in the RAFT training reference, the BatchNorm2d layers are only activated for the first dataset,
# and frozen for the rest of the training process (i.e. set as eval()). The bias term is thus still useful
# for the rest of the datasets. Technically, we could remove the bias for other norm layers like Instance norm
# because these aren't frozen, but we don't bother (also, we woudn't be able to load the original weights).
# because these aren't frozen, but we don't bother (also, we wouldn't be able to load the original weights).
self.convnormrelu1 = Conv2dNormActivation(
in_channels, out_channels, norm_layer=norm_layer, kernel_size=3, stride=stride, bias=True
)
......@@ -318,7 +318,7 @@ class MaskPredictor(nn.Module):
def __init__(self, *, in_channels, hidden_size, multiplier=0.25):
super().__init__()
self.convrelu = Conv2dNormActivation(in_channels, hidden_size, norm_layer=None, kernel_size=3)
# 8 * 8 * 9 because the predicted flow is downsampled by 8, from the downsampling of the initial FeatureEncoder
# 8 * 8 * 9 because the predicted flow is downsampled by 8, from the downsampling of the initial FeatureEncoder,
# and we interpolate with all 9 surrounding neighbors. See paper and appendix B.
self.conv = nn.Conv2d(hidden_size, 8 * 8 * 9, 1, padding=0)
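(Illustrative aside, not part of the diff.) The 8 * 8 * 9 = 576 channels feed a convex-upsampling step: each of the 8x8 fine positions per coarse pixel gets softmax weights over its 9 coarse neighbors. A sketch of that step following the RAFT paper's appendix; the function and names are illustrative, not the torchvision module's code:

```python
import torch
import torch.nn.functional as F

def convex_upsample_flow(flow: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """Sketch of convex upsampling: flow (N, 2, H, W), mask (N, 8*8*9, H, W)."""
    N, _, H, W = flow.shape
    mask = mask.view(N, 1, 9, 8, 8, H, W)
    mask = torch.softmax(mask, dim=2)                          # convex weights over the 9 neighbors
    neighbors = F.unfold(8 * flow, kernel_size=3, padding=1)   # displacements are scaled by 8 when upsampling
    neighbors = neighbors.view(N, 2, 9, 1, 1, H, W)
    up = torch.sum(mask * neighbors, dim=2)                    # (N, 2, 8, 8, H, W)
    up = up.permute(0, 1, 4, 2, 5, 3)                          # (N, 2, H, 8, W, 8)
    return up.reshape(N, 2, 8 * H, 8 * W)
```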
......@@ -430,7 +430,7 @@ class RAFT(nn.Module):
Its input is ``image1``. As in the original implementation, its output will be split into 2 parts:
- one part will be used as the actual "context", passed to the recurrent unit of the ``update_block``
- one part will be used to initialize the hidden state of the of the recurrent unit of
- one part will be used to initialize the hidden state of the recurrent unit of
the ``update_block``
These 2 parts are split according to the ``hidden_state_size`` of the ``update_block``, so the output
......@@ -474,7 +474,7 @@ class RAFT(nn.Module):
if (h, w) != image2.shape[-2:]:
raise ValueError(f"input images should have the same shape, instead got ({h}, {w}) != {image2.shape[-2:]}")
if not (h % 8 == 0) and (w % 8 == 0):
raise ValueError(f"input image H and W should be divisible by 8, insted got {h} (h) and {w} (w)")
raise ValueError(f"input image H and W should be divisible by 8, instead got {h} (h) and {w} (w)")
fmaps = self.feature_encoder(torch.cat([image1, image2], dim=0))
fmap1, fmap2 = torch.chunk(fmaps, chunks=2, dim=0)
......
......@@ -212,7 +212,7 @@ class BlockParams:
**kwargs: Any,
) -> "BlockParams":
"""
Programatically compute all the per-block settings,
Programmatically compute all the per-block settings,
given the RegNet parameters.
The first step is to compute the quantized linear block parameters,
......
......@@ -108,7 +108,7 @@ class BasicBlock(nn.Module):
class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
......
......@@ -793,7 +793,7 @@ def interpolate_embeddings(
interpolation_mode: str = "bicubic",
reset_heads: bool = False,
) -> "OrderedDict[str, torch.Tensor]":
"""This function helps interpolating positional embeddings during checkpoint loading,
"""This function helps interpolate positional embeddings during checkpoint loading,
especially when you want to apply a pre-trained model on images with different resolution.
Args:
......@@ -818,7 +818,7 @@ def interpolate_embeddings(
# We do this by reshaping the positions embeddings to a 2d grid, performing
# an interpolation in the (h, w) space and then reshaping back to a 1d grid.
if new_seq_length != seq_length:
# The class token embedding shouldn't be interpolated so we split it up.
# The class token embedding shouldn't be interpolated, so we split it up.
seq_length -= 1
new_seq_length -= 1
pos_embedding_token = pos_embedding[:, :1, :]
......
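(Illustrative aside, not part of the diff.) A minimal sketch of the interpolation step described above, assuming a square patch grid with the class token stored first; function and argument names are illustrative, not the torchvision API:

```python
import torch
import torch.nn.functional as F

def interpolate_pos_embedding(pos_embedding: torch.Tensor, new_grid: int) -> torch.Tensor:
    """pos_embedding: (1, 1 + old_grid**2, hidden_dim), class token first."""
    cls_token, grid_tokens = pos_embedding[:, :1], pos_embedding[:, 1:]
    old_grid = int(grid_tokens.shape[1] ** 0.5)
    hidden_dim = grid_tokens.shape[-1]
    # reshape the flat positions to a 2d grid, interpolate in (h, w), then flatten back
    grid = grid_tokens.reshape(1, old_grid, old_grid, hidden_dim).permute(0, 3, 1, 2)
    grid = F.interpolate(grid, size=(new_grid, new_grid), mode="bicubic", align_corners=False)
    grid = grid.permute(0, 2, 3, 1).reshape(1, new_grid * new_grid, hidden_dim)
    return torch.cat([cls_token, grid], dim=1)                 # (1, 1 + new_grid**2, hidden_dim)
```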
......@@ -50,7 +50,7 @@ def _box_xyxy_to_cxcywh(boxes: Tensor) -> Tensor:
def _box_xywh_to_xyxy(boxes: Tensor) -> Tensor:
"""
Converts bounding boxes from (x, y, w, h) format to (x1, y1, x2, y2) format.
(x, y) refers to top left of bouding box.
(x, y) refers to top left of bounding box.
(w, h) refers to width and height of box.
Args:
boxes (Tensor[N, 4]): boxes in (x, y, w, h) which will be converted.
......
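(Illustrative aside, not part of the diff.) The conversion described above is simply x2 = x + w and y2 = y + h; the public helper `torchvision.ops.box_convert` exposes the same conversion:

```python
import torch
from torchvision.ops import box_convert

boxes_xywh = torch.tensor([[5.0, 5.0, 10.0, 20.0]])             # (x, y, w, h), top-left anchored
boxes_xyxy = box_convert(boxes_xywh, in_fmt="xywh", out_fmt="xyxy")
print(boxes_xyxy)  # tensor([[ 5.,  5., 15., 25.]]) since x2 = x + w and y2 = y + h
```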
......@@ -36,7 +36,7 @@ def distance_box_iou_loss(
Tensor: Loss tensor with the reduction option applied.
Reference:
Zhaohui Zheng et. al: Distance Intersection over Union Loss:
Zhaohui Zheng et al.: Distance Intersection over Union Loss:
https://arxiv.org/abs/1911.08287
"""
......
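(Illustrative aside, not part of the diff.) Per the cited paper, the distance-IoU loss is 1 - IoU plus the squared distance between box centers, normalized by the squared diagonal of the smallest enclosing box. A minimal call of the public op, assuming torchvision >= 0.13 where it is available:

```python
import torch
from torchvision.ops import distance_box_iou_loss

boxes1 = torch.tensor([[0.0, 0.0, 10.0, 10.0]])   # (x1, y1, x2, y2)
boxes2 = torch.tensor([[2.0, 2.0, 12.0, 12.0]])
print(distance_box_iou_loss(boxes1, boxes2, reduction="mean"))
```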
......@@ -178,7 +178,7 @@ class FeaturePyramidNetwork(nn.Module):
Returns:
results (OrderedDict[Tensor]): feature maps after FPN layers.
They are ordered from highest resolution first.
They are ordered from the highest resolution first.
"""
# unpack OrderedDict into two lists for easier handling
names = list(x.keys())
......
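(Illustrative aside, not part of the diff.) A minimal sketch of the module documented above; the output keeps the input keys and their ordering, with the highest-resolution map first:

```python
import torch
from collections import OrderedDict
from torchvision.ops import FeaturePyramidNetwork

fpn = FeaturePyramidNetwork(in_channels_list=[16, 32, 64], out_channels=8)
x = OrderedDict()
x["feat0"] = torch.rand(1, 16, 64, 64)   # highest resolution
x["feat1"] = torch.rand(1, 32, 32, 32)
x["feat2"] = torch.rand(1, 64, 16, 16)
outputs = fpn(x)
print([(name, tensor.shape) for name, tensor in outputs.items()])
# [('feat0', torch.Size([1, 8, 64, 64])), ('feat1', torch.Size([1, 8, 32, 32])), ('feat2', torch.Size([1, 8, 16, 16]))]
```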
......@@ -33,7 +33,7 @@ def generalized_box_iou_loss(
Tensor: Loss tensor with the reduction option applied.
Reference:
Hamid Rezatofighi et. al: Generalized Intersection over Union:
Hamid Rezatofighi et al.: Generalized Intersection over Union:
A Metric and A Loss for Bounding Box Regression:
https://arxiv.org/abs/1902.09630
"""
......
......@@ -131,10 +131,10 @@ class Conv2dNormActivation(ConvNormActivation):
out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
kernel_size: (int, optional): Size of the convolving kernel. Default: 3
stride (int, optional): Stride of the convolution. Default: 1
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation``
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm2d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm2d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
dilation (int): Spacing between kernel elements. Default: 1
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
......@@ -181,10 +181,10 @@ class Conv3dNormActivation(ConvNormActivation):
out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
kernel_size: (int, optional): Size of the convolving kernel. Default: 3
stride (int, optional): Stride of the convolution. Default: 1
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation``
padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm3d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm3d``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
dilation (int): Spacing between kernel elements. Default: 1
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
......@@ -266,8 +266,8 @@ class MLP(torch.nn.Sequential):
Args:
in_channels (int): Number of channels of the input
hidden_channels (List[int]): List of the hidden channel dimensions
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the linear layer. If ``None`` this layer wont be used. Default: ``None``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the linear layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the linear layer. If ``None`` this layer won't be used. Default: ``None``
activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the linear layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
bias (bool): Whether to use bias in the linear layer. Default ``True``
dropout (float): The probability for the dropout layer. Default: 0.0
......
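(Illustrative aside, not part of the diff.) A minimal sketch of the blocks documented above, assuming torchvision >= 0.14 where they are exposed under `torchvision.ops`. With the default `padding = (kernel_size - 1) // 2 * dilation`, spatial size is preserved for odd kernels at stride 1:

```python
import torch
from torchvision.ops import MLP, Conv2dNormActivation

conv_block = Conv2dNormActivation(in_channels=3, out_channels=16, kernel_size=3)  # default padding = 1
print(conv_block(torch.rand(1, 3, 32, 32)).shape)  # torch.Size([1, 16, 32, 32])

mlp = MLP(in_channels=16, hidden_channels=[32, 8], dropout=0.0)
print(mlp(torch.rand(4, 16)).shape)                # torch.Size([4, 8])
```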
......@@ -160,8 +160,8 @@ def _multiscale_roi_align(
reference. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
output_size (Union[List[Tuple[int, int]], List[int]]): size of the output
sampling_ratio (int): sampling ratio for ROIAlign
scales (Optional[List[float]]): If None, scales will be automatically infered. Default value is None.
mapper (Optional[LevelMapper]): If none, mapper will be automatically infered. Default value is None.
scales (Optional[List[float]]): If None, scales will be automatically inferred. Default value is None.
mapper (Optional[LevelMapper]): If none, mapper will be automatically inferred. Default value is None.
Returns:
result (Tensor)
"""
......
......@@ -54,7 +54,7 @@ def get_correlation(
) -> Tensor:
"""Function that computes a correlation product between the left and right features.
The correlation is computed in a sliding window fashion, namely the the left features are fixed
The correlation is computed in a sliding window fashion, namely the left features are fixed
and for each ``(i, j)`` location we compute the correlation with a sliding window anchored in
``(i, j)`` from the right feature map. The sliding window selects pixels obtained in the range of the sliding
window; i.e ``(i - window_size // 2, i + window_size // 2)`` respectively ``(j - window_size // 2, j + window_size // 2)``.
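(Illustrative aside, not part of the diff.) A self-contained sketch of the sliding-window correlation the docstring describes, assuming an odd-sized window; it mirrors the broadcast-and-mean-over-channels scheme mentioned in the comments of this function:

```python
import torch
import torch.nn.functional as F

def windowed_correlation(left: torch.Tensor, right: torch.Tensor, window=(3, 3)) -> torch.Tensor:
    """left, right: (B, C, H, W); returns (B, window_h * window_w, H, W)."""
    B, C, H, W = left.shape
    ph, pw = window[0] // 2, window[1] // 2
    right_padded = F.pad(right, (pw, pw, ph, ph))
    views = [
        right_padded[:, :, dy : dy + H, dx : dx + W]
        for dy in range(window[0])
        for dx in range(window[1])
    ]
    right_views = torch.stack(views, dim=1)                  # (B, n_views, C, H, W)
    return (left.unsqueeze(1) * right_views).mean(dim=2)     # mean over the channel dimension
```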
......@@ -76,7 +76,7 @@ def get_correlation(
# we expand the left features for broadcasting
left_feature = left_feature.unsqueeze(1)
# this will compute an element product of between [B, 1, C, H, W] * [B, n_views, C, H, W]
# to obtain correlations over the pixel canditates we perform a mean on the C dimension
# to obtain correlations over the pixel candidates we perform a mean on the C dimension
correlation = torch.mean(left_feature * right_padded, dim=2, keepdim=False)
# the final correlation tensor shape will be [B, n_views, H, W]
# where on the i-th position of the n_views dimension we will have
......@@ -138,7 +138,7 @@ class IterativeCorrelationLayer(nn.Module):
self.search_pixels = np.prod(search_window_1d)
self.groups = groups
# two selection tables for dealing withh the small_patch argument in the forward function
# two selection tables for dealing with the small_patch argument in the forward function
self.patch_sizes = {
"2d": [search_window_2d for _ in range(self.groups)],
"1d": [search_window_1d for _ in range(self.groups)],
......@@ -167,7 +167,7 @@ class IterativeCorrelationLayer(nn.Module):
dilate_size_list = self.dilate_sizes[window_type]
# chunking the left and right feature to perform group-wise correlation
# mechanism simillar to GroupNorm. See section 3.1 ``Group-wise correlation``.
# mechanism similar to GroupNorm. See section 3.1 ``Group-wise correlation``.
left_groups = torch.chunk(left_feature, self.groups, dim=1)
right_groups = torch.chunk(right_feature, self.groups, dim=1)
......@@ -202,7 +202,7 @@ class AttentionOffsetCorrelationLayer(nn.Module):
self.search_pixels = int(np.prod(search_window_1d))
self.groups = groups
# two selection tables for dealing withh the small_patch argument in the forward function
# two selection tables for dealing with the small_patch argument in the forward function
self.patch_sizes = {
"2d": [search_window_2d for _ in range(self.groups)],
"1d": [search_window_1d for _ in range(self.groups)],
......@@ -234,7 +234,7 @@ class AttentionOffsetCorrelationLayer(nn.Module):
# prepare for transformer required input shapes
left_feature = left_feature.permute(0, 2, 3, 1).reshape(B, H * W, C)
right_feature = right_feature.permute(0, 2, 3, 1).reshape(B, H * W, C)
# this can be either self attention or cross attention, hence the tupple return
# this can be either self attention or cross attention, hence the tuple return
left_feature, right_feature = self.attention_module(left_feature, right_feature)
left_feature = left_feature.reshape(B, H, W, C).permute(0, 3, 1, 2)
right_feature = right_feature.reshape(B, H, W, C).permute(0, 3, 1, 2)
......@@ -272,7 +272,7 @@ class AttentionOffsetCorrelationLayer(nn.Module):
for d in (0, 2, 3):
offsets = offsets.unsqueeze(d)
# extra offsets for search (i.e. deformed search indexes. Simillar concept to deformable convolutions)
# extra offsets for search (i.e. deformed search indexes. Similar concept to deformable convolutions)
offsets = offsets + extra_offset
coords = (
......@@ -344,7 +344,7 @@ def elu_feature_map(x: Tensor) -> Tensor:
class LinearAttention(nn.Module):
"""
Linear attention operation from: https://arxiv.org/pdf/2006.16236.pdf
Cannonical implementation reference: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py
Canonical implementation reference: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py
LoFTR implementation reference: https://github.com/zju3dv/LoFTR/blob/2122156015b61fbb650e28b58a958e4d632b1058/src/loftr/loftr_module/linear_attention.py
"""
......@@ -437,7 +437,7 @@ class SoftmaxAttention(nn.Module):
class PositionalEncodingSine(nn.Module):
"""
Sinusoidal positonal encodings
Sinusoidal positional encodings
Using the scaling term from https://github.com/megvii-research/CREStereo/blob/master/nets/attention/position_encoding.py
Reference implementation from https://github.com/facebookresearch/detr/blob/8a144f83a287f4d3fece4acdf073f387c5af387d/models/position_encoding.py#L28-L48
......@@ -484,7 +484,7 @@ class PositionalEncodingSine(nn.Module):
class LocalFeatureEncoderLayer(nn.Module):
"""
LoFTR transformer module from: https://arxiv.org/pdf/2104.00680.pdf
Cannonical implementations at: https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
Canonical implementations at: https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
"""
def __init__(
......@@ -556,7 +556,7 @@ class LocalFeatureEncoderLayer(nn.Module):
class LocalFeatureTransformer(nn.Module):
"""
LoFTR transformer module from: https://arxiv.org/pdf/2104.00680.pdf
Cannonical implementations at: https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
Canonical implementations at: https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
"""
def __init__(
......@@ -652,9 +652,9 @@ class CREStereo(nn.Module):
feature_downsample_rates (List[int]): The downsample rates used to build a feature pyramid from the outputs of the `feature_encoder`. Default: [2, 4]
correlation_groups (int): In how many groups should the features be split when computer per-pixel correlation. Defaults 4.
search_window_1d (Tuple[int, int]): The alternate search window size in the x and y directions for the 1D case. Defaults to (1, 9).
search_dilate_1d (Tuple[int, int]): The dilation used in the `search_window_1d` when selecting pixels. Simillar to `nn.Conv2d` dilate. Defaults to (1, 1).
search_dilate_1d (Tuple[int, int]): The dilation used in the `search_window_1d` when selecting pixels. Similar to `nn.Conv2d` dilate. Defaults to (1, 1).
search_window_2d (Tuple[int, int]): The alternate search window size in the x and y directions for the 2D case. Defaults to (3, 3).
search_dilate_2d (Tuple[int, int]): The dilation used in the `search_window_2d` when selecting pixels. Simillar to `nn.Conv2d` dilate. Defaults to (1, 1).
search_dilate_2d (Tuple[int, int]): The dilation used in the `search_window_2d` when selecting pixels. Similar to `nn.Conv2d` dilate. Defaults to (1, 1).
"""
def __init__(
......@@ -699,7 +699,7 @@ class CREStereo(nn.Module):
multiplier=0.25,
)
# offsets modules for offseted feature selection
# offsets modules for offsetted feature selection
self.offset_convs = nn.ModuleDict()
self.correlation_layers = nn.ModuleDict()
......@@ -715,7 +715,7 @@ class CREStereo(nn.Module):
# useful for iterating through torch.jit.script module given the network forward pass
#
# Ignore the largest resolution. We handle that separately due to torch.jit.script
# not being to able access to runtime generated keys in ModuleDicts.
# not being able to access to runtime generated keys in ModuleDicts.
# This way, we can keep a generic way of processing all pyramid levels but except
# the final one
iterative_correlation_layer = partial(
......@@ -814,7 +814,7 @@ class CREStereo(nn.Module):
flow_estimates: Dict[str, Tensor] = {}
# we added this because of torch.script.jit
# also, the predicition prior is always going to have the
# spatial size of the features outputed by the feature encoder
# spatial size of the features outputted by the feature encoder
flow_pred_prior: Tensor = torch.empty(
size=(B, 2, left_features.shape[2], left_features.shape[3]),
dtype=l_pyramid[max_res].dtype,
......@@ -860,9 +860,9 @@ class CREStereo(nn.Module):
# compute the scale difference between the first pyramid scale and the current pyramid scale
scale_to_base = l_pyramid[fine_grained_resolution].shape[2] // l_pyramid[resolution].shape[2]
for it in range(num_iters // 2):
# set wether or not we want to search on (X, Y) axes for correlation or just on X axis
# set whether we want to search on (X, Y) axes for correlation or just on X axis
window_type = self._get_window_type(it)
# we consider this a prior, therefor we do not want to back-propagate through it
# we consider this a prior, therefore we do not want to back-propagate through it
flow_estimates[resolution] = flow_estimates[resolution].detach()
correlations = correlation_layer(
......@@ -918,8 +918,8 @@ class CREStereo(nn.Module):
# this coincides with the maximum resolution
# we keep a separate loop here in order to avoid python control flow
# to decide how much iterations should we do based on the current resolution
# further more, if provided with an inital flow, there is no need to generate
# to decide how many iterations should we do based on the current resolution
# furthermore, if provided with an initial flow, there is no need to generate
# a prior estimate when moving into the final refinement stage
for it in range(num_iters):
......@@ -1095,7 +1095,7 @@ class CREStereo_Base_Weights(WeightsEnum):
"_detailed": {
# 1 is the number of cascades
1: {
# 2 is number of refininement interations
# 2 is number of refininement iterations
2: {
"mae": 1.704,
"rmse": 3.738,
......@@ -1307,7 +1307,7 @@ class CREStereo_Base_Weights(WeightsEnum):
"_detailed": {
# 1 is the number of cascades
1: {
# 2 is number of refininement interations
# 2 is number of refininement iterations
2: {
"mae": 1.85,
"rmse": 3.797,
......
......@@ -53,7 +53,7 @@ class BaseEncoder(raft.FeatureEncoder):
class FeatureEncoder(nn.Module):
"""Feature Encoder for Raft-Stereo (see paper section 3.1) that may have shared weight with the Context Encoder.
The FeatureEncoder takes concatination of left and right image as input, it produce feature embedding that later
The FeatureEncoder takes concatenation of left and right image as input. It produces feature embedding that later
will be used to construct correlation volume.
"""
......@@ -89,7 +89,7 @@ class FeatureEncoder(nn.Module):
class MultiLevelContextEncoder(nn.Module):
"""Context Encoder for Raft-Stereo (see paper section 3.1) that may have shared weight with the Feature Encoder.
The ContextEncoder takes left image as input and it outputs concatenated hidden_states and contexts.
The ContextEncoder takes left image as input, and it outputs concatenated hidden_states and contexts.
In Raft-Stereo we have multi level GRUs and this context encoder will also multi outputs (list of Tensor)
that correspond to each GRUs.
Take note that the length of "out_with_blocks" parameter represent the number of GRU's level.
......@@ -180,7 +180,7 @@ class MultiLevelUpdateBlock(nn.Module):
# The GRU input size is the size of previous level hidden_dim plus next level hidden_dim
# if this is the first gru, then we replace previous level with motion_encoder output channels
# for the last GRU, we dont add the next level hidden_dim
# for the last GRU, we don't add the next level hidden_dim
gru_input_dims = []
for i in range(len(hidden_dims)):
input_dim = hidden_dims[i - 1] if i > 0 else motion_encoder.out_channels
......@@ -191,8 +191,8 @@ class MultiLevelUpdateBlock(nn.Module):
self.grus = nn.ModuleList(
[
ConvGRU(input_size=gru_input_dims[i], hidden_size=hidden_dims[i], kernel_size=3, padding=1)
# Ideally we should reverse the direction during forward to use the gru with smallest resolution first
# however currently there is no way to reverse a ModuleList that is jit script compatible
# Ideally we should reverse the direction during forward to use the gru with the smallest resolution
# first however currently there is no way to reverse a ModuleList that is jit script compatible
# hence we reverse the ordering of self.grus on the constructor instead
# see: https://github.com/pytorch/pytorch/issues/31772
for i in reversed(list(range(len(hidden_dims))))
......@@ -214,7 +214,7 @@ class MultiLevelUpdateBlock(nn.Module):
for reverse_i, gru in enumerate(self.grus):
i = len(self.grus) - 1 - reverse_i
if level_processed[i]:
# X is concatination of 2x downsampled hidden_dim (or motion_features if no bigger dim) with
# X is concatenation of 2x downsampled hidden_dim (or motion_features if no bigger dim) with
# upsampled hidden_dim (or nothing if not exist).
if i == 0:
features = self.motion_encoder(disparity, corr_features)
......@@ -237,7 +237,7 @@ class MultiLevelUpdateBlock(nn.Module):
hidden_states[i] = gru(hidden_states[i], features, contexts[i])
# NOTE: For slow-fast gru, we dont always want to calculate delta disparity for every call on UpdateBlock
# NOTE: For slow-fast gru, we don't always want to calculate delta disparity for every call on UpdateBlock
# Hence we move the delta disparity calculation to the RAFT-Stereo main forward
return hidden_states
......@@ -361,10 +361,10 @@ class RaftStereo(nn.Module):
It has multi-level output and each level will have 2 parts:
- one part will be used as the actual "context", passed to the recurrent unit of the ``update_block``
- one part will be used to initialize the hidden state of the of the recurrent unit of
- one part will be used to initialize the hidden state of the recurrent unit of
the ``update_block``
corr_pyramid (CorrPyramid1d): Module to buid the correlation pyramid from feature encoder output
corr_pyramid (CorrPyramid1d): Module to build the correlation pyramid from feature encoder output
corr_block (CorrBlock1d): The correlation block, which uses the correlation pyramid indexes
to create correlation features. It takes the coordinate of the centroid pixel and correlation pyramid
as input and returns the correlation features.
......@@ -382,7 +382,7 @@ class RaftStereo(nn.Module):
super().__init__()
_log_api_usage_once(self)
# This indicate that the disparity output will be only have 1 channel (represent horizontal axis).
# This indicates that the disparity output will be only have 1 channel (represent horizontal axis).
# We need this because some stereo matching model like CREStereo might have 2 channel on the output
self.output_channels = 1
......@@ -409,7 +409,7 @@ class RaftStereo(nn.Module):
self, left_image: Tensor, right_image: Tensor, flow_init: Optional[Tensor] = None, num_iters: int = 12
) -> List[Tensor]:
"""
Return disparity predictions on every iterations as a list of Tensor.
Return disparity predictions on every iteration as a list of Tensor.
args:
left_image (Tensor): The input left image with layout B, C, H, W
right_image (Tensor): The input right image with layout B, C, H, W
......@@ -424,7 +424,7 @@ class RaftStereo(nn.Module):
torch._assert(
(h % self.base_downsampling_ratio == 0 and w % self.base_downsampling_ratio == 0),
f"input image H and W should be divisible by {self.base_downsampling_ratio}, insted got H={h} and W={w}",
f"input image H and W should be divisible by {self.base_downsampling_ratio}, instead got H={h} and W={w}",
)
fmaps = self.feature_encoder(torch.cat([left_image, right_image], dim=0))
......@@ -655,7 +655,7 @@ class Raft_Stereo_Base_Weights(WeightsEnum):
"recipe": "https://github.com/princeton-vl/RAFT-Stereo",
"_metrics": {
# Following metrics from paper: https://arxiv.org/abs/2109.07547
# Using standard metrics for each datasets
# Using standard metrics for each dataset
"Kitty2015": {
# Ratio of pixels with difference less than 3px from ground truth
"3px": 0.9426,
......
......@@ -187,7 +187,7 @@ def adjust_sharpness_image_tensor(image: torch.Tensor, sharpness_factor: float)
# x+(1-r)*(y-x) = x + (1-r)*y - (1-r)*x = x*r + y*(1-r)
view.add_(blurred_degenerate.sub_(view), alpha=(1.0 - sharpness_factor))
# The actual data of ouput have been modified by the above. We only need to clamp and cast now.
# The actual data of output have been modified by the above. We only need to clamp and cast now.
output = output.clamp_(0, bound)
if not fp:
output = output.to(image.dtype)
......@@ -236,7 +236,7 @@ def _rgb_to_hsv(image: torch.Tensor) -> torch.Tensor:
# + S channel has division by `maxc`, which is zero only if `maxc = minc`
# + H channel has division by `(maxc - minc)`.
#
# Instead of overwriting NaN afterwards, we just prevent it from occuring so
# Instead of overwriting NaN afterwards, we just prevent it from occurring so
# we don't need to deal with it in case we save the NaN in a buffer in
# backprop, if it is ever supported, but it doesn't hurt to do so.
eqc = maxc == minc
......
......@@ -151,7 +151,7 @@ class ToTensorVideo:
class RandomHorizontalFlipVideo:
"""
Flip the video clip along the horizonal direction with a given probability
Flip the video clip along the horizontal direction with a given probability
Args:
p (float): probability of the clip being flipped. Default value is 0.5
"""
......
......@@ -427,7 +427,7 @@ def resize(
the resized image: if the longer edge of the image is greater
than ``max_size`` after being resized according to ``size``, then
the image is resized again so that the longer edge is equal to
``max_size``. As a result, ``size`` might be overruled, i.e the
``max_size``. As a result, ``size`` might be overruled, i.e. the
smaller edge may be shorter than ``size``. This is only supported
if ``size`` is an int (or a sequence of length 1 in torchscript
mode).
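(Illustrative aside, not part of the diff.) A worked example of the ``max_size`` behaviour described above: with ``size=256`` a 200x400 image would first become 256x512, but the longer edge exceeds ``max_size=300``, so the result is rescaled to 150x300 and the smaller edge ends up below 256:

```python
import torch
from torchvision.transforms import functional as F

img = torch.rand(3, 200, 400)                 # (C, H, W)
out = F.resize(img, size=256, max_size=300)
print(out.shape)                              # torch.Size([3, 150, 300])
```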
......@@ -859,7 +859,7 @@ def adjust_brightness(img: Tensor, brightness_factor: float) -> Tensor:
If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
where ... means it can have an arbitrary number of leading dimensions.
brightness_factor (float): How much to adjust the brightness. Can be
any non negative number. 0 gives a black image, 1 gives the
any non-negative number. 0 gives a black image, 1 gives the
original image while 2 increases the brightness by a factor of 2.
Returns:
......@@ -881,7 +881,7 @@ def adjust_contrast(img: Tensor, contrast_factor: float) -> Tensor:
If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
where ... means it can have an arbitrary number of leading dimensions.
contrast_factor (float): How much to adjust the contrast. Can be any
non negative number. 0 gives a solid gray image, 1 gives the
non-negative number. 0 gives a solid gray image, 1 gives the
original image while 2 increases the contrast by a factor of 2.
Returns:
......@@ -1143,8 +1143,8 @@ def affine(
translate (sequence of integers): horizontal and vertical translations (post-rotation translation)
scale (float): overall scale
shear (float or sequence): shear angle value in degrees between -180 to 180, clockwise direction.
If a sequence is specified, the first value corresponds to a shear parallel to the x axis, while
the second value corresponds to a shear parallel to the y axis.
If a sequence is specified, the first value corresponds to a shear parallel to the x-axis, while
the second value corresponds to a shear parallel to the y-axis.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
......@@ -1295,7 +1295,7 @@ def erase(img: Tensor, i: int, j: int, h: int, w: int, v: Tensor, inplace: bool
h (int): Height of the erased region.
w (int): Width of the erased region.
v: Erasing value.
inplace(bool, optional): For in-place operations. By default is set False.
inplace(bool, optional): For in-place operations. By default, is set False.
Returns:
Tensor Image: Erased image.
......@@ -1400,7 +1400,7 @@ def posterize(img: Tensor, bits: int) -> Tensor:
Args:
img (PIL Image or Tensor): Image to have its colors posterized.
If img is torch Tensor, it should be of type torch.uint8 and
If img is torch Tensor, it should be of type torch.uint8, and
it is expected to be in [..., 1 or 3, H, W] format, where ... means
it can have an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
......@@ -1447,7 +1447,7 @@ def adjust_sharpness(img: Tensor, sharpness_factor: float) -> Tensor:
If img is torch Tensor, it is expected to be in [..., 1 or 3, H, W] format,
where ... means it can have an arbitrary number of leading dimensions.
sharpness_factor (float): How much to adjust the sharpness. Can be
any non negative number. 0 gives a blurred image, 1 gives the
any non-negative number. 0 gives a blurred image, 1 gives the
original image while 2 increases the sharpness by a factor of 2.
Returns:
......