Unverified commit 7dc5e5bd, authored by Philip Meier and committed by GitHub

Fix typos and grammar errors (#7065)

* fix typos throughout the code base

* fix grammar

* revert formatting changes to gallery

* revert 'an uXX'

* remove 'number of the best'
parent ed2a0adb
@@ -78,7 +78,7 @@ class SVHN(VisionDataset):
  loaded_mat = sio.loadmat(os.path.join(self.root, self.filename))
  self.data = loaded_mat["X"]
- # loading from the .mat file gives an np array of type np.uint8
+ # loading from the .mat file gives an np.ndarray of type np.uint8
  # converting to np.int64, so that we have a LongTensor after
  # the conversion from the numpy array
  # the squeeze is needed to obtain a 1D tensor
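
The comment above refers to SVHN's label handling. A minimal, self-contained sketch of that conversion, assuming the .mat file stores labels under the key "y" as a uint8 array of shape (N, 1):

    import numpy as np
    import torch

    # stand-in for loaded_mat["y"]: np.uint8 labels of shape (N, 1) as returned by scipy.io.loadmat
    raw_labels = np.array([[1], [2], [10]], dtype=np.uint8)

    labels = torch.from_numpy(raw_labels.astype(np.int64).squeeze())
    print(labels, labels.dtype)  # tensor([ 1,  2, 10]) torch.int64, i.e. a 1D LongTensor
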
@@ -93,7 +93,7 @@ class UCF101(VisionDataset):
  output_format=output_format,
  )
  # we bookkeep the full version of video clips because we want to be able
- # to return the meta data of full version rather than the subset version of
+ # to return the metadata of full version rather than the subset version of
  # video clips
  self.full_video_clips = video_clips
  self.indices = self._select_fold(video_list, annotation_path, fold, train)
@@ -49,7 +49,7 @@ class _VideoTimestampsDataset:
  Dataset used to parallelize the reading of the timestamps
  of a list of videos, given their paths in the filesystem.
- Used in VideoClips and defined at top level so it can be
+ Used in VideoClips and defined at top level, so it can be
  pickled when forking.
  """
@@ -137,8 +137,7 @@ def _read_video_from_file(
  audio_timebase: Fraction = default_timebase,
  ) -> Tuple[torch.Tensor, torch.Tensor, VideoMetaData]:
  """
- Reads a video from a file, returning both the video frames as well as
- the audio frames
+ Reads a video from a file, returning both the video frames and the audio frames
  Args:
  filename (str): path to the video file
@@ -281,8 +280,7 @@ def _read_video_from_memory(
  audio_timebase_denominator: int = 1,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
  """
- Reads a video from memory, returning both the video frames as well as
- the audio frames
+ Reads a video from memory, returning both the video frames and the audio frames
  This function is torchscriptable.
  Args:
@@ -337,7 +335,7 @@ def _read_video_from_memory(
  if not isinstance(video_data, torch.Tensor):
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  video_data = torch.frombuffer(video_data, dtype=torch.uint8)
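
The idiom touched in this and the following hunks turns a read-only bytes buffer into a uint8 tensor while silencing torch.frombuffer's "not writable" warning. A minimal standalone sketch of the same pattern (not the library code itself):

    import warnings

    import torch

    video_data = b"\x00\x01\x02\x03"  # stand-in for an in-memory video file

    if not isinstance(video_data, torch.Tensor):
        with warnings.catch_warnings():
            # bytes objects are read-only, so torch.frombuffer warns; the tensor is never written to
            warnings.filterwarnings("ignore", message="The given buffer is not writable")
            video_data = torch.frombuffer(video_data, dtype=torch.uint8)

    print(video_data)  # tensor([0, 1, 2, 3], dtype=torch.uint8)
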
@@ -382,7 +380,7 @@ def _read_video_timestamps_from_memory(
  """
  if not isinstance(video_data, torch.Tensor):
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  video_data = torch.frombuffer(video_data, dtype=torch.uint8)
  result = torch.ops.video_reader.read_video_from_memory(
@@ -423,7 +421,7 @@ def _probe_video_from_memory(
  """
  if not isinstance(video_data, torch.Tensor):
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  video_data = torch.frombuffer(video_data, dtype=torch.uint8)
  result = torch.ops.video_reader.probe_video_from_memory(video_data)
@@ -50,7 +50,7 @@ def read_file(path: str) -> torch.Tensor:
  def write_file(filename: str, data: torch.Tensor) -> None:
  """
- Writes the contents of a uint8 tensor with one dimension to a
+ Writes the contents of an uint8 tensor with one dimension to a
  file.
  Args:
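
As a usage note for the docstring above, read_file and write_file round-trip raw file bytes through a one-dimensional uint8 tensor (file names below are placeholders):

    import torch
    from torchvision.io import read_file, write_file

    data = read_file("input.jpg")  # 1D torch.uint8 tensor holding the raw file bytes
    assert data.dtype == torch.uint8 and data.ndim == 1
    write_file("copy.jpg", data)   # writes the bytes back out unchanged
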
@@ -241,8 +241,7 @@ def read_video(
  output_format: str = "THWC",
  ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
  """
- Reads a video from a file, returning both the video frames as well as
- the audio frames
+ Reads a video from a file, returning both the video frames and the audio frames
  Args:
  filename (str): path to the video file
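
A short usage sketch for read_video as documented above (the file name is a placeholder):

    from torchvision.io import read_video

    # returns the video frames, the audio frames, and a metadata dict (e.g. frame rates)
    video, audio, info = read_video("clip.mp4", pts_unit="sec", output_format="THWC")
    print(video.shape, audio.shape, info)  # e.g. torch.Size([T, H, W, C]) ... {'video_fps': ...}
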
@@ -92,13 +92,13 @@ class VideoReader:
  Each stream descriptor consists of two parts: stream type (e.g. 'video') and
  a unique stream id (which are determined by the video encoding).
  In this way, if the video contaner contains multiple
- streams of the same type, users can acces the one they want.
+ streams of the same type, users can access the one they want.
  If only stream type is passed, the decoder auto-detects first stream of that type.
  Args:
  src (string, bytes object, or tensor): The media source.
  If string-type, it must be a file path supported by FFMPEG.
- If bytes shoud be an in memory representatin of a file supported by FFMPEG.
+ If bytes should be an in memory representatin of a file supported by FFMPEG.
  If Tensor, it is interpreted internally as byte buffer.
  It must be one-dimensional, of type ``torch.uint8``.
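
A sketch of the stream-descriptor usage described above, assuming an FFMPEG-readable file:

    from torchvision.io import VideoReader

    reader = VideoReader("clip.mp4", "video:0")  # stream type plus stream id; "video" alone picks the first video stream
    for frame in reader:
        # each item is a dict holding the decoded frame and its presentation timestamp
        print(frame["data"].shape, frame["pts"])
        break
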
@@ -145,7 +145,7 @@ class VideoReader:
  src = io.BytesIO(src)
  else:
  with warnings.catch_warnings():
- # Ignore the warning because we actually dont modify the buffer in this function
+ # Ignore the warning because we actually don't modify the buffer in this function
  warnings.filterwarnings("ignore", message="The given buffer is not writable")
  src = torch.frombuffer(src, dtype=torch.uint8)
  elif isinstance(src, torch.Tensor):
@@ -280,12 +280,12 @@ class VideoReader:
  Each descriptor consists of two parts: stream type (e.g. 'video') and
  a unique stream id (which are determined by video encoding).
  In this way, if the video contaner contains multiple
- streams of the same type, users can acces the one they want.
+ streams of the same type, users can access the one they want.
  If only stream type is passed, the decoder auto-detects first stream
  of that type and returns it.
  Returns:
- (bool): True on succes, False otherwise
+ (bool): True on success, False otherwise
  """
  if self.backend == "cuda":
  warnings.warn("GPU decoding only works with video stream.")
@@ -191,7 +191,7 @@ def handle_legacy_interface(**weights: Tuple[str, Union[Optional[W], Callable[[D
  # used to be a pretrained parameter.
  pretrained_positional = weights_arg is not sentinel
  if pretrained_positional:
- # We put the pretrained argument under its legacy name in the keyword argument dictionary to have a
+ # We put the pretrained argument under its legacy name in the keyword argument dictionary to have
  # unified access to the value if the default value is a callable.
  kwargs[pretrained_param] = pretrained_arg = kwargs.pop(weights_param)
  else:
@@ -403,9 +403,9 @@ class Matcher:
  it is unmatched, then match it to the ground-truth with which it has the highest
  quality value.
  """
- # For each gt, find the prediction with which it has highest quality
+ # For each gt, find the prediction with which it has the highest quality
  highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
- # Find highest quality match available, even if it is low, including ties
+ # Find the highest quality match available, even if it is low, including ties
  gt_pred_pairs_of_highest_quality = torch.where(match_quality_matrix == highest_quality_foreach_gt[:, None])
  # Example gt_pred_pairs_of_highest_quality:
  # tensor([[ 0, 39796],
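
The two comments above describe the low-quality-match recovery step. A toy illustration of how torch.where recovers every (gt, prediction) pair that attains the per-gt maximum, ties included (values made up):

    import torch

    match_quality_matrix = torch.tensor([[0.1, 0.9, 0.9],
                                         [0.3, 0.2, 0.3]])
    highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
    gt_idx, pred_idx = torch.where(match_quality_matrix == highest_quality_foreach_gt[:, None])
    print(gt_idx.tolist(), pred_idx.tolist())  # [0, 0, 1, 1] [1, 2, 0, 2]: gt 0 ties on predictions 1 and 2
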
@@ -501,14 +501,14 @@ def _topk_min(input: Tensor, orig_kval: int, axis: int) -> int:
  if K exceeds the number of elements along that axis. Previously, python's min() function was
  used to determine whether to use the provided k-value or the specified dim axis value.
- However in cases where the model is being exported in tracing mode, python min() is
+ However, in cases where the model is being exported in tracing mode, python min() is
  static causing the model to be traced incorrectly and eventually fail at the topk node.
  In order to avoid this situation, in tracing mode, torch.min() is used instead.
  Args:
- input (Tensor): The orignal input tensor.
+ input (Tensor): The original input tensor.
  orig_kval (int): The provided k-value.
- axis(int): Axis along which we retreive the input size.
+ axis(int): Axis along which we retrieve the input size.
  Returns:
  min_kval (int): Appropriately selected k-value.
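
A minimal sketch of the clamping logic the docstring describes, using torch.min so the size comparison stays dynamic under tracing (an illustration of the idea, not the exact library code):

    import torch
    from torch import Tensor

    def topk_min_sketch(input: Tensor, orig_kval: int, axis: int) -> int:
        # torch.min keeps the comparison in the traced graph, unlike python's static min()
        return int(torch.min(torch.tensor(orig_kval), torch.tensor(input.size(axis))))

    scores = torch.rand(5)
    k = topk_min_sketch(scores, orig_kval=100, axis=0)
    values, idxs = scores.topk(k)  # k is clamped to 5, so topk does not fail
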
@@ -145,7 +145,7 @@ class DefaultBoxGenerator(nn.Module):
  of the scales of each feature map. It is used only if the ``scales`` parameter is not provided.
  scales (List[float]], optional): The scales of the default boxes. If not provided it will be estimated using
  the ``min_ratio`` and ``max_ratio`` parameters.
- steps (List[int]], optional): It's a hyper-parameter that affects the tiling of defalt boxes. If not provided
+ steps (List[int]], optional): It's a hyper-parameter that affects the tiling of default boxes. If not provided
  it will be estimated from the data.
  clip (bool): Whether the standardized values of default boxes should be clipped between 0 and 1. The clipping
  is applied while the boxes are encoded in format ``(cx, cy, w, h)``.
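
For reference, a construction sketch with the parameters documented above (the aspect ratios, scales and steps are illustrative values in the spirit of an SSD300-style configuration):

    from torchvision.models.detection.anchor_utils import DefaultBoxGenerator

    anchor_generator = DefaultBoxGenerator(
        aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]],
        scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05],  # if omitted, estimated from min_ratio/max_ratio
        steps=[8, 16, 32, 64, 100, 300],                    # if omitted, estimated from the data
        clip=True,
    )
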
@@ -102,12 +102,12 @@ def resnet_fpn_backbone(
  trainable_layers (int): number of trainable (not frozen) layers starting from final block.
  Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.
  returned_layers (list of int): The layers of the network to return. Each entry must be in ``[1, 4]``.
- By default all layers are returned.
+ By default, all layers are returned.
  extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
  be performed. It is expected to take the fpn features, the original
  features and the names of the original features as input, and returns
  a new list of feature maps and their corresponding names. By
- default a ``LastLevelMaxPool`` is used.
+ default, a ``LastLevelMaxPool`` is used.
  """
  backbone = resnet.__dict__[backbone_name](weights=weights, norm_layer=norm_layer)
  return _resnet_fpn_extractor(backbone, trainable_layers, returned_layers, extra_blocks)
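
A usage sketch for the arguments documented above (weights=None keeps the example download-free; argument names as in recent torchvision releases):

    from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

    backbone = resnet_fpn_backbone(
        backbone_name="resnet50",
        weights=None,
        trainable_layers=3,          # only layer2, layer3 and layer4 stay trainable
        returned_layers=[2, 3, 4],   # skip the layer1 feature map
    )
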
@@ -121,7 +121,7 @@ def _resnet_fpn_extractor(
  norm_layer: Optional[Callable[..., nn.Module]] = None,
  ) -> BackboneWithFPN:
- # select layers that wont be frozen
+ # select layers that won't be frozen
  if trainable_layers < 0 or trainable_layers > 5:
  raise ValueError(f"Trainable layers should be in the range [0,5], got {trainable_layers}")
  layers_to_train = ["layer4", "layer3", "layer2", "layer1", "conv1"][:trainable_layers]
@@ -208,7 +208,7 @@ def _mobilenet_extractor(
  stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
  num_stages = len(stage_indices)
- # find the index of the layer from which we wont freeze
+ # find the index of the layer from which we won't freeze
  if trainable_layers < 0 or trainable_layers > num_stages:
  raise ValueError(f"Trainable layers should be in the range [0,{num_stages}], got {trainable_layers} ")
  freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
@@ -47,9 +47,9 @@ class FasterRCNN(GeneralizedRCNN):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
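
For the training-mode contract described in this and the later detection-model docstrings, a minimal sketch of one image's target dictionary (values are made up; these models also expect a labels entry alongside boxes):

    import torch

    targets = [
        {
            "boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),  # FloatTensor[N, 4] in [x1, y1, x2, y2]
            "labels": torch.tensor([3]),                          # Int64Tensor[N], one class index per box
        }
    ]
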
@@ -68,7 +68,7 @@ class FasterRCNN(GeneralizedRCNN):
  Args:
  backbone (nn.Module): the network used to compute the features for the model.
- It should contain a out_channels attribute, which indicates the number of output
+ It should contain an out_channels attribute, which indicates the number of output
  channels that each feature map has (and it should be the same for all feature maps).
  The backbone should return a single Tensor or and OrderedDict[Tensor].
  num_classes (int): number of output classes of the model (including the background).
@@ -128,7 +128,7 @@ class FasterRCNN(GeneralizedRCNN):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # FasterRCNN needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -483,9 +483,9 @@ def fasterrcnn_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and a targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -714,7 +714,7 @@ def fasterrcnn_mobilenet_v3_large_320_fpn(
  **kwargs: Any,
  ) -> FasterRCNN:
  """
- Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tunned for mobile use cases.
+ Low resolution Faster R-CNN model with a MobileNetV3-Large backbone tuned for mobile use cases.
  .. betastatus:: detection module
@@ -70,7 +70,7 @@ class FCOSHead(nn.Module):
  else:
  gt_classes_targets = targets_per_image["labels"][matched_idxs_per_image.clip(min=0)]
  gt_boxes_targets = targets_per_image["boxes"][matched_idxs_per_image.clip(min=0)]
- gt_classes_targets[matched_idxs_per_image < 0] = -1  # backgroud
+ gt_classes_targets[matched_idxs_per_image < 0] = -1  # background
  all_gt_classes_targets.append(gt_classes_targets)
  all_gt_boxes_targets.append(gt_boxes_targets)
@@ -274,9 +274,9 @@ class FCOS(nn.Module):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -329,7 +329,7 @@ class FCOS(nn.Module):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # FCOS needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -695,9 +695,9 @@ def fcos_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -29,9 +29,9 @@ class KeypointRCNN(FasterRCNN):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -55,7 +55,7 @@ class KeypointRCNN(FasterRCNN):
  Args:
  backbone (nn.Module): the network used to compute the features for the model.
- It should contain a out_channels attribute, which indicates the number of output
+ It should contain an out_channels attribute, which indicates the number of output
  channels that each feature map has (and it should be the same for all feature maps).
  The backbone should return a single Tensor or and OrderedDict[Tensor].
  num_classes (int): number of output classes of the model (including the background).
@@ -121,7 +121,7 @@ class KeypointRCNN(FasterRCNN):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # KeypointRCNN needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -387,9 +387,9 @@ def keypointrcnn_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -31,9 +31,9 @@ class MaskRCNN(FasterRCNN):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -56,7 +56,7 @@ class MaskRCNN(FasterRCNN):
  Args:
  backbone (nn.Module): the network used to compute the features for the model.
- It should contain a out_channels attribute, which indicates the number of output
+ It should contain an out_channels attribute, which indicates the number of output
  channels that each feature map has (and it should be the same for all feature maps).
  The backbone should return a single Tensor or and OrderedDict[Tensor].
  num_classes (int): number of output classes of the model (including the background).
@@ -123,7 +123,7 @@ class MaskRCNN(FasterRCNN):
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # MaskRCNN needs to know the number of
  >>> # output channels in a backbone. For mobilenet_v2, it's 1280
- >>> # so we need to add it here
+ >>> # so we need to add it here,
  >>> backbone.out_channels = 1280
  >>>
  >>> # let's make the RPN generate 5 x 3 anchors per spatial
@@ -422,9 +422,9 @@ def maskrcnn_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -327,9 +327,9 @@ class RetinaNet(nn.Module):
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
  image, and should be in 0-1 range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -382,7 +382,7 @@ class RetinaNet(nn.Module):
  >>> # only the features
  >>> backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).features
  >>> # RetinaNet needs to know the number of
- >>> # output channels in a backbone. For mobilenet_v2, it's 1280
+ >>> # output channels in a backbone. For mobilenet_v2, it's 1280,
  >>> # so we need to add it here
  >>> backbone.out_channels = 1280
  >>>
@@ -743,9 +743,9 @@ def retinanet_resnet50_fpn(
  The input to the model is expected to be a list of tensors, each of shape ``[C, H, W]``, one for each
  image, and should be in ``0-1`` range. Different images can have different sizes.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -315,7 +315,7 @@ def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched
  valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
  valid = torch.where(valid)[0]
- # torch.mean (in binary_cross_entropy_with_logits) does'nt
+ # torch.mean (in binary_cross_entropy_with_logits) doesn't
  # accept empty tensors, so handle it sepaartely
  if keypoint_targets.numel() == 0 or len(valid) == 0:
  return keypoint_logits.sum() * 0
@@ -128,12 +128,12 @@ class SSD(nn.Module):
  Implements SSD architecture from `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
- image, and should be in 0-1 range. Different images can have different sizes but they will be resized
+ image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
  to a fixed size before passing it to the backbone.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
  ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
@@ -556,7 +556,7 @@ def _vgg_extractor(backbone: VGG, highres: bool, trainable_layers: int):
  stage_indices = [0] + [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)][:-1]
  num_stages = len(stage_indices)
- # find the index of the layer from which we wont freeze
+ # find the index of the layer from which we won't freeze
  torch._assert(
  0 <= trainable_layers <= num_stages,
  f"trainable_layers should be in the range [0, {num_stages}]. Instead got {trainable_layers}",
@@ -590,12 +590,12 @@ def ssd300_vgg16(
  .. betastatus:: detection module
  The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
- image, and should be in 0-1 range. Different images can have different sizes but they will be resized
+ image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
  to a fixed size before passing it to the backbone.
- The behavior of the model changes depending if it is in training or evaluation mode.
+ The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+ During training, the model expects both the input tensors and targets (list of dictionary),
  containing:
  - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
@@ -172,7 +172,7 @@ def _mobilenet_extractor(
  stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
  num_stages = len(stage_indices)
- # find the index of the layer from which we wont freeze
+ # find the index of the layer from which we won't freeze
  if not 0 <= trainable_layers <= num_stages:
  raise ValueError("trainable_layers should be in the range [0, {num_stages}], instead got {trainable_layers}")
  freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
@@ -76,7 +76,7 @@ class GeneralizedRCNNTransform(nn.Module):
  Performs input / target transformation before feeding the data to a GeneralizedRCNN
  model.
- The transformations it perform are:
+ The transformations it performs are:
  - input normalization (mean subtraction and std division)
  - input / target resizing to match min_size / max_size
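
A construction sketch of the transform described above (the mean/std values are the usual ImageNet statistics, the sizes mirror common detection defaults, and the constructor signature is assumed from recent torchvision releases):

    from torchvision.models.detection.transform import GeneralizedRCNNTransform

    transform = GeneralizedRCNNTransform(
        min_size=800,
        max_size=1333,
        image_mean=[0.485, 0.456, 0.406],  # mean subtraction
        image_std=[0.229, 0.224, 0.225],   # std division
    )
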
@@ -158,7 +158,7 @@ class GeneralizedRCNNTransform(nn.Module):
  def torch_choice(self, k: List[int]) -> int:
  """
- Implements `random.choice` via torch ops so it can be compiled with
+ Implements `random.choice` via torch ops, so it can be compiled with
  TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803
  is fixed.
  """
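
A standalone sketch of the torch_choice idea: drawing a random index with torch ops only, so the method stays TorchScript-compatible (an illustration of the technique, not necessarily the exact library implementation):

    from typing import List

    import torch

    def torch_choice(k: List[int]) -> int:
        # uniform draw over [0, len(k)) using torch ops instead of random.choice
        index = int(torch.empty(1).uniform_(0.0, float(len(k))).item())
        return k[index]

    print(torch_choice([480, 512, 544]))  # prints one of the candidate sizes
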