Unverified commit 332bff93, authored by Nicolas Hug and committed by GitHub

Renaming: `BoundingBox` -> `BoundingBoxes` (#7778)

parent d4e5aa21
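
For context, a minimal sketch of constructing the renamed datapoint after this change (hypothetical tensor values; the `format`/`spatial_size` keywords are the ones used throughout the hunks below):

    import torch
    from torchvision import datapoints

    # Two hypothetical boxes in XYXY format on a 480x640 (height, width) image.
    boxes = datapoints.BoundingBoxes(
        torch.tensor([[10, 20, 110, 220], [30, 40, 130, 240]]),
        format="xyxy",            # format strings like "xyxy"/"xywh" map to datapoints.BoundingBoxFormat
        spatial_size=(480, 640),  # size of the image the boxes refer to
    )
    print(boxes.format, boxes.spatial_size)
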
......@@ -6,7 +6,7 @@ import numpy as np
import torch
from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper
from torchvision.datapoints import BoundingBox
from torchvision.datapoints import BoundingBoxes
from torchvision.prototype.datapoints import Label
from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource
from torchvision.prototype.datasets.utils._internal import (
......@@ -112,7 +112,7 @@ class Caltech101(Dataset):
image_path=image_path,
image=image,
ann_path=ann_path,
bounding_box=BoundingBox(
bounding_boxes=BoundingBoxes(
ann["box_coord"].astype(np.int64).squeeze()[[2, 0, 3, 1]],
format="xyxy",
spatial_size=image.spatial_size,
......
......@@ -4,7 +4,7 @@ from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Sequence, Tupl
import torch
from torchdata.datapipes.iter import Filter, IterDataPipe, IterKeyZipper, Mapper, Zipper
from torchvision.datapoints import BoundingBox
from torchvision.datapoints import BoundingBoxes
from torchvision.prototype.datapoints import Label
from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource
from torchvision.prototype.datasets.utils._internal import (
......@@ -137,15 +137,15 @@ class CelebA(Dataset):
path, buffer = image_data
image = EncodedImage.from_file(buffer)
(_, identity), (_, attributes), (_, bounding_box), (_, landmarks) = ann_data
(_, identity), (_, attributes), (_, bounding_boxes), (_, landmarks) = ann_data
return dict(
path=path,
image=image,
identity=Label(int(identity["identity"])),
attributes={attr: value == "1" for attr, value in attributes.items()},
bounding_box=BoundingBox(
[int(bounding_box[key]) for key in ("x_1", "y_1", "width", "height")],
bounding_boxes=BoundingBoxes(
[int(bounding_boxes[key]) for key in ("x_1", "y_1", "width", "height")],
format="xywh",
spatial_size=image.spatial_size,
),
......
......@@ -14,7 +14,7 @@ from torchdata.datapipes.iter import (
Mapper,
UnBatcher,
)
from torchvision.datapoints import BoundingBox, Mask
from torchvision.datapoints import BoundingBoxes, Mask
from torchvision.prototype.datapoints import Label
from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource
from torchvision.prototype.datasets.utils._internal import (
......@@ -126,7 +126,7 @@ class Coco(Dataset):
),
areas=torch.as_tensor([ann["area"] for ann in anns]),
crowds=torch.as_tensor([ann["iscrowd"] for ann in anns], dtype=torch.bool),
bounding_boxes=BoundingBox(
bounding_boxes=BoundingBoxes(
[ann["bbox"] for ann in anns],
format="xywh",
spatial_size=spatial_size,
......
......@@ -15,7 +15,7 @@ from torchdata.datapipes.iter import (
Mapper,
)
from torchdata.datapipes.map import IterToMapConverter
from torchvision.datapoints import BoundingBox
from torchvision.datapoints import BoundingBoxes
from torchvision.prototype.datapoints import Label
from torchvision.prototype.datasets.utils import Dataset, EncodedImage, GDriveResource, OnlineResource
from torchvision.prototype.datasets.utils._internal import (
......@@ -134,11 +134,11 @@ class CUB200(Dataset):
def _2011_prepare_ann(
self, data: Tuple[str, Tuple[List[str], Tuple[str, BinaryIO]]], spatial_size: Tuple[int, int]
) -> Dict[str, Any]:
_, (bounding_box_data, segmentation_data) = data
_, (bounding_boxes_data, segmentation_data) = data
segmentation_path, segmentation_buffer = segmentation_data
return dict(
bounding_box=BoundingBox(
[float(part) for part in bounding_box_data[1:]], format="xywh", spatial_size=spatial_size
bounding_boxes=BoundingBoxes(
[float(part) for part in bounding_boxes_data[1:]], format="xywh", spatial_size=spatial_size
),
segmentation_path=segmentation_path,
segmentation=EncodedImage.from_file(segmentation_buffer),
......@@ -158,7 +158,7 @@ class CUB200(Dataset):
content = read_mat(buffer)
return dict(
ann_path=path,
bounding_box=BoundingBox(
bounding_boxes=BoundingBoxes(
[int(content["bbox"][coord]) for coord in ("left", "bottom", "right", "top")],
format="xyxy",
spatial_size=spatial_size,
......
......@@ -2,7 +2,7 @@ import pathlib
from typing import Any, Dict, List, Optional, Tuple, Union
from torchdata.datapipes.iter import CSVDictParser, Demultiplexer, Filter, IterDataPipe, Mapper, Zipper
from torchvision.datapoints import BoundingBox
from torchvision.datapoints import BoundingBoxes
from torchvision.prototype.datapoints import Label
from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource
from torchvision.prototype.datasets.utils._internal import (
......@@ -76,7 +76,7 @@ class GTSRB(Dataset):
(path, buffer), csv_info = data
label = int(csv_info["ClassId"])
bounding_box = BoundingBox(
bounding_boxes = BoundingBoxes(
[int(csv_info[k]) for k in ("Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2")],
format="xyxy",
spatial_size=(int(csv_info["Height"]), int(csv_info["Width"])),
......@@ -86,7 +86,7 @@ class GTSRB(Dataset):
"path": path,
"image": EncodedImage.from_file(buffer),
"label": Label(label, categories=self._categories),
"bounding_box": bounding_box,
"bounding_boxes": bounding_boxes,
}
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
......
......@@ -2,7 +2,7 @@ import pathlib
from typing import Any, BinaryIO, Dict, Iterator, List, Tuple, Union
from torchdata.datapipes.iter import Filter, IterDataPipe, Mapper, Zipper
from torchvision.datapoints import BoundingBox
from torchvision.datapoints import BoundingBoxes
from torchvision.prototype.datapoints import Label
from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource
from torchvision.prototype.datasets.utils._internal import (
......@@ -90,7 +90,7 @@ class StanfordCars(Dataset):
path=path,
image=image,
label=Label(target[4] - 1, categories=self._categories),
bounding_box=BoundingBox(target[:4], format="xyxy", spatial_size=image.spatial_size),
bounding_boxes=BoundingBoxes(target[:4], format="xyxy", spatial_size=image.spatial_size),
)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
......
......@@ -5,7 +5,7 @@ from typing import Any, BinaryIO, cast, Dict, List, Optional, Tuple, Union
from xml.etree import ElementTree
from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, LineReader, Mapper
from torchvision.datapoints import BoundingBox
from torchvision.datapoints import BoundingBoxes
from torchvision.datasets import VOCDetection
from torchvision.prototype.datapoints import Label
from torchvision.prototype.datasets.utils import Dataset, EncodedImage, HttpResource, OnlineResource
......@@ -103,7 +103,7 @@ class VOC(Dataset):
anns = self._parse_detection_ann(buffer)
instances = anns["object"]
return dict(
bounding_boxes=BoundingBox(
bounding_boxes=BoundingBoxes(
[
[int(instance["bndbox"][part]) for part in ("xmin", "ymin", "xmax", "ymax")]
for instance in instances
......
......@@ -26,7 +26,7 @@ class _BaseMixupCutmix(_RandomApplyTransform):
and has_any(flat_inputs, proto_datapoints.OneHotLabel)
):
raise TypeError(f"{type(self).__name__}() is only defined for tensor images/videos and one-hot labels.")
if has_any(flat_inputs, PIL.Image.Image, datapoints.BoundingBox, datapoints.Mask, proto_datapoints.Label):
if has_any(flat_inputs, PIL.Image.Image, datapoints.BoundingBoxes, datapoints.Mask, proto_datapoints.Label):
raise TypeError(
f"{type(self).__name__}() does not support PIL images, bounding boxes, masks and plain labels."
)
......@@ -175,7 +175,7 @@ class SimpleCopyPaste(Transform):
# There is a similar +1 in other reference implementations:
# https://github.com/pytorch/vision/blob/b6feccbc4387766b76a3e22b13815dbbbfa87c0f/torchvision/models/detection/roi_heads.py#L418-L422
xyxy_boxes[:, 2:] += 1
boxes = F.convert_format_bounding_box(
boxes = F.convert_format_bounding_boxes(
xyxy_boxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=bbox_format, inplace=True
)
out_target["boxes"] = torch.cat([boxes, paste_boxes])
......@@ -184,7 +184,7 @@ class SimpleCopyPaste(Transform):
out_target["labels"] = torch.cat([labels, paste_labels])
# Check for degenerated boxes and remove them
boxes = F.convert_format_bounding_box(
boxes = F.convert_format_bounding_boxes(
out_target["boxes"], old_format=bbox_format, new_format=datapoints.BoundingBoxFormat.XYXY
)
degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
......@@ -201,14 +201,14 @@ class SimpleCopyPaste(Transform):
self, flat_sample: List[Any]
) -> Tuple[List[datapoints._TensorImageType], List[Dict[str, Any]]]:
# fetch all images, bboxes, masks and labels from unstructured input
# with List[image], List[BoundingBox], List[Mask], List[Label]
# with List[image], List[BoundingBoxes], List[Mask], List[Label]
images, bboxes, masks, labels = [], [], [], []
for obj in flat_sample:
if isinstance(obj, datapoints.Image) or is_simple_tensor(obj):
images.append(obj)
elif isinstance(obj, PIL.Image.Image):
images.append(F.to_image_tensor(obj))
elif isinstance(obj, datapoints.BoundingBox):
elif isinstance(obj, datapoints.BoundingBoxes):
bboxes.append(obj)
elif isinstance(obj, datapoints.Mask):
masks.append(obj)
......@@ -218,7 +218,7 @@ class SimpleCopyPaste(Transform):
if not (len(images) == len(bboxes) == len(masks) == len(labels)):
raise TypeError(
f"{type(self).__name__}() requires input sample to contain equal sized list of Images, "
"BoundingBoxes, Masks and Labels or OneHotLabels."
"BoundingBoxeses, Masks and Labels or OneHotLabels."
)
targets = []
......@@ -244,8 +244,8 @@ class SimpleCopyPaste(Transform):
elif is_simple_tensor(obj):
flat_sample[i] = output_images[c0]
c0 += 1
elif isinstance(obj, datapoints.BoundingBox):
flat_sample[i] = datapoints.BoundingBox.wrap_like(obj, output_targets[c1]["boxes"])
elif isinstance(obj, datapoints.BoundingBoxes):
flat_sample[i] = datapoints.BoundingBoxes.wrap_like(obj, output_targets[c1]["boxes"])
c1 += 1
elif isinstance(obj, datapoints.Mask):
flat_sample[i] = datapoints.Mask.wrap_like(obj, output_targets[c2]["masks"])
......
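
As an aside on the degenerate-box check above, a self-contained sketch with hypothetical values (the functional is the renamed `convert_format_bounding_boxes` from this diff):

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    # Hypothetical XYWH boxes; the second one has zero width.
    boxes = torch.tensor([[10., 10., 20., 30.], [50., 50., 0., 40.]])
    xyxy = F.convert_format_bounding_boxes(
        boxes,
        old_format=datapoints.BoundingBoxFormat.XYWH,
        new_format=datapoints.BoundingBoxFormat.XYXY,
    )
    degenerate = (xyxy[:, 2:] <= xyxy[:, :2]).any(dim=1)  # True where x2 <= x1 or y2 <= y1
    boxes = boxes[~degenerate]  # keeps only the first box
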
......@@ -7,7 +7,7 @@ from torchvision import datapoints
from torchvision.prototype.datapoints import Label, OneHotLabel
from torchvision.transforms.v2 import functional as F, Transform
from torchvision.transforms.v2._utils import _setup_fill_arg, _setup_size
from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_bounding_box, query_spatial_size
from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_bounding_boxes, query_spatial_size
class FixedSizeCrop(Transform):
......@@ -39,9 +39,9 @@ class FixedSizeCrop(Transform):
f"{type(self).__name__}() requires input sample to contain an tensor or PIL image or a Video."
)
if has_any(flat_inputs, datapoints.BoundingBox) and not has_any(flat_inputs, Label, OneHotLabel):
if has_any(flat_inputs, datapoints.BoundingBoxes) and not has_any(flat_inputs, Label, OneHotLabel):
raise TypeError(
f"If a BoundingBox is contained in the input sample, "
f"If a BoundingBoxes is contained in the input sample, "
f"{type(self).__name__}() also requires it to contain a Label or OneHotLabel."
)
......@@ -61,13 +61,13 @@ class FixedSizeCrop(Transform):
bounding_boxes: Optional[torch.Tensor]
try:
bounding_boxes = query_bounding_box(flat_inputs)
bounding_boxes = query_bounding_boxes(flat_inputs)
except ValueError:
bounding_boxes = None
if needs_crop and bounding_boxes is not None:
format = bounding_boxes.format
bounding_boxes, spatial_size = F.crop_bounding_box(
bounding_boxes, spatial_size = F.crop_bounding_boxes(
bounding_boxes.as_subclass(torch.Tensor),
format=format,
top=top,
......@@ -75,8 +75,8 @@ class FixedSizeCrop(Transform):
height=new_height,
width=new_width,
)
bounding_boxes = F.clamp_bounding_box(bounding_boxes, format=format, spatial_size=spatial_size)
height_and_width = F.convert_format_bounding_box(
bounding_boxes = F.clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size)
height_and_width = F.convert_format_bounding_boxes(
bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYWH
)[..., 2:]
is_valid = torch.all(height_and_width > 0, dim=-1)
......@@ -112,10 +112,12 @@ class FixedSizeCrop(Transform):
if params["is_valid"] is not None:
if isinstance(inpt, (Label, OneHotLabel, datapoints.Mask)):
inpt = inpt.wrap_like(inpt, inpt[params["is_valid"]]) # type: ignore[arg-type]
elif isinstance(inpt, datapoints.BoundingBox):
inpt = datapoints.BoundingBox.wrap_like(
elif isinstance(inpt, datapoints.BoundingBoxes):
inpt = datapoints.BoundingBoxes.wrap_like(
inpt,
F.clamp_bounding_box(inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size),
F.clamp_bounding_boxes(
inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size
),
)
if params["needs_pad"]:
......
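
A rough sketch of the crop/clamp/validity flow that `FixedSizeCrop` uses above, with hypothetical crop parameters (the kernels are the renamed ones from this diff):

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    boxes = datapoints.BoundingBoxes(
        torch.tensor([[5., 5., 60., 60.], [200., 200., 250., 250.]]),
        format="xyxy", spatial_size=(300, 300),
    )
    # Crop the top-left 100x100 region; the kernels operate on plain tensors.
    cropped, spatial_size = F.crop_bounding_boxes(
        boxes.as_subclass(torch.Tensor), format=boxes.format, top=0, left=0, height=100, width=100
    )
    cropped = F.clamp_bounding_boxes(cropped, format=boxes.format, spatial_size=spatial_size)
    height_and_width = F.convert_format_bounding_boxes(
        cropped, old_format=boxes.format, new_format=datapoints.BoundingBoxFormat.XYWH
    )[..., 2:]
    is_valid = torch.all(height_and_width > 0, dim=-1)  # [True, False]: the second box lies outside the crop
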
......@@ -39,7 +39,7 @@ from ._geometry import (
ScaleJitter,
TenCrop,
)
from ._meta import ClampBoundingBox, ConvertBoundingBoxFormat
from ._meta import ClampBoundingBoxes, ConvertBoundingBoxFormat
from ._misc import (
ConvertImageDtype,
GaussianBlur,
......@@ -47,7 +47,7 @@ from ._misc import (
Lambda,
LinearTransformation,
Normalize,
SanitizeBoundingBox,
SanitizeBoundingBoxes,
ToDtype,
)
from ._temporal import UniformTemporalSubsample
......
......@@ -155,7 +155,7 @@ class _BaseMixupCutmix(Transform):
flat_inputs, spec = tree_flatten(inputs)
needs_transform_list = self._needs_transform_list(flat_inputs)
if has_any(flat_inputs, PIL.Image.Image, datapoints.BoundingBox, datapoints.Mask):
if has_any(flat_inputs, PIL.Image.Image, datapoints.BoundingBoxes, datapoints.Mask):
raise ValueError(f"{type(self).__name__}() does not support PIL images, bounding boxes and masks.")
labels = self._labels_getter(inputs)
......
......@@ -34,7 +34,7 @@ class _AutoAugmentBase(Transform):
def _flatten_and_extract_image_or_video(
self,
inputs: Any,
unsupported_types: Tuple[Type, ...] = (datapoints.BoundingBox, datapoints.Mask),
unsupported_types: Tuple[Type, ...] = (datapoints.BoundingBoxes, datapoints.Mask),
) -> Tuple[Tuple[List[Any], TreeSpec, int], Union[datapoints._ImageType, datapoints._VideoType]]:
flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0])
needs_transform_list = self._needs_transform_list(flat_inputs)
......
......@@ -22,7 +22,7 @@ from ._utils import (
_setup_float_or_seq,
_setup_size,
)
from .utils import has_all, has_any, is_simple_tensor, query_bounding_box, query_spatial_size
from .utils import has_all, has_any, is_simple_tensor, query_bounding_boxes, query_spatial_size
class RandomHorizontalFlip(_RandomApplyTransform):
......@@ -31,7 +31,7 @@ class RandomHorizontalFlip(_RandomApplyTransform):
.. v2betastatus:: RandomHorizontalFlip transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -51,7 +51,7 @@ class RandomVerticalFlip(_RandomApplyTransform):
.. v2betastatus:: RandomVerticalFlip transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -71,7 +71,7 @@ class Resize(Transform):
.. v2betastatus:: Resize transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -165,7 +165,7 @@ class CenterCrop(Transform):
.. v2betastatus:: CenterCrop transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -193,7 +193,7 @@ class RandomResizedCrop(Transform):
.. v2betastatus:: RandomResizedCrop transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -371,8 +371,8 @@ class FiveCrop(Transform):
return F.five_crop(inpt, self.size)
def _check_inputs(self, flat_inputs: List[Any]) -> None:
if has_any(flat_inputs, datapoints.BoundingBox, datapoints.Mask):
raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()")
if has_any(flat_inputs, datapoints.BoundingBoxes, datapoints.Mask):
raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")
class TenCrop(Transform):
......@@ -414,8 +414,8 @@ class TenCrop(Transform):
self.vertical_flip = vertical_flip
def _check_inputs(self, flat_inputs: List[Any]) -> None:
if has_any(flat_inputs, datapoints.BoundingBox, datapoints.Mask):
raise TypeError(f"BoundingBox'es and Mask's are not supported by {type(self).__name__}()")
if has_any(flat_inputs, datapoints.BoundingBoxes, datapoints.Mask):
raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")
def _transform(
self, inpt: Union[datapoints._ImageType, datapoints._VideoType], params: Dict[str, Any]
......@@ -440,7 +440,7 @@ class Pad(Transform):
.. v2betastatus:: Pad transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -525,7 +525,7 @@ class RandomZoomOut(_RandomApplyTransform):
output_height = input_height * r
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -584,7 +584,7 @@ class RandomRotation(Transform):
.. v2betastatus:: RandomRotation transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -657,7 +657,7 @@ class RandomAffine(Transform):
.. v2betastatus:: RandomAffine transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -778,7 +778,7 @@ class RandomCrop(Transform):
.. v2betastatus:: RandomCrop transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -933,7 +933,7 @@ class RandomPerspective(_RandomApplyTransform):
.. v2betastatus:: RandomPerspective transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -1019,7 +1019,7 @@ class ElasticTransform(Transform):
.. v2betastatus:: ElasticTransform transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -1110,15 +1110,15 @@ class RandomIoUCrop(Transform):
.. v2betastatus:: RandomIoUCrop transform
This transformation requires an image or video data and ``datapoints.BoundingBox`` in the input.
This transformation requires an image or video data and ``datapoints.BoundingBoxes`` in the input.
.. warning::
In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop`
must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBox`, either immediately
must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately
after or later in the transforms pipeline.
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -1155,7 +1155,7 @@ class RandomIoUCrop(Transform):
def _check_inputs(self, flat_inputs: List[Any]) -> None:
if not (
has_all(flat_inputs, datapoints.BoundingBox)
has_all(flat_inputs, datapoints.BoundingBoxes)
and has_any(flat_inputs, PIL.Image.Image, datapoints.Image, is_simple_tensor)
):
raise TypeError(
......@@ -1165,7 +1165,7 @@ class RandomIoUCrop(Transform):
def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
orig_h, orig_w = query_spatial_size(flat_inputs)
bboxes = query_bounding_box(flat_inputs)
bboxes = query_bounding_boxes(flat_inputs)
while True:
# sample an option
......@@ -1193,7 +1193,7 @@ class RandomIoUCrop(Transform):
continue
# check for any valid boxes with centers within the crop area
xyxy_bboxes = F.convert_format_bounding_box(
xyxy_bboxes = F.convert_format_bounding_boxes(
bboxes.as_subclass(torch.Tensor), bboxes.format, datapoints.BoundingBoxFormat.XYXY
)
cx = 0.5 * (xyxy_bboxes[..., 0] + xyxy_bboxes[..., 2])
......@@ -1220,9 +1220,9 @@ class RandomIoUCrop(Transform):
output = F.crop(inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"])
if isinstance(output, datapoints.BoundingBox):
if isinstance(output, datapoints.BoundingBoxes):
# We "mark" the invalid boxes as degenreate, and they can be
# removed by a later call to SanitizeBoundingBox()
# removed by a later call to SanitizeBoundingBoxes()
output[~params["is_within_crop_area"]] = 0
return output
......@@ -1235,7 +1235,7 @@ class ScaleJitter(Transform):
.. v2betastatus:: ScaleJitter transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -1301,7 +1301,7 @@ class RandomShortestSize(Transform):
.. v2betastatus:: RandomShortestSize transform
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......@@ -1380,7 +1380,7 @@ class RandomResize(Transform):
output_height = size
If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
:class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
......
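
A hedged end-to-end sketch of the `RandomIoUCrop` warning above, i.e. following it with `SanitizeBoundingBoxes` later in the pipeline. The dict layout with a "labels" key is an assumption about the default `labels_getter` heuristic; the transform names are the renamed ones from this diff:

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import Compose, RandomIoUCrop, SanitizeBoundingBoxes

    sample = {
        "image": datapoints.Image(torch.randint(0, 256, (3, 300, 400), dtype=torch.uint8)),
        "boxes": datapoints.BoundingBoxes(
            torch.tensor([[10., 10., 100., 100.], [250., 100., 390., 200.]]),
            format="xyxy", spatial_size=(300, 400),
        ),
        "labels": torch.tensor([1, 2]),
    }
    pipeline = Compose([
        RandomIoUCrop(),          # may zero out boxes whose centers fall outside the sampled crop
        SanitizeBoundingBoxes(),  # removes those degenerate boxes and their labels
    ])
    out = pipeline(sample)
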
......@@ -15,7 +15,7 @@ class ConvertBoundingBoxFormat(Transform):
string values match the enums, e.g. "XYXY" or "XYWH" etc.
"""
_transformed_types = (datapoints.BoundingBox,)
_transformed_types = (datapoints.BoundingBoxes,)
def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None:
super().__init__()
......@@ -23,20 +23,20 @@ class ConvertBoundingBoxFormat(Transform):
format = datapoints.BoundingBoxFormat[format]
self.format = format
def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox:
return F.convert_format_bounding_box(inpt, new_format=self.format) # type: ignore[return-value]
def _transform(self, inpt: datapoints.BoundingBoxes, params: Dict[str, Any]) -> datapoints.BoundingBoxes:
return F.convert_format_bounding_boxes(inpt, new_format=self.format) # type: ignore[return-value]
class ClampBoundingBox(Transform):
class ClampBoundingBoxes(Transform):
"""[BETA] Clamp bounding boxes to their corresponding image dimensions.
The clamping is done according to the bounding boxes' ``spatial_size`` meta-data.
.. v2betastatus:: ClampBoundingBox transform
.. v2betastatus:: ClampBoundingBoxes transform
"""
_transformed_types = (datapoints.BoundingBox,)
_transformed_types = (datapoints.BoundingBoxes,)
def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox:
return F.clamp_bounding_box(inpt) # type: ignore[return-value]
def _transform(self, inpt: datapoints.BoundingBoxes, params: Dict[str, Any]) -> datapoints.BoundingBoxes:
return F.clamp_bounding_boxes(inpt) # type: ignore[return-value]
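
A small usage sketch for the two meta transforms above (hypothetical box values; `ClampBoundingBoxes` and `ConvertBoundingBoxFormat` are the names after this rename):

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import ClampBoundingBoxes, ConvertBoundingBoxFormat

    boxes = datapoints.BoundingBoxes(
        torch.tensor([[-5., 10., 120., 90.]]),  # x1 and x2 stick out of the 100x100 image
        format="xyxy", spatial_size=(100, 100),
    )
    boxes = ClampBoundingBoxes()(boxes)                # -> [[0., 10., 100., 90.]]
    boxes = ConvertBoundingBoxFormat("CXCYWH")(boxes)  # string must match a BoundingBoxFormat name
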
......@@ -10,7 +10,7 @@ from torchvision import datapoints, transforms as _transforms
from torchvision.transforms.v2 import functional as F, Transform
from ._utils import _parse_labels_getter, _setup_float_or_seq, _setup_size
from .utils import has_any, is_simple_tensor, query_bounding_box
from .utils import has_any, is_simple_tensor, query_bounding_boxes
# TODO: do we want/need to expose this?
......@@ -332,16 +332,16 @@ class ConvertImageDtype(Transform):
return F.to_dtype(inpt, dtype=self.dtype, scale=True)
class SanitizeBoundingBox(Transform):
class SanitizeBoundingBoxes(Transform):
"""[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
.. v2betastatus:: SanitizeBoundingBox transform
.. v2betastatus:: SanitizeBoundingBoxes transform
This transform removes bounding boxes and their associated labels/masks that:
- are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
- have any coordinate outside of their corresponding image. You may want to
call :class:`~torchvision.transforms.v2.ClampBoundingBox` first to avoid undesired removals.
call :class:`~torchvision.transforms.v2.ClampBoundingBoxes` first to avoid undesired removals.
It is recommended to call it at the end of a pipeline, before passing the
input to the models. It is critical to call this transform if
......@@ -384,10 +384,10 @@ class SanitizeBoundingBox(Transform):
)
flat_inputs, spec = tree_flatten(inputs)
# TODO: this enforces one single BoundingBox entry.
# TODO: this enforces one single BoundingBoxes entry.
# Assuming this transform needs to be called at the end of *any* pipeline that has bboxes...
# should we just enforce it for all transforms?? What are the benefits of *not* enforcing this?
boxes = query_bounding_box(flat_inputs)
boxes = query_bounding_boxes(flat_inputs)
if boxes.ndim != 2:
raise ValueError(f"boxes must be of shape (num_boxes, 4), got {boxes.shape}")
......@@ -398,8 +398,8 @@ class SanitizeBoundingBox(Transform):
)
boxes = cast(
datapoints.BoundingBox,
F.convert_format_bounding_box(
datapoints.BoundingBoxes,
F.convert_format_bounding_boxes(
boxes,
new_format=datapoints.BoundingBoxFormat.XYXY,
),
......@@ -415,7 +415,7 @@ class SanitizeBoundingBox(Transform):
params = dict(valid=valid, labels=labels)
flat_outputs = [
# Even though it may look like we're transforming all inputs, we don't:
# _transform() will only care about BoundingBoxes and the labels
self._transform(inpt, params)
for inpt in flat_inputs
]
......@@ -424,9 +424,9 @@ class SanitizeBoundingBox(Transform):
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
is_label = inpt is not None and inpt is params["labels"]
is_bounding_box_or_mask = isinstance(inpt, (datapoints.BoundingBox, datapoints.Mask))
is_bounding_boxes_or_mask = isinstance(inpt, (datapoints.BoundingBoxes, datapoints.Mask))
if not (is_label or is_bounding_box_or_mask):
if not (is_label or is_bounding_boxes_or_mask):
return inpt
output = inpt[params["valid"]]
......
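
And a sketch of the recommendation in the `SanitizeBoundingBoxes` docstring to clamp first, so that boxes that merely stick out of the image are clipped rather than dropped (hypothetical sample; the dict layout with a "labels" key is assumed to be picked up by the default `labels_getter`):

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import ClampBoundingBoxes, Compose, SanitizeBoundingBoxes

    sample = {
        "image": datapoints.Image(torch.zeros(3, 100, 100, dtype=torch.uint8)),
        "boxes": datapoints.BoundingBoxes(
            torch.tensor([[10., 10., 50., 50.], [60., 60., 103., 90.]]),  # second box exceeds the right edge
            format="xyxy", spatial_size=(100, 100),
        ),
        "labels": torch.tensor([1, 2]),
    }
    # Without ClampBoundingBoxes the second box has a coordinate outside the image and would be removed;
    # with it, the box is clipped to the image and kept.
    out = Compose([ClampBoundingBoxes(), SanitizeBoundingBoxes()])(sample)
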
......@@ -3,8 +3,8 @@ from torchvision.transforms import InterpolationMode # usort: skip
from ._utils import is_simple_tensor # usort: skip
from ._meta import (
clamp_bounding_box,
convert_format_bounding_box,
clamp_bounding_boxes,
convert_format_bounding_boxes,
get_dimensions_image_tensor,
get_dimensions_image_pil,
get_dimensions,
......@@ -15,7 +15,7 @@ from ._meta import (
get_num_channels_image_pil,
get_num_channels_video,
get_num_channels,
get_spatial_size_bounding_box,
get_spatial_size_bounding_boxes,
get_spatial_size_image_tensor,
get_spatial_size_image_pil,
get_spatial_size_mask,
......@@ -76,25 +76,25 @@ from ._color import (
)
from ._geometry import (
affine,
affine_bounding_box,
affine_bounding_boxes,
affine_image_pil,
affine_image_tensor,
affine_mask,
affine_video,
center_crop,
center_crop_bounding_box,
center_crop_bounding_boxes,
center_crop_image_pil,
center_crop_image_tensor,
center_crop_mask,
center_crop_video,
crop,
crop_bounding_box,
crop_bounding_boxes,
crop_image_pil,
crop_image_tensor,
crop_mask,
crop_video,
elastic,
elastic_bounding_box,
elastic_bounding_boxes,
elastic_image_pil,
elastic_image_tensor,
elastic_mask,
......@@ -106,37 +106,37 @@ from ._geometry import (
five_crop_video,
hflip, # TODO: Consider moving all pure alias definitions at the bottom of the file
horizontal_flip,
horizontal_flip_bounding_box,
horizontal_flip_bounding_boxes,
horizontal_flip_image_pil,
horizontal_flip_image_tensor,
horizontal_flip_mask,
horizontal_flip_video,
pad,
pad_bounding_box,
pad_bounding_boxes,
pad_image_pil,
pad_image_tensor,
pad_mask,
pad_video,
perspective,
perspective_bounding_box,
perspective_bounding_boxes,
perspective_image_pil,
perspective_image_tensor,
perspective_mask,
perspective_video,
resize,
resize_bounding_box,
resize_bounding_boxes,
resize_image_pil,
resize_image_tensor,
resize_mask,
resize_video,
resized_crop,
resized_crop_bounding_box,
resized_crop_bounding_boxes,
resized_crop_image_pil,
resized_crop_image_tensor,
resized_crop_mask,
resized_crop_video,
rotate,
rotate_bounding_box,
rotate_bounding_boxes,
rotate_image_pil,
rotate_image_tensor,
rotate_mask,
......@@ -146,7 +146,7 @@ from ._geometry import (
ten_crop_image_tensor,
ten_crop_video,
vertical_flip,
vertical_flip_bounding_box,
vertical_flip_bounding_boxes,
vertical_flip_image_pil,
vertical_flip_image_tensor,
vertical_flip_mask,
......
......@@ -23,7 +23,7 @@ from torchvision.transforms.functional import (
from torchvision.utils import _log_api_usage_once
from ._meta import clamp_bounding_box, convert_format_bounding_box, get_spatial_size_image_pil
from ._meta import clamp_bounding_boxes, convert_format_bounding_boxes, get_spatial_size_image_pil
from ._utils import is_simple_tensor
......@@ -51,21 +51,21 @@ def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor:
return horizontal_flip_image_tensor(mask)
def horizontal_flip_bounding_box(
bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int]
def horizontal_flip_bounding_boxes(
bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int]
) -> torch.Tensor:
shape = bounding_box.shape
shape = bounding_boxes.shape
bounding_box = bounding_box.clone().reshape(-1, 4)
bounding_boxes = bounding_boxes.clone().reshape(-1, 4)
if format == datapoints.BoundingBoxFormat.XYXY:
bounding_box[:, [2, 0]] = bounding_box[:, [0, 2]].sub_(spatial_size[1]).neg_()
bounding_boxes[:, [2, 0]] = bounding_boxes[:, [0, 2]].sub_(spatial_size[1]).neg_()
elif format == datapoints.BoundingBoxFormat.XYWH:
bounding_box[:, 0].add_(bounding_box[:, 2]).sub_(spatial_size[1]).neg_()
bounding_boxes[:, 0].add_(bounding_boxes[:, 2]).sub_(spatial_size[1]).neg_()
else: # format == datapoints.BoundingBoxFormat.CXCYWH:
bounding_box[:, 0].sub_(spatial_size[1]).neg_()
bounding_boxes[:, 0].sub_(spatial_size[1]).neg_()
return bounding_box.reshape(shape)
return bounding_boxes.reshape(shape)
def horizontal_flip_video(video: torch.Tensor) -> torch.Tensor:
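
A quick check of the renamed flip kernel on a plain tensor (hypothetical box; (height, width) = (100, 200)):

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    boxes = torch.tensor([[10., 20., 50., 80.]])  # XYXY
    flipped = F.horizontal_flip_bounding_boxes(
        boxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(100, 200)
    )
    # x coordinates are mirrored around the image width: [[150., 20., 190., 80.]]
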
......@@ -101,21 +101,21 @@ def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor:
return vertical_flip_image_tensor(mask)
def vertical_flip_bounding_box(
bounding_box: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int]
def vertical_flip_bounding_boxes(
bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int]
) -> torch.Tensor:
shape = bounding_box.shape
shape = bounding_boxes.shape
bounding_box = bounding_box.clone().reshape(-1, 4)
bounding_boxes = bounding_boxes.clone().reshape(-1, 4)
if format == datapoints.BoundingBoxFormat.XYXY:
bounding_box[:, [1, 3]] = bounding_box[:, [3, 1]].sub_(spatial_size[0]).neg_()
bounding_boxes[:, [1, 3]] = bounding_boxes[:, [3, 1]].sub_(spatial_size[0]).neg_()
elif format == datapoints.BoundingBoxFormat.XYWH:
bounding_box[:, 1].add_(bounding_box[:, 3]).sub_(spatial_size[0]).neg_()
bounding_boxes[:, 1].add_(bounding_boxes[:, 3]).sub_(spatial_size[0]).neg_()
else: # format == datapoints.BoundingBoxFormat.CXCYWH:
bounding_box[:, 1].sub_(spatial_size[0]).neg_()
bounding_boxes[:, 1].sub_(spatial_size[0]).neg_()
return bounding_box.reshape(shape)
return bounding_boxes.reshape(shape)
def vertical_flip_video(video: torch.Tensor) -> torch.Tensor:
......@@ -274,20 +274,20 @@ def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = N
return output
def resize_bounding_box(
bounding_box: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None
def resize_bounding_boxes(
bounding_boxes: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None
) -> Tuple[torch.Tensor, Tuple[int, int]]:
old_height, old_width = spatial_size
new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size)
if (new_height, new_width) == (old_height, old_width):
return bounding_box, spatial_size
return bounding_boxes, spatial_size
w_ratio = new_width / old_width
h_ratio = new_height / old_height
ratios = torch.tensor([w_ratio, h_ratio, w_ratio, h_ratio], device=bounding_box.device)
ratios = torch.tensor([w_ratio, h_ratio, w_ratio, h_ratio], device=bounding_boxes.device)
return (
bounding_box.mul(ratios).to(bounding_box.dtype),
bounding_boxes.mul(ratios).to(bounding_boxes.dtype),
(new_height, new_width),
)
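
Likewise for the renamed resize kernel, which just rescales the coordinates by the width/height ratios (hypothetical values):

    import torch
    from torchvision.transforms.v2 import functional as F

    boxes = torch.tensor([[10., 20., 50., 80.]])  # XYXY on a 100x200 image
    resized, new_size = F.resize_bounding_boxes(boxes, spatial_size=(100, 200), size=[50])
    # The shorter side (100) is resized to 50, so both ratios are 0.5 here:
    # resized == [[5., 10., 25., 40.]], new_size == (50, 100)
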
......@@ -650,8 +650,8 @@ def affine_image_pil(
return _FP.affine(image, matrix, interpolation=pil_modes_mapping[interpolation], fill=fill)
def _affine_bounding_box_with_expand(
bounding_box: torch.Tensor,
def _affine_bounding_boxes_with_expand(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
spatial_size: Tuple[int, int],
angle: Union[int, float],
......@@ -661,17 +661,17 @@ def _affine_bounding_box_with_expand(
center: Optional[List[float]] = None,
expand: bool = False,
) -> Tuple[torch.Tensor, Tuple[int, int]]:
if bounding_box.numel() == 0:
return bounding_box, spatial_size
original_shape = bounding_box.shape
original_dtype = bounding_box.dtype
bounding_box = bounding_box.clone() if bounding_box.is_floating_point() else bounding_box.float()
dtype = bounding_box.dtype
device = bounding_box.device
bounding_box = (
convert_format_bounding_box(
bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
if bounding_boxes.numel() == 0:
return bounding_boxes, spatial_size
original_shape = bounding_boxes.shape
original_dtype = bounding_boxes.dtype
bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float()
dtype = bounding_boxes.dtype
device = bounding_boxes.device
bounding_boxes = (
convert_format_bounding_boxes(
bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
)
).reshape(-1, 4)
......@@ -697,7 +697,7 @@ def _affine_bounding_box_with_expand(
# Tensor of points has shape (N * 4, 3), where N is the number of bboxes
# Single point structure is similar to
# [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)]
points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
points = torch.cat([points, torch.ones(points.shape[0], 1, device=device, dtype=dtype)], dim=-1)
# 2) Now let's transform the points using affine matrix
transformed_points = torch.matmul(points, transposed_affine_matrix)
......@@ -730,8 +730,8 @@ def _affine_bounding_box_with_expand(
new_width, new_height = _compute_affine_output_size(affine_vector, width, height)
spatial_size = (new_height, new_width)
out_bboxes = clamp_bounding_box(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size)
out_bboxes = convert_format_bounding_box(
out_bboxes = clamp_bounding_boxes(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size)
out_bboxes = convert_format_bounding_boxes(
out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
).reshape(original_shape)
......@@ -739,8 +739,8 @@ def _affine_bounding_box_with_expand(
return out_bboxes, spatial_size
def affine_bounding_box(
bounding_box: torch.Tensor,
def affine_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
spatial_size: Tuple[int, int],
angle: Union[int, float],
......@@ -749,8 +749,8 @@ def affine_bounding_box(
shear: List[float],
center: Optional[List[float]] = None,
) -> torch.Tensor:
out_box, _ = _affine_bounding_box_with_expand(
bounding_box,
out_box, _ = _affine_bounding_boxes_with_expand(
bounding_boxes,
format=format,
spatial_size=spatial_size,
angle=angle,
......@@ -927,8 +927,8 @@ def rotate_image_pil(
)
def rotate_bounding_box(
bounding_box: torch.Tensor,
def rotate_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
spatial_size: Tuple[int, int],
angle: float,
......@@ -938,8 +938,8 @@ def rotate_bounding_box(
if center is not None and expand:
warnings.warn("The provided center argument has no effect on the result if expand is True")
return _affine_bounding_box_with_expand(
bounding_box,
return _affine_bounding_boxes_with_expand(
bounding_boxes,
format=format,
spatial_size=spatial_size,
angle=-angle,
......@@ -1165,8 +1165,8 @@ def pad_mask(
return output
def pad_bounding_box(
bounding_box: torch.Tensor,
def pad_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
spatial_size: Tuple[int, int],
padding: List[int],
......@@ -1182,14 +1182,14 @@ def pad_bounding_box(
pad = [left, top, left, top]
else:
pad = [left, top, 0, 0]
bounding_box = bounding_box + torch.tensor(pad, dtype=bounding_box.dtype, device=bounding_box.device)
bounding_boxes = bounding_boxes + torch.tensor(pad, dtype=bounding_boxes.dtype, device=bounding_boxes.device)
height, width = spatial_size
height += top + bottom
width += left + right
spatial_size = (height, width)
return clamp_bounding_box(bounding_box, format=format, spatial_size=spatial_size), spatial_size
return clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size), spatial_size
def pad_video(
......@@ -1245,8 +1245,8 @@ def crop_image_tensor(image: torch.Tensor, top: int, left: int, height: int, wid
crop_image_pil = _FP.crop
def crop_bounding_box(
bounding_box: torch.Tensor,
def crop_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
top: int,
left: int,
......@@ -1260,10 +1260,10 @@ def crop_bounding_box(
else:
sub = [left, top, 0, 0]
bounding_box = bounding_box - torch.tensor(sub, dtype=bounding_box.dtype, device=bounding_box.device)
bounding_boxes = bounding_boxes - torch.tensor(sub, dtype=bounding_boxes.dtype, device=bounding_boxes.device)
spatial_size = (height, width)
return clamp_bounding_box(bounding_box, format=format, spatial_size=spatial_size), spatial_size
return clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size), spatial_size
def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
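
The renamed pad/crop kernels simply shift the coordinates by the (left, top) offset and clamp to the new spatial size; a round-trip sketch with hypothetical values:

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    fmt = datapoints.BoundingBoxFormat.XYXY
    boxes = torch.tensor([[10., 10., 40., 40.]])
    # Pad 5 pixels on every side: coordinates shift by (+5, +5), the image grows to 110x110.
    padded, padded_size = F.pad_bounding_boxes(
        boxes, format=fmt, spatial_size=(100, 100), padding=[5, 5, 5, 5]
    )
    # Cropping that padding away shifts them back by (-5, -5).
    cropped, cropped_size = F.crop_bounding_boxes(padded, fmt, top=5, left=5, height=100, width=100)
    assert torch.equal(cropped, boxes) and cropped_size == (100, 100)
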
......@@ -1409,27 +1409,27 @@ def perspective_image_pil(
return _FP.perspective(image, perspective_coeffs, interpolation=pil_modes_mapping[interpolation], fill=fill)
def perspective_bounding_box(
bounding_box: torch.Tensor,
def perspective_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
spatial_size: Tuple[int, int],
startpoints: Optional[List[List[int]]],
endpoints: Optional[List[List[int]]],
coefficients: Optional[List[float]] = None,
) -> torch.Tensor:
if bounding_box.numel() == 0:
return bounding_box
if bounding_boxes.numel() == 0:
return bounding_boxes
perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
original_shape = bounding_box.shape
# TODO: first cast to float if bbox is int64 before convert_format_bounding_box
bounding_box = (
convert_format_bounding_box(bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
original_shape = bounding_boxes.shape
# TODO: first cast to float if bbox is int64 before convert_format_bounding_boxes
bounding_boxes = (
convert_format_bounding_boxes(bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
).reshape(-1, 4)
dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32
device = bounding_box.device
dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32
device = bounding_boxes.device
# perspective_coeffs are computed as endpoint -> start point
# We have to invert perspective_coeffs for bboxes:
......@@ -1475,7 +1475,7 @@ def perspective_bounding_box(
# Tensor of points has shape (N * 4, 3), where N is the number of bboxes
# Single point structure is similar to
# [(xmin, ymin, 1), (xmax, ymin, 1), (xmax, ymax, 1), (xmin, ymax, 1)]
points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
points = torch.cat([points, torch.ones(points.shape[0], 1, device=points.device)], dim=-1)
# 2) Now let's transform the points using perspective matrices
# x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
......@@ -1490,15 +1490,15 @@ def perspective_bounding_box(
transformed_points = transformed_points.reshape(-1, 4, 2)
out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
out_bboxes = clamp_bounding_box(
torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype),
out_bboxes = clamp_bounding_boxes(
torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype),
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=spatial_size,
)
# out_bboxes should be of shape [N boxes, 4]
return convert_format_bounding_box(
return convert_format_bounding_boxes(
out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
).reshape(original_shape)
......@@ -1648,26 +1648,26 @@ def _create_identity_grid(size: Tuple[int, int], device: torch.device, dtype: to
return base_grid
def elastic_bounding_box(
bounding_box: torch.Tensor,
def elastic_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
spatial_size: Tuple[int, int],
displacement: torch.Tensor,
) -> torch.Tensor:
if bounding_box.numel() == 0:
return bounding_box
if bounding_boxes.numel() == 0:
return bounding_boxes
# TODO: add in docstring about approximation we are doing for grid inversion
device = bounding_box.device
dtype = bounding_box.dtype if torch.is_floating_point(bounding_box) else torch.float32
device = bounding_boxes.device
dtype = bounding_boxes.dtype if torch.is_floating_point(bounding_boxes) else torch.float32
if displacement.dtype != dtype or displacement.device != device:
displacement = displacement.to(dtype=dtype, device=device)
original_shape = bounding_box.shape
# TODO: first cast to float if bbox is int64 before convert_format_bounding_box
bounding_box = (
convert_format_bounding_box(bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
original_shape = bounding_boxes.shape
# TODO: first cast to float if bbox is int64 before convert_format_bounding_boxes
bounding_boxes = (
convert_format_bounding_boxes(bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY)
).reshape(-1, 4)
id_grid = _create_identity_grid(spatial_size, device=device, dtype=dtype)
......@@ -1676,7 +1676,7 @@ def elastic_bounding_box(
inv_grid = id_grid.sub_(displacement)
# Get points from bboxes
points = bounding_box[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
points = bounding_boxes[:, [[0, 1], [2, 1], [2, 3], [0, 3]]].reshape(-1, 2)
if points.is_floating_point():
points = points.ceil_()
index_xy = points.to(dtype=torch.long)
......@@ -1688,13 +1688,13 @@ def elastic_bounding_box(
transformed_points = transformed_points.reshape(-1, 4, 2)
out_bbox_mins, out_bbox_maxs = torch.aminmax(transformed_points, dim=1)
out_bboxes = clamp_bounding_box(
torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype),
out_bboxes = clamp_bounding_boxes(
torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype),
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=spatial_size,
)
return convert_format_bounding_box(
return convert_format_bounding_boxes(
out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True
).reshape(original_shape)
......@@ -1818,15 +1818,17 @@ def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL
return crop_image_pil(image, crop_top, crop_left, crop_height, crop_width)
def center_crop_bounding_box(
bounding_box: torch.Tensor,
def center_crop_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
spatial_size: Tuple[int, int],
output_size: List[int],
) -> Tuple[torch.Tensor, Tuple[int, int]]:
crop_height, crop_width = _center_crop_parse_output_size(output_size)
crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *spatial_size)
return crop_bounding_box(bounding_box, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width)
return crop_bounding_boxes(
bounding_boxes, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width
)
def center_crop_mask(mask: torch.Tensor, output_size: List[int]) -> torch.Tensor:
......@@ -1893,8 +1895,8 @@ def resized_crop_image_pil(
return resize_image_pil(image, size, interpolation=interpolation)
def resized_crop_bounding_box(
bounding_box: torch.Tensor,
def resized_crop_bounding_boxes(
bounding_boxes: torch.Tensor,
format: datapoints.BoundingBoxFormat,
top: int,
left: int,
......@@ -1902,8 +1904,8 @@ def resized_crop_bounding_box(
width: int,
size: List[int],
) -> Tuple[torch.Tensor, Tuple[int, int]]:
bounding_box, _ = crop_bounding_box(bounding_box, format, top, left, height, width)
return resize_bounding_box(bounding_box, spatial_size=(height, width), size=size)
bounding_boxes, _ = crop_bounding_boxes(bounding_boxes, format, top, left, height, width)
return resize_bounding_boxes(bounding_boxes, spatial_size=(height, width), size=size)
def resized_crop_mask(
......
......@@ -109,8 +109,8 @@ def get_spatial_size_mask(mask: torch.Tensor) -> List[int]:
@torch.jit.unused
def get_spatial_size_bounding_box(bounding_box: datapoints.BoundingBox) -> List[int]:
return list(bounding_box.spatial_size)
def get_spatial_size_bounding_boxes(bounding_boxes: datapoints.BoundingBoxes) -> List[int]:
return list(bounding_boxes.spatial_size)
def get_spatial_size(inpt: datapoints._InputTypeJIT) -> List[int]:
......@@ -119,7 +119,7 @@ def get_spatial_size(inpt: datapoints._InputTypeJIT) -> List[int]:
if torch.jit.is_scripting() or is_simple_tensor(inpt):
return get_spatial_size_image_tensor(inpt)
elif isinstance(inpt, (datapoints.Image, datapoints.Video, datapoints.BoundingBox, datapoints.Mask)):
elif isinstance(inpt, (datapoints.Image, datapoints.Video, datapoints.BoundingBoxes, datapoints.Mask)):
return list(inpt.spatial_size)
elif isinstance(inpt, PIL.Image.Image):
return get_spatial_size_image_pil(inpt)
......@@ -185,95 +185,97 @@ def _xyxy_to_cxcywh(xyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
return xyxy
def _convert_format_bounding_box(
bounding_box: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat, inplace: bool = False
def _convert_format_bounding_boxes(
bounding_boxes: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat, inplace: bool = False
) -> torch.Tensor:
if new_format == old_format:
return bounding_box
return bounding_boxes
# TODO: Add _xywh_to_cxcywh and _cxcywh_to_xywh to improve performance
if old_format == BoundingBoxFormat.XYWH:
bounding_box = _xywh_to_xyxy(bounding_box, inplace)
bounding_boxes = _xywh_to_xyxy(bounding_boxes, inplace)
elif old_format == BoundingBoxFormat.CXCYWH:
bounding_box = _cxcywh_to_xyxy(bounding_box, inplace)
bounding_boxes = _cxcywh_to_xyxy(bounding_boxes, inplace)
if new_format == BoundingBoxFormat.XYWH:
bounding_box = _xyxy_to_xywh(bounding_box, inplace)
bounding_boxes = _xyxy_to_xywh(bounding_boxes, inplace)
elif new_format == BoundingBoxFormat.CXCYWH:
bounding_box = _xyxy_to_cxcywh(bounding_box, inplace)
bounding_boxes = _xyxy_to_cxcywh(bounding_boxes, inplace)
return bounding_box
return bounding_boxes
def convert_format_bounding_box(
def convert_format_bounding_boxes(
inpt: datapoints._InputTypeJIT,
old_format: Optional[BoundingBoxFormat] = None,
new_format: Optional[BoundingBoxFormat] = None,
inplace: bool = False,
) -> datapoints._InputTypeJIT:
# This being a kernel / dispatcher hybrid, we need an option to pass `old_format` explicitly for simple tensor
# inputs as well as extract it from `datapoints.BoundingBox` inputs. However, putting a default value on
# inputs as well as extract it from `datapoints.BoundingBoxes` inputs. However, putting a default value on
# `old_format` means we also need to put one on `new_format` to have syntactically correct Python. Here we mimic the
# default error that would be thrown if `new_format` had no default value.
if new_format is None:
raise TypeError("convert_format_bounding_box() missing 1 required argument: 'new_format'")
raise TypeError("convert_format_bounding_boxes() missing 1 required argument: 'new_format'")
if not torch.jit.is_scripting():
_log_api_usage_once(convert_format_bounding_box)
_log_api_usage_once(convert_format_bounding_boxes)
if torch.jit.is_scripting() or is_simple_tensor(inpt):
if old_format is None:
raise ValueError("For simple tensor inputs, `old_format` has to be passed.")
return _convert_format_bounding_box(inpt, old_format=old_format, new_format=new_format, inplace=inplace)
elif isinstance(inpt, datapoints.BoundingBox):
return _convert_format_bounding_boxes(inpt, old_format=old_format, new_format=new_format, inplace=inplace)
elif isinstance(inpt, datapoints.BoundingBoxes):
if old_format is not None:
raise ValueError("For bounding box datapoint inputs, `old_format` must not be passed.")
output = _convert_format_bounding_box(
output = _convert_format_bounding_boxes(
inpt.as_subclass(torch.Tensor), old_format=inpt.format, new_format=new_format, inplace=inplace
)
return datapoints.BoundingBox.wrap_like(inpt, output, format=new_format)
return datapoints.BoundingBoxes.wrap_like(inpt, output, format=new_format)
else:
raise TypeError(
f"Input can either be a plain tensor or a bounding box datapoint, but got {type(inpt)} instead."
)
def _clamp_bounding_box(
bounding_box: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int]
def _clamp_bounding_boxes(
bounding_boxes: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int]
) -> torch.Tensor:
# TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
# BoundingBoxFormat instead of converting back and forth
in_dtype = bounding_box.dtype
bounding_box = bounding_box.clone() if bounding_box.is_floating_point() else bounding_box.float()
xyxy_boxes = convert_format_bounding_box(
bounding_box, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
in_dtype = bounding_boxes.dtype
bounding_boxes = bounding_boxes.clone() if bounding_boxes.is_floating_point() else bounding_boxes.float()
xyxy_boxes = convert_format_bounding_boxes(
bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True
)
xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1])
xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0])
out_boxes = convert_format_bounding_box(
out_boxes = convert_format_bounding_boxes(
xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True
)
return out_boxes.to(in_dtype)
def clamp_bounding_box(
def clamp_bounding_boxes(
inpt: datapoints._InputTypeJIT,
format: Optional[BoundingBoxFormat] = None,
spatial_size: Optional[Tuple[int, int]] = None,
) -> datapoints._InputTypeJIT:
if not torch.jit.is_scripting():
_log_api_usage_once(clamp_bounding_box)
_log_api_usage_once(clamp_bounding_boxes)
if torch.jit.is_scripting() or is_simple_tensor(inpt):
if format is None or spatial_size is None:
raise ValueError("For simple tensor inputs, `format` and `spatial_size` has to be passed.")
return _clamp_bounding_box(inpt, format=format, spatial_size=spatial_size)
elif isinstance(inpt, datapoints.BoundingBox):
return _clamp_bounding_boxes(inpt, format=format, spatial_size=spatial_size)
elif isinstance(inpt, datapoints.BoundingBoxes):
if format is not None or spatial_size is not None:
raise ValueError("For bounding box datapoint inputs, `format` and `spatial_size` must not be passed.")
output = _clamp_bounding_box(inpt.as_subclass(torch.Tensor), format=inpt.format, spatial_size=inpt.spatial_size)
return datapoints.BoundingBox.wrap_like(inpt, output)
output = _clamp_bounding_boxes(
inpt.as_subclass(torch.Tensor), format=inpt.format, spatial_size=inpt.spatial_size
)
return datapoints.BoundingBoxes.wrap_like(inpt, output)
else:
raise TypeError(
f"Input can either be a plain tensor or a bounding box datapoint, but got {type(inpt)} instead."
......
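
For the two renamed kernel/dispatcher hybrids above, a sketch of both call forms (hypothetical values): plain tensors need the metadata passed explicitly, while `BoundingBoxes` datapoints carry it themselves and must not receive it again:

    import torch
    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    xywh = torch.tensor([[10., 20., 30., 40.]])

    # Plain tensor inputs.
    xyxy = F.convert_format_bounding_boxes(
        xywh, old_format=datapoints.BoundingBoxFormat.XYWH, new_format=datapoints.BoundingBoxFormat.XYXY
    )
    clamped = F.clamp_bounding_boxes(
        xyxy, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(25, 35)
    )

    # BoundingBoxes datapoint inputs: format and spatial_size are read from the datapoint.
    boxes = datapoints.BoundingBoxes(xywh, format="xywh", spatial_size=(25, 35))
    boxes = F.convert_format_bounding_boxes(boxes, new_format=datapoints.BoundingBoxFormat.XYXY)
    boxes = F.clamp_bounding_boxes(boxes)
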
......@@ -9,8 +9,8 @@ from torchvision._utils import sequence_to_str
from torchvision.transforms.v2.functional import get_dimensions, get_spatial_size, is_simple_tensor
def query_bounding_box(flat_inputs: List[Any]) -> datapoints.BoundingBox:
bounding_boxes = [inpt for inpt in flat_inputs if isinstance(inpt, datapoints.BoundingBox)]
def query_bounding_boxes(flat_inputs: List[Any]) -> datapoints.BoundingBoxes:
bounding_boxes = [inpt for inpt in flat_inputs if isinstance(inpt, datapoints.BoundingBoxes)]
if not bounding_boxes:
raise TypeError("No bounding box was found in the sample")
elif len(bounding_boxes) > 1:
......@@ -37,7 +37,7 @@ def query_spatial_size(flat_inputs: List[Any]) -> Tuple[int, int]:
tuple(get_spatial_size(inpt))
for inpt in flat_inputs
if isinstance(
inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video, datapoints.Mask, datapoints.BoundingBox)
inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video, datapoints.Mask, datapoints.BoundingBoxes)
)
or is_simple_tensor(inpt)
}
......