Unverified commit 6f72b76c, authored by Nicolas Hug, committed by GitHub

Re-write getting started guide for transforms V2 (#7870)


Co-authored-by: vfdev <vfdev.5@gmail.com>
Co-authored-by: Philip Meier <github.pmeier@posteo.de>
parent 11e49de4
@@ -83,6 +83,7 @@ sphinx_gallery_conf = {
     "backreferences_dir": "gen_modules/backreferences",
     "doc_module": ("torchvision",),
     "remove_config_comments": True,
+    "ignore_pattern": "helpers.py",
 }

 napoleon_use_ivar = True
...
@@ -45,13 +45,17 @@ tasks (image classification, detection, segmentation, video classification).
 Transforms are typically passed as the ``transform`` or ``transforms`` argument
 to the :ref:`Datasets <datasets>`.

+.. TODO: Reader guide, i.e. what to read depending on what you're looking for
 .. TODO: add link to getting started guide here.

+.. _conventions:
+
 Supported input types and conventions
 -------------------------------------

 Most transformations accept both `PIL <https://pillow.readthedocs.io>`_ images
-and tensor images. The result of both backends (PIL or Tensors) should be very
+and tensor inputs. Both CPU and CUDA tensors are supported.
+The result of both backends (PIL or Tensors) should be very
 close. In general, we recommend relying on the tensor backend :ref:`for
 performance <transforms_perf>`. The :ref:`conversion transforms
 <conversion_transforms>` may be used to convert to and from PIL images, or for
@@ -152,13 +156,15 @@ The above should give you the best performance in a typical training environment
 that relies on the :class:`torch.utils.data.DataLoader` with ``num_workers >
 0``.

-Transforms tend to be sensitive to the input strides / memory layout. Some
+Transforms tend to be sensitive to the input strides / memory format. Some
 transforms will be faster with channels-first images while others prefer
-channels-last. You may want to experiment a bit if you're chasing the very
-best performance. Using :func:`torch.compile` on individual transforms may
-also help factoring out the memory layout variable (e.g. on
-:class:`~torchvision.transforms.v2.Normalize`). Note that we're talking about
-**memory layout**, not tensor shape.
+channels-last. Like ``torch`` operators, most transforms will preserve the
+memory format of the input, but this may not always be respected due to
+implementation details. You may want to experiment a bit if you're chasing the
+very best performance. Using :func:`torch.compile` on individual transforms may
+also help factoring out the memory format variable (e.g. on
+:class:`~torchvision.transforms.v2.Normalize`). Note that we're talking about
+**memory format**, not :ref:`tensor shape <conventions>`.

 Note that resize transforms like :class:`~torchvision.transforms.v2.Resize`
 and :class:`~torchvision.transforms.v2.RandomResizedCrop` typically prefer
...
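To make the memory-format advice above concrete, here is a minimal sketch of ours (not part of the diff; it assumes a batched float pipeline, since ``channels_last`` applies to 4D tensors):

    import torch
    from torchvision.transforms import v2

    # A batch of images converted to channels-last memory format.
    imgs = torch.rand(8, 3, 224, 224).to(memory_format=torch.channels_last)

    # Compiling an individual transform such as Normalize, as the docs suggest,
    # may help absorb the memory-format variable.
    normalize = torch.compile(v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
    out = normalize(imgs)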
import matplotlib.pyplot as plt
from torchvision.utils import draw_bounding_boxes


def plot(imgs):
    if not isinstance(imgs[0], list):
        # Make a 2d grid even if there's just 1 row
        imgs = [imgs]

    num_rows = len(imgs)
    num_cols = len(imgs[0])
    _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
    for row_idx, row in enumerate(imgs):
        for col_idx, img in enumerate(row):
            bboxes = None
            if isinstance(img, tuple):
                bboxes = img[1]
                img = img[0]
                if isinstance(bboxes, dict):
                    bboxes = bboxes['bboxes']
            if img.dtype.is_floating_point and img.min() < 0:
                # Poor man's re-normalization for the colors to be OK-ish. This
                # is useful for images coming out of Normalize()
                img -= img.min()
                img /= img.max()
            if bboxes is not None:
                img = draw_bounding_boxes(img, bboxes, colors="yellow", width=3)
            ax = axs[row_idx, col_idx]
            ax.imshow(img.permute(1, 2, 0).numpy())
            ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

    plt.tight_layout()
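A quick usage sketch for this helper (our illustration with dummy tensors, not part of the commit):

    import torch

    imgs = torch.randint(0, 256, (2, 3, 64, 64), dtype=torch.uint8)
    boxes = torch.tensor([[8, 8, 48, 48]])      # one XYXY box
    plot([imgs[0], imgs[1]])                    # a single row of two images
    plot([(imgs[0], boxes), (imgs[1], boxes)])  # tuples draw the boxes on each image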
@@ -28,7 +28,6 @@ import PIL.Image
 import torch

 from torchvision import datapoints
-from torchvision.transforms.v2 import functional as F

 # %%
@@ -119,83 +118,10 @@ new_bboxes = datapoints.wrap(new_bboxes, like=bboxes)
 assert isinstance(new_bboxes, datapoints.BoundingBoxes)
 assert new_bboxes.canvas_size == bboxes.canvas_size

 # %%
 # The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass
 # it as a parameter to override it.
 #
-# Do I have to wrap the output of the datasets myself?
-# ----------------------------------------------------
-#
-# TODO: Move this in another guide - this is user-facing, not dev-facing.
-#
-# Only if you are using custom datasets. For the built-in ones, you can use
-# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2`. Note that the function also supports subclasses of the
-# built-in datasets. Meaning, if your custom dataset subclasses from a built-in one and the output type is the same, you
-# also don't have to wrap manually.
-#
-# If you have a custom dataset, for example the ``PennFudanDataset`` from
-# `this tutorial <https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html>`_, you have two options:
-#
-# 1. Perform the wrapping inside ``__getitem__``:
-
-class PennFudanDataset(torch.utils.data.Dataset):
-    ...
-
-    def __getitem__(self, item):
-        ...
-
-        target["bboxes"] = datapoints.BoundingBoxes(
-            bboxes,
-            format=datapoints.BoundingBoxFormat.XYXY,
-            canvas_size=F.get_size(img),
-        )
-        target["labels"] = labels
-        target["masks"] = datapoints.Mask(masks)
-
-        ...
-
-        if self.transforms is not None:
-            img, target = self.transforms(img, target)
-
-        ...
-
-# %%
-# 2. Perform the wrapping inside a custom transformation at the beginning of your pipeline:
-
-class WrapPennFudanDataset:
-    def __call__(self, img, target):
-        target["boxes"] = datapoints.BoundingBoxes(
-            target["boxes"],
-            format=datapoints.BoundingBoxFormat.XYXY,
-            canvas_size=F.get_size(img),
-        )
-        target["masks"] = datapoints.Mask(target["masks"])
-        return img, target
-
-...
-
-def get_transform(train):
-    transforms = []
-    transforms.append(WrapPennFudanDataset())
-    transforms.append(T.PILToTensor())
-    ...
-
-# %%
-# .. note::
-#
-#    If both :class:`~torchvision.datapoints.BoundingBoxes` and :class:`~torchvision.datapoints.Mask`'s are included in
-#    the sample, ``torchvision.transforms.v2`` will transform them both. Meaning, if you don't need both, dropping or
-#    at least not wrapping the obsolete parts can lead to a significant performance boost.
-#
-#    For example, if you are using the ``PennFudanDataset`` for object detection, not wrapping the masks avoids
-#    transforming them over and over again in the pipeline just to ultimately ignore them. In general, it would be
-#    even better to not load the masks at all, but this is not possible in this example, since the bounding boxes are
-#    generated from the masks.
-#
 # .. _datapoint_unwrapping_behaviour:
 #
 # I had a Datapoint but now I have a Tensor. Help!
...
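Since the kept text above notes that the ``datapoints.wrap`` metadata "could be passed as a parameter to override it", here is a minimal sketch of ours of that override (assuming ``wrap`` forwards keyword arguments such as ``canvas_size``, as its docs describe):

    import torch
    from torchvision import datapoints

    bboxes = datapoints.BoundingBoxes(
        torch.tensor([[10, 10, 50, 50]]), format="XYXY", canvas_size=(100, 100)
    )
    shifted = bboxes + 5  # plain tensor: most operators unwrap datapoints
    new_bboxes = datapoints.wrap(shifted, like=bboxes, canvas_size=(200, 200))
    assert new_bboxes.canvas_size == (200, 200)  # metadata overridden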
@@ -7,103 +7,254 @@ Getting started with transforms v2

Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_transforms_v2.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_v2_transforms_plot_transforms_v2.py>` to download the full example code.

-Most computer vision tasks are not supported out of the box by ``torchvision.transforms`` v1, since it only supports
-images. ``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This
-example showcases the core functionality of the new ``torchvision.transforms.v2`` API.
+This example illustrates all of what you need to know to get started with the
+new :mod:`torchvision.transforms.v2` API. We'll cover simple tasks like
+image classification, and more advanced ones like object detection /
+segmentation.
"""
-import pathlib
-
-import torch
-
-
-def load_data():
-    from torchvision.io import read_image
-    from torchvision import datapoints
-    from torchvision.ops import masks_to_boxes
-
-    assets_directory = pathlib.Path("../assets")
-
-    path = assets_directory / "FudanPed00054.png"
-    image = datapoints.Image(read_image(str(path)))
-    merged_masks = read_image(str(assets_directory / "FudanPed00054_mask.png"))
-
-    labels = torch.unique(merged_masks)[1:]
-
-    masks = datapoints.Mask(merged_masks == labels.view(-1, 1, 1))
-
-    bounding_boxes = datapoints.BoundingBoxes(
-        masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, canvas_size=image.shape[-2:]
-    )
-
-    return path, image, bounding_boxes, masks, labels
-
-
-# %%
-# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation
-# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object
-# detection or instance and semantic segmentation. Still, the interface is the same, making
-# :mod:`torchvision.transforms.v2` a drop-in replacement for the existing :mod:`torchvision.transforms` API, aka v1.
-
-import torchvision.transforms.v2 as transforms
-
-transform = transforms.Compose(
-    [
-        transforms.ColorJitter(contrast=0.5),
-        transforms.RandomRotation(30),
-        transforms.CenterCrop(480),
-    ]
-)
-
-# %%
-# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that
-# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or
-# order.
-
-path, image, bounding_boxes, masks, labels = load_data()
-
-torch.manual_seed(0)
-new_image = transform(image)  # Image Classification
-new_image, new_bounding_boxes, new_labels = transform(image, bounding_boxes, labels)  # Object Detection
-new_image, new_bounding_boxes, new_masks, new_labels = transform(
-    image, bounding_boxes, masks, labels
-)  # Instance Segmentation
-new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels}))  # Arbitrary Structure
-
-# %%
-# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the
-# appropriate function for the input data: :ref:`sphx_glr_auto_examples_v2_transforms_plot_datapoints.py`. Note however, that as
-# regular user, you likely don't have to touch this yourself. See
-# :ref:`sphx_glr_auto_examples_v2_transforms_plot_transforms_v2_e2e.py`.
-#
-# All "foreign" types like :class:`str`'s or :class:`pathlib.Path`'s are passed through, allowing to store extra
-# information directly with the sample:
-
-sample = {"path": path, "image": image}
-new_sample = transform(sample)
-
-assert new_sample["path"] is sample["path"]
-
-# %%
-# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
-# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
-# simple heuristic:
-#
-# * If we find an explicit image or video (:class:`torchvision.datapoints.Image`, :class:`torchvision.datapoints.Video`,
-#   or :class:`PIL.Image.Image`) in the input, all other plain tensors are passed through.
-# * If there is no explicit image or video, only the first plain :class:`torch.Tensor` will be transformed as image or
-#   video, while all others will be passed through.
-
-plain_tensor_image = torch.rand(image.shape)
-
-print(image.shape, plain_tensor_image.shape)
-
-# passing a plain tensor together with an explicit image, will not transform the former
-plain_tensor_image, image = transform(plain_tensor_image, image)
-
-print(image.shape, plain_tensor_image.shape)
-
-# passing a plain tensor without an explicit image, will transform the former
-plain_tensor_image, _ = transform(plain_tensor_image, bounding_boxes)
-
-print(image.shape, plain_tensor_image.shape)

# %%
# First, a bit of setup

from pathlib import Path

import torch
import matplotlib.pyplot as plt
plt.rcParams["savefig.bbox"] = 'tight'

from torchvision.transforms import v2
from torchvision.io import read_image

torch.manual_seed(1)

# If you're trying to run this on Colab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
from helpers import plot

img = read_image(str(Path('../assets') / 'astronaut.jpg'))
print(f"{type(img) = }, {img.dtype = }, {img.shape = }")

# %%
# The basics
# ----------
#
# The Torchvision transforms behave like a regular :class:`torch.nn.Module` (in
# fact, most of them are): instantiate a transform, pass an input, get a
# transformed output:

transform = v2.RandomCrop(size=(224, 224))
out = transform(img)

plot([img, out])

# %%
# I just want to do image classification
# --------------------------------------
#
# If you just care about image classification, things are very simple. A basic
# classification pipeline may look like this:

transforms = v2.Compose([
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

out = transforms(img)

plot([img, out])
# %%
# Such a transformation pipeline is typically passed as the ``transform`` argument
# to the :ref:`Datasets <datasets>`, e.g. ``ImageNet(...,
# transform=transforms)``.
#
# That's pretty much all there is. From there, read through our :ref:`main docs
# <transforms>` to learn more about recommended practices and conventions, or
# explore more :ref:`examples <transforms_gallery>` e.g. how to use augmentation
# transforms like :ref:`CutMix and MixUp
# <sphx_glr_auto_examples_v2_transforms_plot_cutmix_mixup.py>`.
#
# .. note::
#
#     If you're already relying on the ``torchvision.transforms`` v1 API,
#     we recommend :ref:`switching to the new v2 transforms <v1_or_v2>`. It's
#     very easy: the v2 transforms are fully compatible with the v1 API, so you
#     only need to change the import!
#
# Detection, Segmentation, Videos
# -------------------------------
#
# The new Torchvision transforms in the ``torchvision.transforms.v2`` namespace
# support tasks beyond image classification: they can also transform bounding
# boxes, segmentation / detection masks, or videos.
#
# Let's briefly look at a detection example with bounding boxes.

from torchvision import datapoints  # we'll describe this a bit later, bear with us

bboxes = datapoints.BoundingBoxes(
    [
        [15, 10, 370, 510],
        [275, 340, 510, 510],
        [130, 345, 210, 425]
    ],
    format="XYXY", canvas_size=img.shape[-2:])

transforms = v2.Compose([
    v2.RandomPhotometricDistort(),
    v2.RandomIoUCrop(),
    v2.RandomHorizontalFlip(p=0.5),
    v2.SanitizeBoundingBoxes(),
])

out_img, out_bboxes = transforms(img, bboxes)

plot([(img, bboxes), (out_img, out_bboxes)])

# %%
# The example above focuses on object detection. But if we had masks
# (:class:`torchvision.datapoints.Mask`) for object segmentation or semantic
# segmentation, or videos (:class:`torchvision.datapoints.Video`), we could have
# passed them to the transforms in exactly the same way.
#
# By now you likely have a few questions: what are these datapoints, how do we
# use them, and what is the expected input/output of those transforms? We'll
# answer these in the next sections.
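The "exactly the same way" claim is easy to check. A small sketch of ours (not part of the commit; it re-uses ``img`` and ``bboxes`` from the example above, and the masks are dummy data rather than real annotations):

    masks = datapoints.Mask(torch.zeros(2, *img.shape[-2:], dtype=torch.bool))
    # Masks ride through the same call, alongside the image and the boxes.
    out_img, out_bboxes, out_masks = v2.RandomHorizontalFlip(p=1.0)(img, bboxes, masks)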
# %% # %%
# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the # What are Datapoints?
# appropriate function for the input data: :ref:`sphx_glr_auto_examples_v2_transforms_plot_datapoints.py`. Note however, that as # --------------------
# regular user, you likely don't have to touch this yourself. See #
# :ref:`sphx_glr_auto_examples_v2_transforms_plot_transforms_v2_e2e.py`. # Datapoints are :class:`torch.Tensor` subclasses. The available datapoints are
# :class:`~torchvision.datapoints.Image`,
# :class:`~torchvision.datapoints.BoundingBoxes`,
# :class:`~torchvision.datapoints.Mask`, and
# :class:`~torchvision.datapoints.Video`.
# #
# All "foreign" types like :class:`str`'s or :class:`pathlib.Path`'s are passed through, allowing to store extra # Datapoints look and feel just like regular tensors - they **are** tensors.
# information directly with the sample: # Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()``
# or any ``torch.*`` operator will also work on a datapoint:
sample = {"path": path, "image": image} img_dp = datapoints.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8))
new_sample = transform(sample)
assert new_sample["path"] is sample["path"] print(f"{isinstance(img_dp, torch.Tensor) = }")
print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }")
# %% # %%
# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus # These Datapoint classes are at the core of the transforms: in order to
# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a # transform a given input, the transforms first look at the **class** of the
# simple heuristic: # object, and dispatch to the appropriate implementation accordingly.
# #
# * If we find an explicit image or video (:class:`torchvision.datapoints.Image`, :class:`torchvision.datapoints.Video`, # You don't need to know much more about datapoints at this point, but advanced
# or :class:`PIL.Image.Image`) in the input, all other plain tensors are passed through. # users who want to learn more can refer to
# * If there is no explicit image or video, only the first plain :class:`torch.Tensor` will be transformed as image or # :ref:`sphx_glr_auto_examples_v2_transforms_plot_datapoints.py`.
# video, while all others will be passed through. #
# What do I pass as input?
plain_tensor_image = torch.rand(image.shape) # ------------------------
#
print(image.shape, plain_tensor_image.shape) # Above, we've seen two examples: one where we passed a single image as input
# i.e. ``out = transforms(img)``, and one where we passed both an image and
# bounding boxes, i.e. ``out_img, out_bboxes = transforms(img, bboxes)``.
#
# In fact, transforms support **arbitrary input structures**. The input can be a
# single image, a tuple, an arbitrarily nested dictionary... pretty much
# anything. The same structure will be returned as output. Below, we use the
# same detection transforms, but pass a tuple (image, target_dict) as input and
# we're getting the same structure as output:
# passing a plain tensor together with an explicit image, will not transform the former target = {
plain_tensor_image, image = transform(plain_tensor_image, image) "bboxes": bboxes,
"labels": torch.arange(bboxes.shape[0]),
"this_is_ignored": ("arbitrary", {"structure": "!"})
}
print(image.shape, plain_tensor_image.shape) # Re-using the transforms and definitions from above.
out_img, out_target = transforms(img, target)
# passing a plain tensor without an explicit image, will transform the former plot([(img, target["bboxes"]), (out_img, out_target["bboxes"])])
plain_tensor_image, _ = transform(plain_tensor_image, bounding_boxes) print(f"{out_target['this_is_ignored']}")
print(image.shape, plain_tensor_image.shape) # %%
# We passed a tuple so we get a tuple back, and the second element is the
# transformed target dict. Transforms don't really care about the structure of
# the input; as mentioned above, they only care about the **type** of the
# objects and transform them accordingly.
#
# *Foreign* objects like strings or ints are simply passed through. This can be
# useful e.g. if you want to associate a path with every single sample when
# debugging!
#
# .. _passthrough_heuristic:
#
# .. note::
#
#     **Disclaimer** This note is slightly advanced and can be safely skipped on
#     a first read.
#
#     Pure :class:`torch.Tensor` objects are, in general, treated as images (or
#     as videos for video-specific transforms). Indeed, you may have noticed
#     that in the code above we haven't used the
#     :class:`~torchvision.datapoints.Image` class at all, and yet our images
#     got transformed properly. Transforms use the following logic to
#     determine whether a pure Tensor should be treated as an image (or video),
#     or just ignored:
#
#     * If there is an :class:`~torchvision.datapoints.Image`,
#       :class:`~torchvision.datapoints.Video`,
#       or :class:`PIL.Image.Image` instance in the input, all other pure
#       tensors are passed through.
#     * If there is no :class:`~torchvision.datapoints.Image` or
#       :class:`~torchvision.datapoints.Video` instance, only the first pure
#       :class:`torch.Tensor` will be transformed as image or video, while all
#       others will be passed through. Here "first" means "first in a depth-wise
#       traversal".
#
#     This is what happened in the detection example above: the first pure
#     tensor was the image, so it got transformed properly, and all other pure
#     tensor instances like the ``labels`` were passed through (although labels
#     can still be transformed by some transforms like
#     :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`!).
#
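To see that heuristic in action, here is a small sketch of ours (not part of the commit; it re-uses the guide's imports): with an ``Image`` datapoint present, a plain tensor rides along untouched.

    img_dp = datapoints.Image(torch.rand(3, 32, 32))
    plain = torch.rand(3, 32, 32)  # pure tensor: ignored, because img_dp is present
    out_dp, out_plain = v2.RandomHorizontalFlip(p=1.0)(img_dp, plain)
    assert torch.equal(out_plain, plain)  # passed through unchanged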
# Transforms and Datasets intercompatibility
# ------------------------------------------
#
# Roughly speaking, the output of the datasets must correspond to the input of
# the transforms. How to do that depends on whether you're using the torchvision
# :ref:`built-in datasets <datasets>`, or your own custom datasets.
#
# Using built-in datasets
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# If you're just doing image classification, you don't need to do anything. Just
# use the ``transform`` argument of the dataset, e.g. ``ImageNet(...,
# transform=transforms)``, and you're good to go.
#
# Torchvision also supports datasets for object detection or segmentation like
# :class:`torchvision.datasets.CocoDetection`. Those datasets predate
# the existence of the :mod:`torchvision.transforms.v2` module and of the
# datapoints, so they don't return datapoints out of the box.
#
# An easy way to force those datasets to return datapoints and to make them
# compatible with v2 transforms is to use the
# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function:
#
# .. code-block:: python
#
#     from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
#
#     dataset = CocoDetection(..., transforms=my_v2_transforms)
#     dataset = wrap_dataset_for_transforms_v2(dataset)
#     # Now the dataset returns datapoints!
#
# Using your own datasets
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# If you have a custom dataset, then you'll need to convert your objects into
# the appropriate Datapoint classes. Creating Datapoint instances is very easy,
# refer to :ref:`datapoint_creation` for more details.
#
# There are two main places where you can implement that conversion logic:
#
# - At the end of the dataset's ``__getitem__`` method, before returning the
#   sample (or by sub-classing the dataset).
# - As the very first step of your transforms pipeline.
#
# Either way, the logic will depend on your specific dataset.
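As a sketch of the second option, modeled on the ``WrapPennFudanDataset`` pattern removed from the datapoints guide earlier in this commit (the ``boxes`` and ``masks`` keys are assumptions about what your dataset returns):

    from torchvision import datapoints
    from torchvision.transforms.v2 import functional as F

    class WrapToDatapoints:
        # Hypothetical first step of a v2 pipeline for a custom detection dataset.
        def __call__(self, img, target):
            target["boxes"] = datapoints.BoundingBoxes(
                target["boxes"], format="XYXY", canvas_size=F.get_size(img)
            )
            target["masks"] = datapoints.Mask(target["masks"])
            return img, target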
@@ -11,14 +11,20 @@ from ._datapoint import Datapoint

class Image(Datapoint):
    """[BETA] :class:`torch.Tensor` subclass for images.

+    .. note::
+        In the :ref:`transforms <transforms>`, ``Image`` instances are largely
+        interchangeable with pure :class:`torch.Tensor`. See
+        :ref:`this note <passthrough_heuristic>` for more details.
+
    Args:
        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
            well as PIL images.
-        dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
+        dtype (torch.dtype, optional): Desired data type. If omitted, will be inferred from
            ``data``.
-        device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
-            :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU.
+        device (torch.device, optional): Desired device. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the image is constructed on the CPU.
-        requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and
+        requires_grad (bool, optional): Whether autograd should record operations. If omitted and
            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
    """
...
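A construction sketch of ours, consistent with the documented arguments:

    import torch
    from torchvision import datapoints

    img = datapoints.Image(torch.rand(3, 32, 32), dtype=torch.float64, requires_grad=True)
    print(img.dtype, img.requires_grad)  # torch.float64 True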
@@ -14,11 +14,11 @@ class Mask(Datapoint):

    Args:
        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
            well as PIL images.
-        dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
+        dtype (torch.dtype, optional): Desired data type. If omitted, will be inferred from
            ``data``.
-        device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
-            :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU.
+        device (torch.device, optional): Desired device. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the mask is constructed on the CPU.
-        requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and
+        requires_grad (bool, optional): Whether autograd should record operations. If omitted and
            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
    """
...
@@ -12,11 +12,11 @@ class Video(Datapoint):

    Args:
        data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`.
-        dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
+        dtype (torch.dtype, optional): Desired data type. If omitted, will be inferred from
            ``data``.
-        device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
-            :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU.
+        device (torch.device, optional): Desired device. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the video is constructed on the CPU.
-        requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and
+        requires_grad (bool, optional): Whether autograd should record operations. If omitted and
            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
    """
...
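The same constructor pattern applies across the datapoint classes; a quick sketch of ours:

    import torch
    from torchvision import datapoints

    mask = datapoints.Mask(torch.zeros(224, 224, dtype=torch.bool))
    video = datapoints.Video(torch.rand(16, 3, 224, 224))  # T, C, H, W layout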
@@ -233,7 +233,8 @@ class ToDtype(Transform):
            A dict can be passed to specify per-datapoint conversions, e.g.
            ``dtype={datapoints.Image: torch.float32, datapoints.Mask: torch.int64, "others": None}``. The "others"
            key can be used as a catch-all for any other datapoint type, and ``None`` means no conversion.
-        scale (bool, optional): Whether to scale the values for images or videos. Default: ``False``.
+        scale (bool, optional): Whether to scale the values for images or videos. See :ref:`range_and_dtype`.
+            Default: ``False``.
    """

    _transformed_types = (torch.Tensor,)
...
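A usage sketch of ours for the per-datapoint ``dtype`` dict described above:

    import torch
    from torchvision import datapoints
    from torchvision.transforms import v2

    transform = v2.ToDtype(
        dtype={datapoints.Image: torch.float32, datapoints.Mask: torch.int64, "others": None},
        scale=True,  # rescale image/video values when converting, per the docstring
    )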