Commit cc26cd81 authored by panning
merge v0.16.0
parents f78f29f5 fbb4cc54
.. _transforms_gallery:
Transforms
----------
import matplotlib.pyplot as plt
import torch
from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F
def plot(imgs, row_title=None, **imshow_kwargs):
if not isinstance(imgs[0], list):
# Make a 2d grid even if there's just 1 row
imgs = [imgs]
num_rows = len(imgs)
num_cols = len(imgs[0])
_, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
for row_idx, row in enumerate(imgs):
for col_idx, img in enumerate(row):
boxes = None
masks = None
if isinstance(img, tuple):
img, target = img
if isinstance(target, dict):
boxes = target.get("boxes")
masks = target.get("masks")
elif isinstance(target, tv_tensors.BoundingBoxes):
boxes = target
else:
raise ValueError(f"Unexpected target type: {type(target)}")
img = F.to_image(img)
if img.dtype.is_floating_point and img.min() < 0:
# Poor man's re-normalization for the colors to be OK-ish. This
# is useful for images coming out of Normalize()
img -= img.min()
img /= img.max()
img = F.to_dtype(img, torch.uint8, scale=True)
if boxes is not None:
img = draw_bounding_boxes(img, boxes, colors="yellow", width=3)
if masks is not None:
img = draw_segmentation_masks(img, masks.to(torch.bool), colors=["green"] * masks.shape[0], alpha=.65)
ax = axs[row_idx, col_idx]
ax.imshow(img.permute(1, 2, 0).numpy(), **imshow_kwargs)
ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
if row_title is not None:
for row_idx in range(num_rows):
axs[row_idx, 0].set(ylabel=row_title[row_idx])
plt.tight_layout()
"""
===================================
How to write your own v2 transforms
===================================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_custom_transforms.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_custom_transforms.py>` to download the full example code.
This guide explains how to write transforms that are compatible with the
torchvision transforms V2 API.
"""
# %%
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2
# %%
# Just create a ``nn.Module`` and override the ``forward`` method
# ===============================================================
#
# In most cases, this is all you're going to need, as long as you already know
# the structure of the input that your transform will expect. For example if
# you're just doing image classification, your transform will typically accept a
# single image as input, or a ``(img, label)`` input. So you can just hard-code
# your ``forward`` method to accept just that, e.g.
#
# .. code:: python
#
# class MyCustomTransform(torch.nn.Module):
# def forward(self, img, label):
# # Do some transformations
# return new_img, new_label
#
# .. note::
#
# This means that if you have a custom transform that is already compatible
# with the V1 transforms (those in ``torchvision.transforms``), it will
# still work with the V2 transforms without any change!
#
# We will illustrate this more completely below with a typical detection case,
# where our samples are just images, bounding boxes and labels:
class MyCustomTransform(torch.nn.Module):
def forward(self, img, bboxes, label): # we assume inputs are always structured like this
print(
f"I'm transforming an image of shape {img.shape} "
f"with bboxes = {bboxes}\n{label = }"
)
# Do some transformations. Here, we're just passing through the input
return img, bboxes, label
transforms = v2.Compose([
MyCustomTransform(),
v2.RandomResizedCrop((224, 224), antialias=True),
v2.RandomHorizontalFlip(p=1),
v2.Normalize(mean=[0, 0, 0], std=[1, 1, 1])
])
H, W = 256, 256
img = torch.rand(3, H, W)
bboxes = tv_tensors.BoundingBoxes(
torch.tensor([[0, 10, 10, 20], [50, 50, 70, 70]]),
format="XYXY",
canvas_size=(H, W)
)
label = 3
out_img, out_bboxes, out_label = transforms(img, bboxes, label)
# %%
print(f"Output image shape: {out_img.shape}\nout_bboxes = {out_bboxes}\n{out_label = }")
# %%
# .. note::
# While working with TVTensor classes in your code, make sure to
# familiarize yourself with this section:
# :ref:`tv_tensor_unwrapping_behaviour`
#
# Supporting arbitrary input structures
# =====================================
#
# In the section above, we have assumed that you already know the structure of
# your inputs and that you're OK with hard-coding this expected structure in
# your code. If you want your custom transforms to be as flexible as possible,
# this can be a bit limiting.
#
# A key feature of the builtin Torchvision V2 transforms is that they can accept
# arbitrary input structure and return the same structure as output (with
# transformed entries). For example, transforms can accept a single image, or a
# tuple of ``(img, label)``, or an arbitrary nested dictionary as input:
structured_input = {
"img": img,
"annotations": (bboxes, label),
"something_that_will_be_ignored": (1, "hello")
}
structured_output = v2.RandomHorizontalFlip(p=1)(structured_input)
assert isinstance(structured_output, dict)
assert structured_output["something_that_will_be_ignored"] == (1, "hello")
print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}")
# %%
# If you want to reproduce this behavior in your own transform, we invite you to
# look at our `code
# <https://github.com/pytorch/vision/blob/main/torchvision/transforms/v2/_transform.py>`_
# and adapt it to your needs.
#
# In brief, the core logic is to unpack the input into a flat list using `pytree
# <https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py>`_, and
# then transform only the entries that can be transformed (the decision is made
# based on the **class** of the entries, as all TVTensors are
# tensor subclasses), plus some custom logic that is out of scope here - check the
# code for details. The (potentially transformed) entries are then repacked and
# returned, in the same structure as the input.
#
# We do not provide public dev-facing tools to achieve that at this time, but if
# this is something that would be valuable to you, please let us know by opening
# an issue on our `GitHub repo <https://github.com/pytorch/vision/issues>`_.
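# %%
# To make this a little more concrete, below is a minimal sketch of that
# pattern. It is *not* the actual torchvision implementation, and it relies on
# the private ``torch.utils._pytree`` module, so treat it as an illustration
# only: flatten the input, transform only the entries whose class we care
# about (here, plain tensors), and repack everything else untouched.
from torch.utils import _pytree as pytree


class NegateTensorsOnly(torch.nn.Module):
    # Toy transform: negates every tensor entry, passes everything else through.
    def forward(self, *inputs):
        sample = inputs if len(inputs) > 1 else inputs[0]
        flat, spec = pytree.tree_flatten(sample)  # arbitrary structure -> flat list
        # The decision of what to transform is based on the class of each entry.
        flat = [-entry if isinstance(entry, torch.Tensor) else entry for entry in flat]
        return pytree.tree_unflatten(flat, spec)  # flat list -> original structure


out = NegateTensorsOnly()(structured_input)
assert isinstance(out, dict)
assert out["something_that_will_be_ignored"] == (1, "hello")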
"""
====================================
How to write your own TVTensor class
====================================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_custom_tv_tensors.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_custom_tv_tensors.py>` to download the full example code.
This guide is intended for advanced users and downstream library maintainers. We explain how to
write your own TVTensor class, and how to make it compatible with the built-in
Torchvision v2 transforms. Before continuing, make sure you have read
:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`.
"""
# %%
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2
# %%
# We will create a very simple class that just inherits from the base
# :class:`~torchvision.tv_tensors.TVTensor` class. It will be enough to cover
# what you need to know to implement your more elaborate use cases. If you need
# to create a class that carries meta-data, take a look at how the
# :class:`~torchvision.tv_tensors.BoundingBoxes` class is `implemented
# <https://github.com/pytorch/vision/blob/main/torchvision/tv_tensors/_bounding_box.py>`_.
class MyTVTensor(tv_tensors.TVTensor):
pass
my_dp = MyTVTensor([1, 2, 3])
my_dp
# %%
# Now that we have defined our custom TVTensor class, we want it to be
# compatible with the built-in torchvision transforms, and the functional API.
# For that, we need to implement a kernel which performs the core of the
# transformation, and then "hook" it to the functional that we want to support
# via :func:`~torchvision.transforms.v2.functional.register_kernel`.
#
# We illustrate this process below: we create a kernel for the "horizontal flip"
# operation of our MyTVTensor class, and register it to the functional API.
from torchvision.transforms.v2 import functional as F
@F.register_kernel(functional="hflip", tv_tensor_cls=MyTVTensor)
def hflip_my_tv_tensor(my_dp, *args, **kwargs):
print("Flipping!")
out = my_dp.flip(-1)
return tv_tensors.wrap(out, like=my_dp)
# %%
# To understand why :func:`~torchvision.tv_tensors.wrap` is used, see
# :ref:`tv_tensor_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now;
# we will explain them below in :ref:`param_forwarding`.
#
# .. note::
#
# In our call to ``register_kernel`` above we used a string
# ``functional="hflip"`` to refer to the functional we want to hook into. We
# could also have used the functional *itself*, i.e.
# ``@register_kernel(functional=F.hflip, ...)``.
#
# Now that we have registered our kernel, we can call the functional API on a
# ``MyTVTensor`` instance:
my_dp = MyTVTensor(torch.rand(3, 256, 256))
_ = F.hflip(my_dp)
# %%
# And we can also use the
# :class:`~torchvision.transforms.v2.RandomHorizontalFlip` transform, since it relies on :func:`~torchvision.transforms.v2.functional.hflip` internally:
t = v2.RandomHorizontalFlip(p=1)
_ = t(my_dp)
# %%
# .. note::
#
# We cannot register a kernel for a transform class; we can only register a
# kernel for a **functional**. The reason we can't register a transform
# class is because one transform may internally rely on more than one
# functional, so in general we can't register a single kernel for a given
# class.
#
# .. _param_forwarding:
#
# Parameter forwarding, and ensuring future compatibility of your kernels
# -----------------------------------------------------------------------
#
# The functional API that you're hooking into is public and therefore
# **backward** compatible: we guarantee that the parameters of these functionals
# won't be removed or renamed without a proper deprecation cycle. However, we
# don't guarantee **forward** compatibility, and we may add new parameters in
# the future.
#
# Imagine that in a future version, Torchvision adds a new ``inplace`` parameter
# to its :func:`~torchvision.transforms.v2.functional.hflip` functional. If you
# already defined and registered your own kernel as
def hflip_my_tv_tensor(my_dp): # noqa
print("Flipping!")
out = my_dp.flip(-1)
return tv_tensors.wrap(out, like=my_dp)
# %%
# then calling ``F.hflip(my_dp)`` will **fail**, because ``hflip`` will try to
# pass the new ``inplace`` parameter to your kernel, but your kernel doesn't
# accept it.
#
# For this reason, we recommend always defining your kernels with
# ``*args, **kwargs`` in their signature, as done above. This way, your kernel
# will be able to accept any new parameter that we may add in the future.
# (Technically, adding only ``**kwargs`` should be enough.)
"""
===========================
How to use CutMix and MixUp
===========================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_cutmix_mixup.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_cutmix_mixup.py>` to download the full example code.
:class:`~torchvision.transforms.v2.CutMix` and
:class:`~torchvision.transforms.v2.MixUp` are popular augmentation strategies
that can improve classification accuracy.
These transforms are slightly different from the rest of the Torchvision
transforms, because they expect
**batches** of samples as input, not individual images. In this example we'll
explain how to use them: after the ``DataLoader``, or as part of a collation
function.
"""
# %%
import torch
from torchvision.datasets import FakeData
from torchvision.transforms import v2
NUM_CLASSES = 100
# %%
# Pre-processing pipeline
# -----------------------
#
# We'll use a simple but typical image classification pipeline:
preproc = v2.Compose([
v2.PILToTensor(),
v2.RandomResizedCrop(size=(224, 224), antialias=True),
v2.RandomHorizontalFlip(p=0.5),
v2.ToDtype(torch.float32, scale=True), # to float32 in [0, 1]
v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), # typically from ImageNet
])
dataset = FakeData(size=1000, num_classes=NUM_CLASSES, transform=preproc)
img, label = dataset[0]
print(f"{type(img) = }, {img.dtype = }, {img.shape = }, {label = }")
# %%
#
# One important thing to note is that neither CutMix nor MixUp are part of this
# pre-processing pipeline. We'll add them a bit later once we define the
# DataLoader. Just as a refresher, this is what the DataLoader and training loop
# would look like if we weren't using CutMix or MixUp:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
for images, labels in dataloader:
print(f"{images.shape = }, {labels.shape = }")
print(labels.dtype)
# <rest of the training loop here>
break
# %%
# Where to use MixUp and CutMix
# -----------------------------
#
# After the DataLoader
# ^^^^^^^^^^^^^^^^^^^^
#
# Now let's add CutMix and MixUp. The simplest way to do this is right after the
# DataLoader: the DataLoader has already batched the images and labels for us,
# and this is exactly what these transforms expect as input:
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
cutmix = v2.CutMix(num_classes=NUM_CLASSES)
mixup = v2.MixUp(num_classes=NUM_CLASSES)
cutmix_or_mixup = v2.RandomChoice([cutmix, mixup])
for images, labels in dataloader:
print(f"Before CutMix/MixUp: {images.shape = }, {labels.shape = }")
images, labels = cutmix_or_mixup(images, labels)
print(f"After CutMix/MixUp: {images.shape = }, {labels.shape = }")
# <rest of the training loop here>
break
# %%
#
# Note how the labels were also transformed: we went from a batched label of
# shape (batch_size,) to a tensor of shape (batch_size, num_classes). The
# transformed labels can still be passed as-is to a loss function like
# :func:`torch.nn.functional.cross_entropy`.
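#
# As a quick sanity check (this snippet is not part of the original example),
# the soft labels produced by CutMix/MixUp can be fed to the loss exactly like
# hard labels. The tiny linear model below is just a stand-in:
dummy_model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 224 * 224, NUM_CLASSES))
logits = dummy_model(images)  # images and labels come from the loop above
loss = torch.nn.functional.cross_entropy(logits, labels)  # labels have shape (batch_size, NUM_CLASSES)
print(f"{loss = }")
# %%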
#
# As part of the collation function
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Passing the transforms after the DataLoader is the simplest way to use CutMix
# and MixUp, but one disadvantage is that it does not take advantage of the
# DataLoader multi-processing. For that, we can pass those transforms as part of
# the collation function (refer to the `PyTorch docs
# <https://pytorch.org/docs/stable/data.html#dataloader-collate-fn>`_ to learn
# more about collation).
from torch.utils.data import default_collate
def collate_fn(batch):
return cutmix_or_mixup(*default_collate(batch))
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn)
for images, labels in dataloader:
print(f"{images.shape = }, {labels.shape = }")
# No need to call cutmix_or_mixup, it's already been called as part of the DataLoader!
# <rest of the training loop here>
break
# %%
# Non-standard input format
# -------------------------
#
# So far we've used a typical sample structure where we pass ``(images,
# labels)`` as inputs. MixUp and CutMix will magically work by default with most
# common sample structures: tuples where the second parameter is a tensor label,
# or dicts with a "label[s]" key. Look at the documentation of the
# ``labels_getter`` parameter for more details.
#
# If your samples have a different structure, you can still use CutMix and MixUp
# by passing a callable to the ``labels_getter`` parameter. For example:
batch = {
"imgs": torch.rand(4, 3, 224, 224),
"target": {
"classes": torch.randint(0, NUM_CLASSES, size=(4,)),
"some_other_key": "this is going to be passed-through"
}
}
def labels_getter(batch):
return batch["target"]["classes"]
out = v2.CutMix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch)
print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }")
"""
===============================================================
Transforms v2: End-to-end object detection/segmentation example
===============================================================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_transforms_e2e.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_transforms_e2e.py>` to download the full example code.
Object detection and segmentation tasks are natively supported:
``torchvision.transforms.v2`` enables jointly transforming images, videos,
bounding boxes, and masks.
This example showcases an end-to-end instance segmentation training case using
Torchvision utils from ``torchvision.datasets``, ``torchvision.models`` and
``torchvision.transforms.v2``. Everything covered here can be applied similarly
to object detection or semantic segmentation tasks.
"""
# %%
import pathlib
import torch
import torch.utils.data
from torchvision import models, datasets, tv_tensors
from torchvision.transforms import v2
torch.manual_seed(0)
# This loads fake data for illustration purposes. In practice, you'll have
# to replace this with the proper data.
# If you're trying to run this on Colab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
ROOT = pathlib.Path("../assets") / "coco"
IMAGES_PATH = str(ROOT / "images")
ANNOTATIONS_PATH = str(ROOT / "instances.json")
from helpers import plot
# %%
# Dataset preparation
# -------------------
#
# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
# returns.
dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH)
sample = dataset[0]
img, target = sample
print(f"{type(img) = }\n{type(target) = }\n{type(target[0]) = }\n{target[0].keys() = }")
# %%
# Torchvision datasets preserve the data structure and types as intended by the
# dataset authors. So by default, the output structure may not always be
# compatible with the models or the transforms.
#
# To overcome that, we can use the
# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For
# :class:`~torchvision.datasets.CocoDetection`, this changes the target
# structure to a single dictionary of lists:
dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks"))
sample = dataset[0]
img, target = sample
print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }")
print(f"{type(target['boxes']) = }\n{type(target['labels']) = }\n{type(target['masks']) = }")
# %%
# We used the ``target_keys`` parameter to specify the kind of output we're
# interested in. Our dataset now returns a target which is a dict where the
# values are :ref:`TVTensors <what_are_tv_tensors>` (all are :class:`torch.Tensor`
# subclasses). We've dropped all unnecessary keys from the previous output, but
# if you need any of the original keys, e.g. "image_id", you can still ask for
# them.
#
# .. note::
#
# If you just want to do detection, you don't need and shouldn't pass
# "masks" in ``target_keys``: if masks are present in the sample, they will
# be transformed, slowing down your transformations unnecessarily.
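#
# For a detection-only pipeline, the wrapping above would for instance become:
#
# .. code-block:: python
#
#     dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels"))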
#
# As baseline, let's have a look at a sample without transformations:
plot([dataset[0], dataset[1]])
# %%
# Transforms
# ----------
#
# Let's now define our pre-processing transforms. All the transforms know how
# to handle images, bounding boxes and masks when relevant.
#
# Transforms are typically passed as the ``transforms`` parameter of the
# dataset so that they can leverage multi-processing from the
# :class:`torch.utils.data.DataLoader`.
transforms = v2.Compose(
[
v2.ToImage(),
v2.RandomPhotometricDistort(p=1),
v2.RandomZoomOut(fill={tv_tensors.Image: (123, 117, 104), "others": 0}),
v2.RandomIoUCrop(),
v2.RandomHorizontalFlip(p=1),
v2.SanitizeBoundingBoxes(),
v2.ToDtype(torch.float32, scale=True),
]
)
dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH, transforms=transforms)
dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=["boxes", "labels", "masks"])
# %%
# A few things are worth noting here:
#
# - We're converting the PIL image into a
# :class:`~torchvision.tv_tensors.Image` object. This isn't strictly
# necessary, but relying on Tensors (here: a Tensor subclass) will
# :ref:`generally be faster <transforms_perf>`.
# - We are calling :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` to
# make sure we remove degenerate bounding boxes, as well as their
# corresponding labels and masks.
# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` should be placed
# at least once at the end of a detection pipeline; it is particularly
# critical if :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
#
# Let's look at how the sample looks with our augmentation pipeline in place:
# sphinx_gallery_thumbnail_number = 2
plot([dataset[0], dataset[1]])
# %%
# We can see that the colors of the images were distorted, and that the images were zoomed in or out and flipped.
# The bounding boxes and the masks were transformed accordingly. And without any further ado, we can start training.
#
# Data loading and training loop
# ------------------------------
#
# Below we're using Mask R-CNN, which is an instance segmentation model, but
# everything we've covered in this tutorial also applies to object detection and
# semantic segmentation tasks.
data_loader = torch.utils.data.DataLoader(
dataset,
batch_size=2,
# We need a custom collation function here, since the object detection
# models expect a sequence of images and target dictionaries. The default
# collation function tries to torch.stack() the individual elements,
# which fails in general for object detection, because the number of bounding
# boxes varies between the images of the same batch.
collate_fn=lambda batch: tuple(zip(*batch)),
)
model = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train()
for imgs, targets in data_loader:
loss_dict = model(imgs, targets)
# Put your training logic here
print(f"{[img.shape for img in imgs] = }")
print(f"{[type(target) for target in targets] = }")
for name, loss_val in loss_dict.items():
print(f"{name:<20}{loss_val:.3f}")
# %%
# Training References
# -------------------
#
# From there, you can check out the `torchvision references
# <https://github.com/pytorch/vision/tree/main/references>`_ where you'll find
# the actual training scripts we use to train our models.
#
# **Disclaimer** The code in our references is more complex than what you'll
# need for your own use-cases: this is because we're supporting different
# backends (PIL, tensors, TVTensors) and different transforms namespaces (v1 and
# v2). So don't be afraid to simplify and only keep what you need.
"""
==================================
Getting started with transforms v2
==================================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_transforms_getting_started.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_transforms_getting_started.py>` to download the full example code.
This example illustrates all of what you need to know to get started with the
new :mod:`torchvision.transforms.v2` API. We'll cover simple tasks like
image classification, and more advanced ones like object detection /
segmentation.
"""
# %%
# First, a bit of setup
from pathlib import Path
import torch
import matplotlib.pyplot as plt
plt.rcParams["savefig.bbox"] = 'tight'
from torchvision.transforms import v2
from torchvision.io import read_image
torch.manual_seed(1)
# If you're trying to run this on Colab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
from helpers import plot
img = read_image(str(Path('../assets') / 'astronaut.jpg'))
print(f"{type(img) = }, {img.dtype = }, {img.shape = }")
# %%
# The basics
# ----------
#
# The Torchvision transforms behave like a regular :class:`torch.nn.Module` (in
# fact, most of them are): instantiate a transform, pass an input, get a
# transformed output:
transform = v2.RandomCrop(size=(224, 224))
out = transform(img)
plot([img, out])
# %%
# I just want to do image classification
# --------------------------------------
#
# If you just care about image classification, things are very simple. A basic
# classification pipeline may look like this:
transforms = v2.Compose([
v2.RandomResizedCrop(size=(224, 224), antialias=True),
v2.RandomHorizontalFlip(p=0.5),
v2.ToDtype(torch.float32, scale=True),
v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
out = transforms(img)
plot([img, out])
# %%
# Such a transformation pipeline is typically passed as the ``transform`` argument
# to the :ref:`Datasets <datasets>`, e.g. ``ImageNet(...,
# transform=transforms)``.
#
# That's pretty much all there is. From there, read through our :ref:`main docs
# <transforms>` to learn more about recommended practices and conventions, or
# explore more :ref:`examples <transforms_gallery>` e.g. how to use augmentation
# transforms like :ref:`CutMix and MixUp
# <sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py>`.
#
# .. note::
#
# If you're already relying on the ``torchvision.transforms`` v1 API,
# we recommend to :ref:`switch to the new v2 transforms<v1_or_v2>`. It's
# very easy: the v2 transforms are fully compatible with the v1 API, so you
# only need to change the import!
#
# Detection, Segmentation, Videos
# -------------------------------
#
# The new Torchvision transforms in the ``torchvision.transforms.v2`` namespace
# support tasks beyond image classification: they can also transform bounding
# boxes, segmentation / detection masks, or videos.
#
# Let's briefly look at a detection example with bounding boxes.
from torchvision import tv_tensors  # we'll describe this a bit later, bear with us
boxes = tv_tensors.BoundingBoxes(
[
[15, 10, 370, 510],
[275, 340, 510, 510],
[130, 345, 210, 425]
],
format="XYXY", canvas_size=img.shape[-2:])
transforms = v2.Compose([
v2.RandomResizedCrop(size=(224, 224), antialias=True),
v2.RandomPhotometricDistort(p=1),
v2.RandomHorizontalFlip(p=1),
])
out_img, out_boxes = transforms(img, boxes)
print(type(boxes), type(out_boxes))
plot([(img, boxes), (out_img, out_boxes)])
# %%
#
# The example above focuses on object detection. But if we had masks
# (:class:`torchvision.tv_tensors.Mask`) for object segmentation or semantic
# segmentation, or videos (:class:`torchvision.tv_tensors.Video`), we could have
# passed them to the transforms in exactly the same way.
#
# By now you likely have a few questions: what are these TVTensors, how do we
# use them, and what is the expected input/output of those transforms? We'll
# answer these in the next sections.
# %%
#
# .. _what_are_tv_tensors:
#
# What are TVTensors?
# --------------------
#
# TVTensors are :class:`torch.Tensor` subclasses. The available TVTensors are
# :class:`~torchvision.tv_tensors.Image`,
# :class:`~torchvision.tv_tensors.BoundingBoxes`,
# :class:`~torchvision.tv_tensors.Mask`, and
# :class:`~torchvision.tv_tensors.Video`.
#
# TVTensors look and feel just like regular tensors - they **are** tensors.
# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()``
# or any ``torch.*`` operator will also work on a TVTensor:
img_dp = tv_tensors.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8))
print(f"{isinstance(img_dp, torch.Tensor) = }")
print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }")
# %%
# These TVTensor classes are at the core of the transforms: in order to
# transform a given input, the transforms first look at the **class** of the
# object, and dispatch to the appropriate implementation accordingly.
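#
# As a quick illustration (this snippet is not part of the original example),
# the same transform behaves differently depending on the class of its input:
# applied to ``img_dp`` it resizes pixels, while applied to the ``boxes``
# defined earlier it rescales the coordinates and updates ``canvas_size``.
resize = v2.Resize(size=(128, 128), antialias=True)
print(f"{resize(img_dp).shape = }")
print(f"{resize(boxes).canvas_size = }")
# %%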
#
# You don't need to know much more about TVTensors at this point, but advanced
# users who want to learn more can refer to
# :ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`.
#
# What do I pass as input?
# ------------------------
#
# Above, we've seen two examples: one where we passed a single image as input
# i.e. ``out = transforms(img)``, and one where we passed both an image and
# bounding boxes, i.e. ``out_img, out_boxes = transforms(img, boxes)``.
#
# In fact, transforms support **arbitrary input structures**. The input can be a
# single image, a tuple, an arbitrarily nested dictionary... pretty much
# anything. The same structure will be returned as output. Below, we use the
# same detection transforms, but pass a tuple (image, target_dict) as input and
# we're getting the same structure as output:
target = {
"boxes": boxes,
"labels": torch.arange(boxes.shape[0]),
"this_is_ignored": ("arbitrary", {"structure": "!"})
}
# Re-using the transforms and definitions from above.
out_img, out_target = transforms(img, target)
# sphinx_gallery_thumbnail_number = 4
plot([(img, target["boxes"]), (out_img, out_target["boxes"])])
print(f"{out_target['this_is_ignored']}")
# %%
# We passed a tuple so we get a tuple back, and the second element is the
# transformed target dict. Transforms don't really care about the structure of
# the input; as mentioned above, they only care about the **type** of the
# objects and transform them accordingly.
#
# *Foreign* objects like strings or ints are simply passed-through. This can be
# useful e.g. if you want to associate a path with every single sample when
# debugging!
#
# .. _passthrough_heuristic:
#
# .. note::
#
# **Disclaimer** This note is slightly advanced and can be safely skipped on
# a first read.
#
# Pure :class:`torch.Tensor` objects are, in general, treated as images (or
# as videos for video-specific transforms). Indeed, you may have noticed
# that in the code above we haven't used the
# :class:`~torchvision.tv_tensors.Image` class at all, and yet our images
# got transformed properly. Transforms follow the following logic to
# determine whether a pure Tensor should be treated as an image (or video),
# or just ignored:
#
# * If there is an :class:`~torchvision.tv_tensors.Image`,
# :class:`~torchvision.tv_tensors.Video`,
# or :class:`PIL.Image.Image` instance in the input, all other pure
# tensors are passed-through.
# * If there is no :class:`~torchvision.tv_tensors.Image` or
# :class:`~torchvision.tv_tensors.Video` instance, only the first pure
# :class:`torch.Tensor` will be transformed as image or video, while all
# others will be passed-through. Here "first" means "first in a depth-wise
# traversal".
#
# This is what happened in the detection example above: the first pure
# tensor was the image so it got transformed properly, and all other pure
# tensor instances like the ``labels`` were passed-through (although labels
# can still be transformed by some transforms like
# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`!).
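#
# As a quick illustration of this heuristic (this snippet is not part of the
# original example): with no Image or PIL instance in the input, only the
# first pure tensor is treated as an image, and the second one is passed
# through untouched.
tensor_image = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)
other_data = torch.tensor([1, 2, 3])
out_image, out_other = v2.Resize(size=(32, 32), antialias=True)(tensor_image, other_data)
print(f"{out_image.shape = }")
print(f"{out_other = }")  # unchanged
# %%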
#
# .. _transforms_datasets_intercompatibility:
#
# Transforms and Datasets intercompatibility
# ------------------------------------------
#
# Roughly speaking, the output of the datasets must correspond to the input of
# the transforms. How to do that depends on whether you're using the torchvision
# :ref:`built-in datasets <datasets>`, or your own custom datasets.
#
# Using built-in datasets
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# If you're just doing image classification, you don't need to do anything. Just
# use the ``transform`` argument of the dataset e.g. ``ImageNet(...,
# transform=transforms)`` and you're good to go.
#
# Torchvision also supports datasets for object detection or segmentation like
# :class:`torchvision.datasets.CocoDetection`. Those datasets predate
# the existence of the :mod:`torchvision.transforms.v2` module and of the
# TVTensors, so they don't return TVTensors out of the box.
#
# An easy way to force those datasets to return TVTensors and to make them
# compatible with v2 transforms is to use the
# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function:
#
# .. code-block:: python
#
# from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
#
# dataset = CocoDetection(..., transforms=my_transforms)
# dataset = wrap_dataset_for_transforms_v2(dataset)
# # Now the dataset returns TVTensors!
#
# Using your own datasets
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# If you have a custom dataset, then you'll need to convert your objects into
# the appropriate TVTensor classes. Creating TVTensor instances is very easy,
# refer to :ref:`tv_tensor_creation` for more details.
#
# There are two main places where you can implement that conversion logic:
#
# - At the end of the dataset's ``__getitem__`` method, before returning the
# sample (or by sub-classing the dataset).
# - As the very first step of your transforms pipeline
#
# Either way, the logic will depend on your specific dataset.
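# %%
# For illustration only, here is a minimal sketch of the first option: a
# hypothetical dataset whose ``__getitem__`` wraps its raw data into TVTensors
# before applying the transforms. The data below is random and the class name
# is made up.
from torch.utils.data import Dataset


class MyToyDetectionDataset(Dataset):
    def __init__(self, transforms=None):
        self.transforms = transforms

    def __len__(self):
        return 10

    def __getitem__(self, idx):
        # Pretend these were loaded from disk.
        img = tv_tensors.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
        target = {
            "boxes": tv_tensors.BoundingBoxes(
                [[10, 10, 100, 100]], format="XYXY", canvas_size=(480, 640)
            ),
            "labels": torch.tensor([0]),
        }
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target


toy_img, toy_target = MyToyDetectionDataset(transforms=transforms)[0]
print(f"{type(toy_img) = }, {type(toy_target['boxes']) = }")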
Illustration of transforms
==========================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_transforms_illustrations.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_transforms_illustrations.py>` to download the full example code.
This example illustrates some of the various transforms available in :ref:`the
torchvision.transforms.v2 module <transforms>`.
"""
# %%
# sphinx_gallery_thumbnail_path = "../../gallery/assets/transforms_thumbnail.png"
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch
from torchvision.transforms import v2
plt.rcParams["savefig.bbox"] = 'tight'
# if you change the seed, make sure that the randomly-applied transforms
# properly show that the image can be both transformed and *not* transformed!
torch.manual_seed(0)
# If you're trying to run this on Colab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
from helpers import plot
orig_img = Image.open(Path('../assets') / 'astronaut.jpg')
# %%
# Geometric Transforms
# --------------------
# Geometric image transformation refers to the process of altering the geometric properties of an image,
# such as its shape, size, orientation, or position.
# It involves applying mathematical operations to the image pixels or coordinates to achieve the desired transformation.
#
# Pad
# ~~~
# The :class:`~torchvision.transforms.Pad` transform
# (see also :func:`~torchvision.transforms.functional.pad`)
# pads all image borders with some pixel values.
padded_imgs = [v2.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)]
plot([orig_img] + padded_imgs)
# %%
# Resize
# ~~~~~~
# The :class:`~torchvision.transforms.Resize` transform
# (see also :func:`~torchvision.transforms.functional.resize`)
# resizes an image.
resized_imgs = [v2.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot([orig_img] + resized_imgs)
# %%
# CenterCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.CenterCrop` transform
# (see also :func:`~torchvision.transforms.functional.center_crop`)
# crops the given image at the center.
center_crops = [v2.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot([orig_img] + center_crops)
# %%
# FiveCrop
# ~~~~~~~~
# The :class:`~torchvision.transforms.FiveCrop` transform
# (see also :func:`~torchvision.transforms.functional.five_crop`)
# crops the given image into four corners and the central crop.
(top_left, top_right, bottom_left, bottom_right, center) = v2.FiveCrop(size=(100, 100))(orig_img)
plot([orig_img] + [top_left, top_right, bottom_left, bottom_right, center])
# %%
# RandomPerspective
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPerspective` transform
# (see also :func:`~torchvision.transforms.functional.perspective`)
# performs random perspective transform on an image.
perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0)
perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)]
plot([orig_img] + perspective_imgs)
# %%
# RandomRotation
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomRotation` transform
# (see also :func:`~torchvision.transforms.functional.rotate`)
# rotates an image with random angle.
rotater = v2.RandomRotation(degrees=(0, 180))
rotated_imgs = [rotater(orig_img) for _ in range(4)]
plot([orig_img] + rotated_imgs)
# %%
# RandomAffine
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAffine` transform
# (see also :func:`~torchvision.transforms.functional.affine`)
# performs random affine transform on an image.
affine_transformer = v2.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75))
affine_imgs = [affine_transformer(orig_img) for _ in range(4)]
plot([orig_img] + affine_imgs)
# %%
# ElasticTransform
# ~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.ElasticTransform` transform
# (see also :func:`~torchvision.transforms.functional.elastic_transform`)
# randomly transforms the morphology of objects in images and produces a
# see-through-water-like effect.
elastic_transformer = v2.ElasticTransform(alpha=250.0)
transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)]
plot([orig_img] + transformed_imgs)
# %%
# RandomCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomCrop` transform
# (see also :func:`~torchvision.transforms.functional.crop`)
# crops an image at a random location.
cropper = v2.RandomCrop(size=(128, 128))
crops = [cropper(orig_img) for _ in range(4)]
plot([orig_img] + crops)
# %%
# RandomResizedCrop
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomResizedCrop` transform
# (see also :func:`~torchvision.transforms.functional.resized_crop`)
# crops an image at a random location, and then resizes the crop to a given
# size.
resize_cropper = v2.RandomResizedCrop(size=(32, 32))
resized_crops = [resize_cropper(orig_img) for _ in range(4)]
plot([orig_img] + resized_crops)
# %%
# Photometric Transforms
# ----------------------
# Photometric image transformation refers to the process of modifying the photometric properties of an image,
# such as its brightness, contrast, color, or tone.
# These transformations are applied to change the visual appearance of an image
# while preserving its geometric structure.
#
# Except :class:`~torchvision.transforms.Grayscale`, the following transforms are random,
# which means that the same transform
# instance will produce different results each time it transforms a given image.
#
# Grayscale
# ~~~~~~~~~
# The :class:`~torchvision.transforms.Grayscale` transform
# (see also :func:`~torchvision.transforms.functional.to_grayscale`)
# converts an image to grayscale.
gray_img = v2.Grayscale()(orig_img)
plot([orig_img, gray_img], cmap='gray')
# %%
# ColorJitter
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.ColorJitter` transform
# randomly changes the brightness, contrast, saturation, hue, and other properties of an image.
jitter = v2.ColorJitter(brightness=.5, hue=.3)
jittered_imgs = [jitter(orig_img) for _ in range(4)]
plot([orig_img] + jittered_imgs)
# %%
# GaussianBlur
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.GaussianBlur` transform
# (see also :func:`~torchvision.transforms.functional.gaussian_blur`)
# performs gaussian blur transform on an image.
blurrer = v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.))
blurred_imgs = [blurrer(orig_img) for _ in range(4)]
plot([orig_img] + blurred_imgs)
# %%
# RandomInvert
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomInvert` transform
# (see also :func:`~torchvision.transforms.functional.invert`)
# randomly inverts the colors of the given image.
inverter = v2.RandomInvert()
inverted_imgs = [inverter(orig_img) for _ in range(4)]
plot([orig_img] + inverted_imgs)
# %%
# RandomPosterize
# ~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPosterize` transform
# (see also :func:`~torchvision.transforms.functional.posterize`)
# randomly posterizes the image by reducing the number of bits
# of each color channel.
posterizer = v2.RandomPosterize(bits=2)
posterized_imgs = [posterizer(orig_img) for _ in range(4)]
plot([orig_img] + posterized_imgs)
# %%
# RandomSolarize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomSolarize` transform
# (see also :func:`~torchvision.transforms.functional.solarize`)
# randomly solarizes the image by inverting all pixel values above
# the threshold.
solarizer = v2.RandomSolarize(threshold=192.0)
solarized_imgs = [solarizer(orig_img) for _ in range(4)]
plot([orig_img] + solarized_imgs)
# %%
# RandomAdjustSharpness
# ~~~~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAdjustSharpness` transform
# (see also :func:`~torchvision.transforms.functional.adjust_sharpness`)
# randomly adjusts the sharpness of the given image.
sharpness_adjuster = v2.RandomAdjustSharpness(sharpness_factor=2)
sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)]
plot([orig_img] + sharpened_imgs)
# %%
# RandomAutocontrast
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAutocontrast` transform
# (see also :func:`~torchvision.transforms.functional.autocontrast`)
# randomly applies autocontrast to the given image.
autocontraster = v2.RandomAutocontrast()
autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)]
plot([orig_img] + autocontrasted_imgs)
# %%
# RandomEqualize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomEqualize` transform
# (see also :func:`~torchvision.transforms.functional.equalize`)
# randomly equalizes the histogram of the given image.
equalizer = v2.RandomEqualize()
equalized_imgs = [equalizer(orig_img) for _ in range(4)]
plot([orig_img] + equalized_imgs)
# %%
# Augmentation Transforms
# -----------------------
# The following transforms are combinations of multiple transforms,
# either geometric or photometric, or both.
#
# AutoAugment
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.AutoAugment` transform
# automatically augments data based on a given auto-augmentation policy.
# See :class:`~torchvision.transforms.AutoAugmentPolicy` for the available policies.
policies = [v2.AutoAugmentPolicy.CIFAR10, v2.AutoAugmentPolicy.IMAGENET, v2.AutoAugmentPolicy.SVHN]
augmenters = [v2.AutoAugment(policy) for policy in policies]
imgs = [
    [augmenter(orig_img) for _ in range(4)]
    for augmenter in augmenters
]
row_title = [str(policy).split('.')[-1] for policy in policies]
plot([[orig_img] + row for row in imgs], row_title=row_title)
# %%
# RandAugment
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandAugment` transform is an alternate version of AutoAugment.
augmenter = v2.RandAugment()
imgs = [augmenter(orig_img) for _ in range(4)]
plot([orig_img] + imgs)
# %%
# TrivialAugmentWide
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.TrivialAugmentWide` transform is an alternate implementation of AutoAugment.
# However, instead of transforming an image multiple times, it transforms an image only once
# using a random transform from a given list with a random strength number.
augmenter = v2.TrivialAugmentWide()
imgs = [augmenter(orig_img) for _ in range(4)]
plot([orig_img] + imgs)
# %%
# AugMix
# ~~~~~~
# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image.
augmenter = v2.AugMix()
imgs = [augmenter(orig_img) for _ in range(4)]
plot([orig_img] + imgs)
# %%
# Randomly-applied Transforms
# ---------------------------
#
# The following transforms are randomly-applied given a probability ``p``. That is, given ``p = 0.5``,
# there is a 50% chance to return the original image, and a 50% chance to return the transformed image,
# even when called with the same transform instance!
#
# RandomHorizontalFlip
# ~~~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomHorizontalFlip` transform
# (see also :func:`~torchvision.transforms.functional.hflip`)
# performs horizontal flip of an image, with a given probability.
hflipper = v2.RandomHorizontalFlip(p=0.5)
transformed_imgs = [hflipper(orig_img) for _ in range(4)]
plot([orig_img] + transformed_imgs)
# %%
# RandomVerticalFlip
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomVerticalFlip` transform
# (see also :func:`~torchvision.transforms.functional.vflip`)
# performs vertical flip of an image, with a given probability.
vflipper = v2.RandomVerticalFlip(p=0.5)
transformed_imgs = [vflipper(orig_img) for _ in range(4)]
plot([orig_img] + transformed_imgs)
# %%
# RandomApply
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomApply` transform
# randomly applies a list of transforms, with a given probability.
applier = v2.RandomApply(transforms=[v2.RandomCrop(size=(64, 64))], p=0.5)
transformed_imgs = [applier(orig_img) for _ in range(4)]
plot([orig_img] + transformed_imgs)
"""
=============
TVTensors FAQ
=============
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_tv_tensors.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_tv_tensors.py>` to download the full example code.
TVTensors are Tensor subclasses introduced together with
``torchvision.transforms.v2``. This example showcases what these TVTensors are
and how they behave.
.. warning::
**Intended Audience** Unless you're writing your own transforms or your own TVTensors, you
probably do not need to read this guide. This is a fairly low-level topic
that most users will not need to worry about: you do not need to understand
the internals of TVTensors to efficiently rely on
``torchvision.transforms.v2``. It may however be useful for advanced users
trying to implement their own datasets or transforms, or to work directly with
the TVTensors.
"""
# %%
import PIL.Image
import torch
from torchvision import tv_tensors
# %%
# What are TVTensors?
# -------------------
#
# TVTensors are zero-copy tensor subclasses:
tensor = torch.rand(3, 256, 256)
image = tv_tensors.Image(tensor)
assert isinstance(image, torch.Tensor)
assert image.data_ptr() == tensor.data_ptr()
# %%
# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
# for the input data.
#
# :mod:`torchvision.tv_tensors` supports four types of TVTensors:
#
# * :class:`~torchvision.tv_tensors.Image`
# * :class:`~torchvision.tv_tensors.Video`
# * :class:`~torchvision.tv_tensors.BoundingBoxes`
# * :class:`~torchvision.tv_tensors.Mask`
#
# What can I do with a TVTensor?
# ------------------------------
#
# TVTensors look and feel just like regular tensors - they **are** tensors.
# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` or
# any ``torch.*`` operator will also work on TVTensors. See
# :ref:`tv_tensor_unwrapping_behaviour` for a few gotchas.
# %%
# .. _tv_tensor_creation:
#
# How do I construct a TVTensor?
# ------------------------------
#
# Using the constructor
# ^^^^^^^^^^^^^^^^^^^^^
#
# Each TVTensor class takes any tensor-like data that can be turned into a :class:`~torch.Tensor`
image = tv_tensors.Image([[[[0, 1], [1, 0]]]])
print(image)
# %%
# Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
# parameters.
float_image = tv_tensors.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True)
print(float_image)
# %%
# In addition, :class:`~torchvision.tv_tensors.Image` and :class:`~torchvision.tv_tensors.Mask` can also take a
# :class:`PIL.Image.Image` directly:
image = tv_tensors.Image(PIL.Image.open("../assets/astronaut.jpg"))
print(image.shape, image.dtype)
# %%
# Some TVTensors require additional metadata to be passed in order to be constructed. For example,
# :class:`~torchvision.tv_tensors.BoundingBoxes` requires the coordinate format as well as the size of the
# corresponding image (``canvas_size``) alongside the actual values. These
# metadata are required to properly transform the bounding boxes.
bboxes = tv_tensors.BoundingBoxes(
[[17, 16, 344, 495], [0, 10, 0, 10]],
format=tv_tensors.BoundingBoxFormat.XYXY,
canvas_size=image.shape[-2:]
)
print(bboxes)
# %%
# Using ``tv_tensors.wrap()``
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# You can also use the :func:`~torchvision.tv_tensors.wrap` function to wrap a tensor object
# into a TVTensor. This is useful when you already have an object of the
# desired type, which typically happens when writing transforms: you just want
# to wrap the output like the input.
new_bboxes = torch.tensor([0, 20, 30, 40])
new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes)
assert isinstance(new_bboxes, tv_tensors.BoundingBoxes)
assert new_bboxes.canvas_size == bboxes.canvas_size
# %%
# The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass
# it as a parameter to override it.
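#
# For instance (a quick sketch, not part of the original example), the metadata
# can be overridden when wrapping:
smaller_canvas = tv_tensors.wrap(new_bboxes, like=bboxes, canvas_size=(100, 100))
print(f"{smaller_canvas.canvas_size = }")
# %%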
#
# .. _tv_tensor_unwrapping_behaviour:
#
# I had a TVTensor but now I have a Tensor. Help!
# -----------------------------------------------
#
# By default, operations on :class:`~torchvision.tv_tensors.TVTensor` objects
# will return a pure Tensor:
assert isinstance(bboxes, tv_tensors.BoundingBoxes)
# Shift bboxes by 3 pixels in both H and W
new_bboxes = bboxes + 3
assert isinstance(new_bboxes, torch.Tensor)
assert not isinstance(new_bboxes, tv_tensors.BoundingBoxes)
# %%
# .. note::
#
# This behavior only affects native ``torch`` operations. If you are using
# the built-in ``torchvision`` transforms or functionals, you will always get
# as output the same type that you passed as input (pure ``Tensor`` or
# ``TVTensor``).
# %%
# But I want a TVTensor back!
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# You can re-wrap a pure tensor into a TVTensor by just calling the TVTensor
# constructor, or by using the :func:`~torchvision.tv_tensors.wrap` function
# (see more details above in :ref:`tv_tensor_creation`):
new_bboxes = bboxes + 3
new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes)
assert isinstance(new_bboxes, tv_tensors.BoundingBoxes)
# %%
# Alternatively, you can use the :func:`~torchvision.tv_tensors.set_return_type`
# as a global config setting for the whole program, or as a context manager
# (read its docs to learn more about caveats):
with tv_tensors.set_return_type("TVTensor"):
new_bboxes = bboxes + 3
assert isinstance(new_bboxes, tv_tensors.BoundingBoxes)
# %%
# Why is this happening?
# ^^^^^^^^^^^^^^^^^^^^^^
#
# **For performance reasons**. :class:`~torchvision.tv_tensors.TVTensor`
# classes are Tensor subclasses, so any operation involving a
# :class:`~torchvision.tv_tensors.TVTensor` object will go through the
# `__torch_function__
# <https://pytorch.org/docs/stable/notes/extending.html#extending-torch>`_
# protocol. This induces a small overhead, which we want to avoid when possible.
# This doesn't matter for built-in ``torchvision`` transforms because we can
# avoid the overhead there, but it could be a problem in your model's
# ``forward``.
#
# **The alternative isn't much better anyway.** For every operation where
# preserving the :class:`~torchvision.tv_tensors.TVTensor` type makes
# sense, there are just as many operations where returning a pure Tensor is
# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.tv_tensors.Image`?
# If we were to preserve :class:`~torchvision.tv_tensors.TVTensor` types all
# the way, even the model's logits or the output of the loss function would end up
# being of type :class:`~torchvision.tv_tensors.Image`, and surely that's not
# desirable.
#
# .. note::
#
# This behaviour is something we're actively seeking feedback on. If you find this surprising or if you
# have any suggestions on how to better support your use-cases, please reach out to us via this issue:
# https://github.com/pytorch/vision/issues/7319
#
# Exceptions
# ^^^^^^^^^^
#
# There are a few exceptions to this "unwrapping" rule:
# :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`,
# :meth:`torch.Tensor.detach`, and :meth:`~torch.Tensor.requires_grad_` retain
# the TVTensor type.
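# As a quick illustration of the rule above:

assert isinstance(bboxes.clone(), tv_tensors.BoundingBoxes)
assert isinstance(bboxes.to(torch.float64), tv_tensors.BoundingBoxes)

# %%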
#
# Inplace operations on TVTensors like ``obj.add_()`` will preserve the type of
# ``obj``. However, the **returned** value of inplace operations will be a pure
# tensor:
image = tv_tensors.Image([[[0, 1], [1, 0]]])
new_image = image.add_(1).mul_(2)
# image got transformed in-place and is still a TVTensor Image, but new_image
# is a Tensor. They share the same underlying data and they're equal, just
# different classes.
assert isinstance(image, tv_tensors.Image)
print(image)
assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, tv_tensors.Image)
assert (new_image == image).all()
assert new_image.data_ptr() == image.data_ptr()
......@@ -20,6 +20,7 @@ from torchvision.models.efficientnet import (
)
from torchvision.models.googlenet import googlenet
from torchvision.models.inception import inception_v3
from torchvision.models.maxvit import maxvit_t
from torchvision.models.mnasnet import mnasnet0_5, mnasnet0_75, mnasnet1_0, mnasnet1_3
from torchvision.models.mobilenetv2 import mobilenet_v2
from torchvision.models.mobilenetv3 import mobilenet_v3_large, mobilenet_v3_small
......@@ -68,6 +69,17 @@ from torchvision.models.shufflenetv2 import (
shufflenet_v2_x2_0,
)
from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1
from torchvision.models.swin_transformer import swin_b, swin_s, swin_t
from torchvision.models.swin_transformer import swin_b, swin_s, swin_t, swin_v2_b, swin_v2_s, swin_v2_t
from torchvision.models.vgg import vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, vgg19_bn
from torchvision.models.video import (
mc3_18,
mvit_v1_b,
mvit_v2_s,
r2plus1d_18,
r3d_18,
s3d,
swin3d_b,
swin3d_s,
swin3d_t,
)
from torchvision.models.vision_transformer import vit_b_16, vit_b_32, vit_h_14, vit_l_16, vit_l_32
cmake_minimum_required(VERSION 3.4.1)
set(TARGET torchvision_ops)
project(${TARGET} CXX)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17)
set(LIBTORCH_HEADER_ROOT ${LIBTORCH_HEADER_ROOT})
set(LIBRARY_OUTPUT_PATH ../lib)
......
pytorch_version = '1.12.0'
pytorch_version = '2.0.0'
Pod::Spec.new do |s|
s.name = 'LibTorchvision'
s.version = '0.13.0'
s.version = '0.15.1'
s.authors = 'PyTorch Team'
s.license = { :type => 'BSD' }
s.homepage = 'https://github.com/pytorch/vision'
......
## Status
The iOS demo of TorchVision is currently unmaintained, untested and likely out-of-date.
## Torchvision maintainers guide
This document describes the user-facing policies / principles used when
developing and maintaining torchvision. Other maintainer info (e.g. the release
process) can be found in the meta-internal wiki.
### What is public and what is private?
For the Python API, torchvision largely follows the [PyTorch
policy](https://github.com/pytorch/pytorch/wiki/Public-API-definition-and-documentation)
which is consistent with other major packages
([numpy](https://numpy.org/neps/nep-0023-backwards-compatibility.html),
[scikit-learn](https://scikit-learn.org/dev/glossary.html#term-API) etc.).
We recognize that this policy is somewhat imperfect for some edge cases, and that
it's difficult to come up with an accurate technical definition. In broad terms,
which are usually well understood by users, the policy is that:
- modules that can be accessed without leading underscore are public
- objects in a public file that don't have a leading underscore are public
- class attributes are public iff they have no leading underscore
- the rest of the modules / objects / class attributes are considered private
The public API has backward-compatible (BC) guarantees defined in our
deprecation policy (see below). The private API has no BC guarantees.
For C++, code is private. For Meta employees: if a C++ change breaks fbcode, fix
fbcode or revert the change. We should be careful about models running in
production and relying on torchvision ops.
The `test` folder is not importable and is **private.** Even meta-internal
projects should *not* rely on it (it has happened in the past and is now
programmatically impossible).
The training references do not have BC guarantees. Breaking changes are
possible, but we should make sure that the tutorials are still running properly,
and that their intended narrative is preserved (by e.g. checking outputs,
etc.).
The rest of the folders (build, android, ios, etc.) are private and have no BC
guarantees.
### Deprecation policy.
Because they're disruptive, **deprecations should only be used sparingly**.
We largely follow the [PyTorch
policy](https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy):
breaking changes require a deprecation period of at least 2 versions.
Deprecations should clearly indicate their deadline in the docs and warning
messages. Avoid leaving the deadline open-ended or keeping deprecated APIs around for too
long: it gives users no incentive to update their code, sends conflicting
messages ("why was this API removed while this other one is still around?"), and
accumulates debt in the project.
### Should this attribute be public? Should this function be private?
When designing an API it’s not always obvious what should be exposed as public,
and what should be kept as a private implementation detail. The following
guidelines can be useful:
* Functional consistency throughout the library is a top priority, for users’ and
developers’ sake. When in doubt, and unless it’s clearly wrong, expose what other
similar classes expose.
* Think really hard about the users and their use-cases, and try to expose what
they would need to address those use-cases. Aggressively keep everything else
private. Remember that the “private -> public” direction is way smoother than
the “public -> private” one: when in doubt, keep it private.
* When thinking about use-cases, the general API motto applies: make what’s
simple and common easy, and make what’s complex possible (80% / 20% rule).
There might be a ~1% left that’s not addressed: that’s OK. Also, **make what’s
wrong very hard**, if not impossible.
As a good practice, always create new files and even classes with a leading
underscore in their name. This way, everything is private by default and the
only public surface is explicitly present in an `__init__.py` file.
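For illustration, here is a minimal sketch of that layout (the module and class
names below are hypothetical, not actual torchvision code): the implementation
lives in an underscore-prefixed file, and the package's `__init__.py` re-exports
only the intended public names.

```python
# torchvision/some_area/_frobnicate.py -- private module (leading underscore)
class Frobnicator:
    """Public class: users import it from the package, never from this module."""

    def __call__(self, x):
        return x


class _FrobnicateHelper:
    """Private helper: never re-exported, free to change at any time."""


# torchvision/some_area/__init__.py -- the only public surface:
#
#     from ._frobnicate import Frobnicator
#     __all__ = ["Frobnicator"]
```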
......@@ -7,6 +7,38 @@ allow_redefinition = True
no_implicit_optional = True
warn_redundant_casts = True
[mypy-torchvision.prototype.datapoints.*]
; untyped definitions and calls
disallow_untyped_defs = True
; None and Optional handling
no_implicit_optional = True
; warnings
warn_unused_ignores = True
; miscellaneous strictness flags
allow_redefinition = True
[mypy-torchvision.prototype.transforms.*]
; untyped definitions and calls
disallow_untyped_defs = True
; None and Optional handling
no_implicit_optional = True
; warnings
warn_unused_ignores = True
; miscellaneous strictness flags
allow_redefinition = True
[mypy-torchvision.prototype.datasets.*]
ignore_errors = True
[mypy-torchvision.io.image.*]
ignore_errors = True
......
# Building torchvision packages for release
TorchVision release packages are built using `build_wheel.sh` and `build_conda.sh` for all permutations of
supported operating systems, compute platforms and Python versions.
The OS/Python/Compute matrix is defined in https://github.com/pytorch/vision/blob/main/.circleci/regenerate.py
#!/bin/bash
set -ex
PARALLELISM=8
if [ -n "$MAX_JOBS" ]; then
PARALLELISM=$MAX_JOBS
fi
if [[ "$(uname)" != Darwin && "$OSTYPE" != "msys" ]]; then
eval "$(./conda/bin/conda shell.bash hook)"
conda activate ./env
fi
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
. "$script_dir/pkg_helpers.bash"
export BUILD_TYPE=conda
setup_env
export SOURCE_ROOT_DIR="$PWD"
setup_conda_pytorch_constraint
setup_conda_cudatoolkit_plain_constraint
if [[ "$OSTYPE" == "msys" ]]; then
conda install -yq conda-build cmake future
pip install dataclasses
fi
setup_visual_studio_constraint
setup_junit_results_folder
if [[ "$(uname)" == Darwin ]]; then
# TODO: this can be removed as soon as mkl's CMake support works with clang
# see https://github.com/pytorch/vision/pull/4203 for details
MKL_CONSTRAINT='mkl==2021.2.0'
else
MKL_CONSTRAINT=''
fi
if [[ $CONDA_BUILD_VARIANT == "cpu" ]]; then
PYTORCH_MUTEX_CONSTRAINT='pytorch-mutex=1.0=cpu'
else
PYTORCH_MUTEX_CONSTRAINT=''
fi
conda install -yq pytorch=$PYTORCH_VERSION $CONDA_CUDATOOLKIT_CONSTRAINT $PYTORCH_MUTEX_CONSTRAINT $MKL_CONSTRAINT numpy -c nvidia -c "pytorch-${UPLOAD_CHANNEL}"
TORCH_PATH=$(dirname $(python -c "import torch; print(torch.__file__)"))
if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
conda install -yq libpng jpeg
else
yum install -y libpng-devel libjpeg-turbo-devel
fi
if [[ "$OSTYPE" == "msys" ]]; then
source .circleci/unittest/windows/scripts/set_cuda_envs.sh
fi
mkdir cpp_build
pushd cpp_build
# Generate libtorchvision files
cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA
# Compile and install libtorchvision
if [[ "$OSTYPE" == "msys" ]]; then
"$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cmake.bat" $PARALLELISM
CONDA_PATH=$(dirname $(which python))
cp -r "C:/Program Files (x86)/torchvision/include/torchvision" $CONDA_PATH/include
else
make -j$PARALLELISM
make install
if [[ "$(uname)" == Darwin ]]; then
CONDA_PATH=$(dirname $(dirname $(which python)))
cp -r /usr/local/include/torchvision $CONDA_PATH/include/
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
fi
fi
popd
# Install torchvision locally
python setup.py develop
# Trace, compile and run project that uses Faster-RCNN
pushd test/tracing/frcnn
mkdir build
# Trace model
python trace_model.py
cp fasterrcnn_resnet50_fpn.pt build
cd build
cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA
if [[ "$OSTYPE" == "msys" ]]; then
"$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_frcnn.bat" $PARALLELISM
mv fasterrcnn_resnet50_fpn.pt Release
cd Release
export PATH=$(cygpath -w "C:/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64"):$(cygpath -w "C:/Program Files (x86)/torchvision/bin"):$(cygpath -w $TORCH_PATH)/lib:$PATH
else
make -j$PARALLELISM
fi
# Run traced program
./test_frcnn_tracing
# Compile and run the CPP example
popd
cd examples/cpp/hello_world
mkdir build
# Trace model
python trace_model.py
cp resnet18.pt build
cd build
cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch
if [[ "$OSTYPE" == "msys" ]]; then
"$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cpp_example.bat" $PARALLELISM
mv resnet18.pt Release
cd Release
else
make -j$PARALLELISM
fi
# Run CPP example
./hello-world
#!/bin/bash
set -ex
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
. "$script_dir/pkg_helpers.bash"
export BUILD_TYPE=conda
setup_env
export SOURCE_ROOT_DIR="$PWD"
setup_conda_pytorch_constraint
setup_conda_cudatoolkit_constraint
setup_visual_studio_constraint
setup_junit_results_folder
export CUDATOOLKIT_CHANNEL="nvidia"
conda build -c $CUDATOOLKIT_CHANNEL $CONDA_CHANNEL_FLAGS --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torchvision
#!/bin/bash
set -ex
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
. "$script_dir/pkg_helpers.bash"
export BUILD_TYPE=wheel
setup_env
setup_wheel_python
pip_install numpy pyyaml future ninja
pip_install --upgrade setuptools
setup_pip_pytorch_version
python setup.py clean
# Copy binaries to be included in the wheel distribution
if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
python_exec="$(which python)"
bin_path=$(dirname $python_exec)
env_path=$(dirname $bin_path)
if [[ "$(uname)" == Darwin ]]; then
# Install delocate to relocate the required binaries
pip_install "delocate>=0.9"
else
cp "$bin_path/Library/bin/libpng16.dll" torchvision
cp "$bin_path/Library/bin/libjpeg.dll" torchvision
fi
else
# Install auditwheel to get some inspection utilities
pip_install auditwheel
# Point to custom libraries
export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH
export TORCHVISION_INCLUDE=$(pwd)/ext_libraries/include
export TORCHVISION_LIBRARY=$(pwd)/ext_libraries/lib
fi
download_copy_ffmpeg
if [[ "$OSTYPE" == "msys" ]]; then
IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel
else
IS_WHEEL=1 python setup.py bdist_wheel
fi
if [[ "$(uname)" == Darwin ]]; then
pushd dist/
python_exec="$(which python)"
bin_path=$(dirname $python_exec)
env_path=$(dirname $bin_path)
for whl in *.whl; do
DYLD_FALLBACK_LIBRARY_PATH="$env_path/lib/:$DYLD_FALLBACK_LIBRARY_PATH" delocate-wheel -v --ignore-missing-dependencies $whl
done
else
if [[ "$OSTYPE" == "msys" ]]; then
"$script_dir/windows/internal/vc_env_helper.bat" python $script_dir/wheel/relocate.py
else
LD_LIBRARY_PATH="/usr/local/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH" python $script_dir/wheel/relocate.py
fi
fi
# A set of useful bash functions for common functionality we need to do in
# many build scripts
# Setup CUDA environment variables, based on CU_VERSION
#
# Inputs:
# CU_VERSION (e.g. cpu, cu116, cu117)
# NO_CUDA_PACKAGE (bool)
# BUILD_TYPE (conda, wheel)
#
# Outputs:
# VERSION_SUFFIX (e.g., "")
# PYTORCH_VERSION_SUFFIX (e.g., +cpu)
# WHEEL_DIR (e.g., cu100/)
# CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension)
# FORCE_CUDA (respected by torchvision setup.py)
# NVCC_FLAGS (respected by torchvision setup.py)
#
# Precondition: CUDA versions are installed in their conventional locations in
# /usr/local/cuda-*
#
# NOTE: Why VERSION_SUFFIX versus PYTORCH_VERSION_SUFFIX? If you're building
# a package with CUDA on a platform we support CUDA on, VERSION_SUFFIX ==
# PYTORCH_VERSION_SUFFIX and everyone is happy. However, if you are building a
# package with only CPU bits (e.g., torchaudio), then VERSION_SUFFIX is always
# empty, but PYTORCH_VERSION_SUFFIX is +cpu (because that's how you get a CPU
# version of a Python package). But that doesn't apply if you're on OS X,
# since the default CU_VERSION on OS X is cpu.
setup_cuda() {
# First, compute version suffixes. By default, assume no version suffixes
export VERSION_SUFFIX=""
export PYTORCH_VERSION_SUFFIX=""
export WHEEL_DIR=""
# Wheel builds need suffixes (but not if they're on OS X, which never has suffix)
if [[ "$BUILD_TYPE" == "wheel" ]] && [[ "$(uname)" != Darwin ]]; then
export PYTORCH_VERSION_SUFFIX="+$CU_VERSION"
# Match the suffix scheme of pytorch, unless this package does not have
# CUDA builds (in which case, use default)
if [[ -z "$NO_CUDA_PACKAGE" ]]; then
export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX"
export WHEEL_DIR="$CU_VERSION/"
fi
fi
# Now work out the CUDA settings
case "$CU_VERSION" in
cu117)
if [[ "$OSTYPE" == "msys" ]]; then
export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7"
else
export CUDA_HOME=/usr/local/cuda-11.7/
fi
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
;;
cu116)
if [[ "$OSTYPE" == "msys" ]]; then
export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6"
else
export CUDA_HOME=/usr/local/cuda-11.6/
fi
export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
;;
cpu)
;;
rocm*)
export FORCE_CUDA=1
;;
*)
echo "Unrecognized CU_VERSION=$CU_VERSION"
exit 1
;;
esac
if [[ -n "$CUDA_HOME" ]]; then
# Adds nvcc binary to the search path so that CMake's `find_package(CUDA)` will pick the right one
export PATH="$CUDA_HOME/bin:$PATH"
export FORCE_CUDA=1
fi
}
# Populate build version if necessary, and add version suffix
#
# Inputs:
# BUILD_VERSION (e.g., 0.2.0 or empty)
# VERSION_SUFFIX (e.g., +cpu)
#
# Outputs:
# BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu)
#
# Fill BUILD_VERSION if it doesn't exist already with a nightly string
# Usage: setup_build_version 0.2.0
setup_build_version() {
if [[ -z "$BUILD_VERSION" ]]; then
if [[ -z "$1" ]]; then
setup_base_build_version
else
BUILD_VERSION="$1"
fi
BUILD_VERSION="$BUILD_VERSION.dev$(date "+%Y%m%d")$VERSION_SUFFIX"
else
BUILD_VERSION="$BUILD_VERSION$VERSION_SUFFIX"
fi
# Set build version based on tag if on tag
if [[ -n "${CIRCLE_TAG}" ]]; then
# Strip tag
BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}"
fi
export BUILD_VERSION
}
setup_base_build_version() {
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# version.txt for some reason has an `a` character after major.minor.rev;
# the command below yields 0.10.0 from a version.txt containing 0.10.0a0
BUILD_VERSION=$( cut -f 1 -d a "$SCRIPT_DIR/../version.txt" )
export BUILD_VERSION
}
# Set some useful variables for OS X, if applicable
setup_macos() {
if [[ "$(uname)" == Darwin ]]; then
export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++
fi
}
# Top-level entry point for things every package will need to do
#
# Usage: setup_env 0.2.0
setup_env() {
setup_cuda
setup_build_version "$1"
setup_macos
}
# Function to retry functions that sometimes timeout or have flaky failures
retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
# Inputs:
# PYTHON_VERSION (3.7, 3.8, 3.9, 3.10)
# UNICODE_ABI (bool)
#
# Outputs:
# PATH modified to put correct Python version in PATH
#
# Precondition: If Linux, you are in a soumith/manylinux-cuda* Docker image
setup_wheel_python() {
if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
eval "$(conda shell.bash hook)"
conda env remove -n "env$PYTHON_VERSION" || true
conda create ${CONDA_CHANNEL_FLAGS} -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION"
conda activate "env$PYTHON_VERSION"
# Install libpng from Anaconda (defaults)
conda install ${CONDA_CHANNEL_FLAGS} libpng "jpeg<=9b" -y
else
# Install native CentOS libJPEG, freetype and GnuTLS
yum install -y libjpeg-turbo-devel freetype gnutls
case "$PYTHON_VERSION" in
3.7) python_abi=cp37-cp37m ;;
3.8) python_abi=cp38-cp38 ;;
3.9) python_abi=cp39-cp39 ;;
3.10) python_abi=cp310-cp310 ;;
*)
echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION"
exit 1
;;
esac
# Download all the dependencies required to compile image and video_reader
# extensions
mkdir -p ext_libraries
pushd ext_libraries
popd
export PATH="/opt/python/$python_abi/bin:$(pwd)/ext_libraries/bin:$PATH"
fi
}
# Install with pip a bit more robustly than the default
pip_install() {
retry pip install --progress-bar off "$@"
}
# Install torch with pip, respecting PYTORCH_VERSION, and record the installed
# version into PYTORCH_VERSION, if applicable
setup_pip_pytorch_version() {
if [[ -z "$PYTORCH_VERSION" ]]; then
# Install latest prerelease version of torch, per our nightlies, consistent
# with the requested cuda version
pip_install --pre torch -f "https://download.pytorch.org/whl/test/${WHEEL_DIR}torch_test.html"
if [[ "$CUDA_VERSION" == "cpu" ]]; then
# CUDA and CPU are ABI compatible on the CPU-only parts, so strip
# in this case
export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//' | sed 's/+.\+//')"
else
export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//')"
fi
else
pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \
-f "https://download.pytorch.org/whl/${CU_VERSION}/torch_stable.html" \
-f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${CU_VERSION}/torch_${UPLOAD_CHANNEL}.html"
fi
}
# Fill PYTORCH_VERSION with the latest conda nightly version, and
# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions
#
# You MUST have populated PYTORCH_VERSION_SUFFIX before hand.
setup_conda_pytorch_constraint() {
if [[ -z "$PYTORCH_VERSION" ]]; then
export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-test -c pytorch"
PYTHON="python"
# Check if we have python 3 instead and prefer that
if python3 --version >/dev/null 2>/dev/null; then
PYTHON="python3"
fi
export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-test]' | \
${PYTHON} -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \
cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
cuver_2 = (cuver[:-1] + '.' + cuver[-1]).replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
print(re.sub(r'\\+.*$', '', \
[x['version'] for x in json.load(sys.stdin)['pytorch'] \
if (x['platform'] == 'darwin' or cuver_1 in x['fn'] or cuver_2 in x['fn']) \
and 'py' + os.environ['PYTHON_VERSION'] in x['fn']][-1]))")"
if [[ -z "$PYTORCH_VERSION" ]]; then
echo "PyTorch version auto detection failed"
echo "No package found for CU_VERSION=$CU_VERSION and PYTHON_VERSION=$PYTHON_VERSION"
exit 1
fi
else
export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch -c pytorch-${UPLOAD_CHANNEL}"
fi
if [[ "$CU_VERSION" == cpu ]]; then
export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}"
export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION"
else
export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}"
export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}"
fi
if [[ "$OSTYPE" == msys && "$CU_VERSION" == cu92 ]]; then
export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c defaults -c numba/label/dev"
fi
}
# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT
setup_conda_cudatoolkit_constraint() {
export CONDA_BUILD_VARIANT="cuda"
if [[ "$(uname)" == Darwin ]]; then
export CONDA_BUILD_VARIANT="cpu"
else
case "$CU_VERSION" in
cu117)
export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.7 # [not osx]"
;;
cu116)
export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.6 # [not osx]"
;;
cpu)
export CONDA_CUDATOOLKIT_CONSTRAINT=""
export CONDA_BUILD_VARIANT="cpu"
;;
*)
echo "Unrecognized CU_VERSION=$CU_VERSION"
exit 1
;;
esac
fi
}
setup_conda_cudatoolkit_plain_constraint() {
export CONDA_BUILD_VARIANT="cuda"
export CMAKE_USE_CUDA=1
if [[ "$(uname)" == Darwin ]]; then
export CONDA_BUILD_VARIANT="cpu"
export CMAKE_USE_CUDA=0
else
case "$CU_VERSION" in
cu117)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.7"
;;
cu116)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.6"
;;
cpu)
export CONDA_CUDATOOLKIT_CONSTRAINT=""
export CONDA_BUILD_VARIANT="cpu"
export CMAKE_USE_CUDA=0
;;
*)
echo "Unrecognized CU_VERSION=$CU_VERSION"
exit 1
;;
esac
fi
}
# Build the proper compiler package before building the final package
setup_visual_studio_constraint() {
if [[ "$OSTYPE" == "msys" ]]; then
export VSTOOLCHAIN_PACKAGE=vs$VC_YEAR
conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE
cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchvision/conda_build_config.yaml
fi
}
setup_junit_results_folder() {
if [[ "$CI" == "true" ]]; then
export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml"
fi
}
download_copy_ffmpeg() {
if [[ "$OSTYPE" == "msys" ]]; then
# conda install -yq ffmpeg=4.2 -c pytorch
# curl -L -q https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2
# bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=-
# cp Library/bin/*.dll ../torchvision
echo "FFmpeg is disabled currently on Windows"
else
if [[ "$(uname)" == Darwin ]]; then
conda install -yq ffmpeg=4.2 -c pytorch
conda install -yq wget
else
# pushd ext_libraries
# wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2
# tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2
# rm -rf ffmpeg-4.2-hf484d3e_0.tar.bz2
# ldconfig
# which ffmpeg
# popd
echo "FFmpeg is disabled currently on Linux"
fi
fi
}