Unverified commit 408917d1 authored by Nicolas Hug, committed by GitHub

Use `# %%` syntax for gallery examples (#7793)

parent a893f313
@@ -23,7 +23,7 @@ from torchvision import datapoints
from torchvision.transforms.v2 import functional as F
-########################################################################################################################
+# %%
# What are datapoints?
# --------------------
#
@@ -36,7 +36,7 @@ assert isinstance(image, torch.Tensor)
assert image.data_ptr() == tensor.data_ptr()
-########################################################################################################################
+# %%
# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
# for the input data.
#
@@ -59,7 +59,7 @@ image = datapoints.Image([[[[0, 1], [1, 0]]]])
print(image)
-########################################################################################################################
+# %%
# Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
# parameters.
@@ -67,14 +67,14 @@ float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires
print(float_image)
-########################################################################################################################
+# %%
# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a
# :class:`PIL.Image.Image` directly:
image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg"))
print(image.shape, image.dtype)
-########################################################################################################################
+# %%
# In general, the datapoints can also store additional metadata that complements the underlying tensor. For example,
# :class:`~torchvision.datapoints.BoundingBoxes` stores the coordinate format as well as the spatial size of the
# corresponding image alongside the actual values:
@@ -85,7 +85,7 @@ bounding_box = datapoints.BoundingBoxes(
print(bounding_box)
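# A minimal sketch of how such a box datapoint can be constructed (the full call is
# collapsed in the hunk above); the ``format`` and ``spatial_size`` keyword names are
# assumptions based on the surrounding text and may differ in other versions:
bounding_box = datapoints.BoundingBoxes(
    [[17, 16, 344, 495]],
    format=datapoints.BoundingBoxFormat.XYXY,
    spatial_size=image.shape[-2:],
)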
-########################################################################################################################
+# %%
# Do I have to wrap the output of the datasets myself?
# ----------------------------------------------------
#
@@ -120,7 +120,7 @@ class PennFudanDataset(torch.utils.data.Dataset):
...
-########################################################################################################################
+# %%
# 2. Perform the wrapping inside a custom transformation at the beginning of your pipeline:
@@ -144,7 +144,7 @@ def get_transform(train):
transforms.append(T.PILToTensor())
...
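# A hypothetical wrapping transform for option 2, assuming an ``(image, target)``
# sample layout with "boxes" and "masks" entries; adjust to the actual dataset:
class WrapIntoDatapoints:
    def __call__(self, image, target):
        image = datapoints.Image(image)
        target["masks"] = datapoints.Mask(target["masks"])
        # Bounding boxes additionally need their format and the image's spatial size.
        target["boxes"] = datapoints.BoundingBoxes(
            target["boxes"],
            format=datapoints.BoundingBoxFormat.XYXY,
            spatial_size=image.shape[-2:],
        )
        return image, target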
-########################################################################################################################
+# %%
# .. note::
#
# If both :class:`~torchvision.datapoints.BoundingBoxes`'es and :class:`~torchvision.datapoints.Mask`'s are included in
@@ -171,7 +171,7 @@ new_image = image + 0
assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image)
-########################################################################################################################
+# %%
# .. note::
#
# This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you
...
@@ -42,7 +42,7 @@ def plot(imgs, **imshow_kwargs):
plt.tight_layout()
-###################################
+# %%
# Reading Videos Using Torchvision
# --------------------------------
# We will first read a video using :func:`~torchvision.io.read_video`.
@@ -62,7 +62,7 @@ video_url = "https://download.pytorch.org/tutorial/pexelscom_pavel_danilyuk_bask
video_path = Path(tempfile.mkdtemp()) / "basketball.mp4"
_ = urlretrieve(video_url, video_path)
-#########################
+# %%
# :func:`~torchvision.io.read_video` returns the video frames, audio frames and
# the metadata associated with the video. In our case, we only need the video
# frames.
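# A minimal sketch of the read that happens here (collapsed in the next hunk),
# assuming the default :func:`torchvision.io.read_video` API; only frames are kept:
from torchvision.io import read_video

frames, _, _ = read_video(str(video_path), output_format="TCHW")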
@@ -79,7 +79,7 @@ img2_batch = torch.stack([frames[101], frames[151]])
plot(img1_batch)
-#########################
+# %%
# The RAFT model accepts RGB images. We first get the frames from
# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions
# are divisible by 8. Note that we explicitly use ``antialias=False``, because
@@ -104,7 +104,7 @@ img1_batch, img2_batch = preprocess(img1_batch, img2_batch)
print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}")
-####################################
+# %%
# Estimating Optical flow using RAFT
# ----------------------------------
# We will use our RAFT implementation from
@@ -125,7 +125,7 @@ list_of_flows = model(img1_batch.to(device), img2_batch.to(device))
print(f"type = {type(list_of_flows)}")
print(f"length = {len(list_of_flows)} = number of iterations of the model")
-####################################
+# %%
# The RAFT model outputs lists of predicted flows where each entry is a
# (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration"
# in the model. For more details on the iterative nature of the model, please
@@ -144,7 +144,7 @@ print(f"shape = {predicted_flows.shape} = (N, 2, H, W)")
print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}")
-####################################
+# %%
# Visualizing predicted flows
# ---------------------------
# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to
@@ -166,7 +166,7 @@ img1_batch = [(img1 + 1) / 2 for img1 in img1_batch]
grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)]
plot(grid)
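# A minimal sketch of the conversion used above: the final flow estimate is taken
# from the last model iteration and turned into RGB images:
from torchvision.utils import flow_to_image

predicted_flows = list_of_flows[-1]
flow_imgs = flow_to_image(predicted_flows)  # (N, 3, H, W) uint8 images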
-####################################
+# %%
# Bonus: Creating GIFs of predicted flows
# ---------------------------------------
# In the example above we have only shown the predicted flows of 2 pairs of
@@ -187,7 +187,7 @@ plot(grid)
# output_folder = "/tmp/"  # Update this to the folder of your choice
# write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg")
-####################################
+# %%
# Once the .jpg flow images are saved, you can convert them into a video or a
# GIF using ffmpeg with e.g.:
#
...
@@ -36,7 +36,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
-####################################
+# %%
# Masks
# -----
# In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package,
@@ -53,7 +53,7 @@ def show(imgs):
# A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object
# localization tasks.
-####################################
+# %%
# Converting Masks to Bounding Boxes
# -----------------------------------------------
# For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to
@@ -70,7 +70,7 @@ img = read_image(img_path)
mask = read_image(mask_path)
-#########################
+# %%
# Here the masks are represented as a PNG Image, with floating point values.
# Each pixel is encoded as different colors, with 0 being background.
# Notice that the spatial dimensions of image and mask match.
@@ -79,7 +79,7 @@ print(mask.size())
print(img.size())
print(mask)
-############################
+# %%
# We get the unique colors, as these would be the object ids.
obj_ids = torch.unique(mask)
@@ -91,7 +91,7 @@ obj_ids = obj_ids[1:]
# Note that this snippet would work as well if the masks were float values instead of ints.
masks = mask == obj_ids[:, None, None]
-########################
+# %%
# Now the masks are a boolean tensor.
# The first dimension in this case is 3 and denotes the number of instances: there are 3 people in the image.
# The other two dimensions are height and width, which are equal to the dimensions of the image.
@@ -101,7 +101,7 @@ masks = mask == obj_ids[:, None, None]
print(masks.size())
print(masks)
-####################################
+# %%
# Let us visualize an image and plot its corresponding segmentation masks.
# We will use :func:`~torchvision.utils.draw_segmentation_masks` to draw the segmentation masks.
@@ -113,7 +113,7 @@ for mask in masks:
show(drawn_masks)
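# A minimal sketch of the drawing loop collapsed in the hunk above, assuming one
# mask is drawn per instance; the color choice is an assumption:
from torchvision.utils import draw_segmentation_masks

drawn_masks = []
for mask in masks:
    drawn_masks.append(draw_segmentation_masks(img, mask, alpha=0.8, colors="blue"))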
-####################################
+# %%
# To convert the boolean masks into bounding boxes, we will use
# :func:`~torchvision.ops.masks_to_boxes` from the torchvision.ops module.
# It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format.
@@ -124,7 +124,7 @@ boxes = masks_to_boxes(masks)
print(boxes.size())
print(boxes)
-####################################
+# %%
# As the shape denotes, there are 3 boxes, in ``(xmin, ymin, xmax, ymax)`` format.
# These can be visualized very easily with the :func:`~torchvision.utils.draw_bounding_boxes` utility
# provided in :ref:`torchvision.utils <utils>`.
@@ -134,7 +134,7 @@ from torchvision.utils import draw_bounding_boxes
drawn_boxes = draw_bounding_boxes(img, boxes, colors="red")
show(drawn_boxes)
-###################################
+# %%
# These boxes can now directly be used by detection models in torchvision.
# Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
@@ -153,7 +153,7 @@ target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64)
detection_outputs = model(img.unsqueeze(0), [target])
-####################################
+# %%
# Converting Segmentation Dataset to Detection Dataset
# ----------------------------------------------------
#
...
@@ -45,7 +45,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
-####################################
+# %%
# The :func:`~torchvision.io.read_image` function allows you to read an image and
# directly load it as a tensor
@@ -53,7 +53,7 @@ dog1 = read_image(str(Path('assets') / 'dog1.jpg'))
dog2 = read_image(str(Path('assets') / 'dog2.jpg'))
show([dog1, dog2])
-####################################
+# %%
# Transforming images on GPU
# --------------------------
# Most transforms natively support tensors on top of PIL images (to visualize
@@ -76,7 +76,7 @@ transformed_dog1 = transforms(dog1)
transformed_dog2 = transforms(dog2)
show([transformed_dog1, transformed_dog2])
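# The pipeline definition is collapsed in the hunk above; one plausible version,
# assuming an ``nn.Sequential`` of tensor transforms moved to the GPU:
import torch
from torch import nn
import torchvision.transforms as T

device = "cuda" if torch.cuda.is_available() else "cpu"
transforms = nn.Sequential(
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.3),
).to(device)
dog1, dog2 = dog1.to(device), dog2.to(device)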
-####################################
+# %%
# Scriptable transforms for easier deployment via torchscript
# -----------------------------------------------------------
# We now show how to combine image transformations and a model forward pass,
@@ -103,7 +103,7 @@ class Predictor(nn.Module):
return y_pred.argmax(dim=1)
-####################################
+# %%
# Now, let's define scripted and non-scripted instances of ``Predictor`` and
# apply them to multiple tensor images of the same size
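# A minimal sketch of those two instances, assuming ``Predictor`` is the module
# defined above:
predictor = Predictor().to(device)
scripted_predictor = torch.jit.script(predictor).to(device)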
@@ -115,7 +115,7 @@ batch = torch.stack([dog1, dog2]).to(device)
res = predictor(batch)
res_scripted = scripted_predictor(batch)
-####################################
+# %%
# We can verify that the predictions of the scripted and non-scripted models are
# the same:
@@ -128,7 +128,7 @@ for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)):
assert pred == pred_scripted
print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}")
-####################################
+# %%
# Since the model is scripted, it can be easily dumped on disk and re-used
import tempfile
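# A minimal sketch of dumping and re-loading the scripted model, using a temporary
# directory as an assumed location:
with tempfile.TemporaryDirectory() as tmp_dir:
    scripted_predictor.save(f"{tmp_dir}/scripted_predictor.pt")
    restored_predictor = torch.jit.load(f"{tmp_dir}/scripted_predictor.pt")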
...
@@ -50,7 +50,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
plt.tight_layout()
-####################################
+# %%
# Geometric Transforms
# --------------------
# Geometric image transformation refers to the process of altering the geometric properties of an image,
@@ -65,7 +65,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)]
plot(padded_imgs)
-####################################
+# %%
# Resize
# ~~~~~~
# The :class:`~torchvision.transforms.Resize` transform
@@ -74,7 +74,7 @@ plot(padded_imgs)
resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot(resized_imgs)
-####################################
+# %%
# CenterCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.CenterCrop` transform
@@ -83,7 +83,7 @@ plot(resized_imgs)
center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot(center_crops)
-####################################
+# %%
# FiveCrop
# ~~~~~~~~
# The :class:`~torchvision.transforms.FiveCrop` transform
@@ -92,7 +92,7 @@ plot(center_crops)
(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img)
plot([top_left, top_right, bottom_left, bottom_right, center])
-####################################
+# %%
# RandomPerspective
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPerspective` transform
@@ -102,7 +102,7 @@ perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0)
perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)]
plot(perspective_imgs)
-####################################
+# %%
# RandomRotation
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomRotation` transform
@@ -112,7 +112,7 @@ rotater = T.RandomRotation(degrees=(0, 180))
rotated_imgs = [rotater(orig_img) for _ in range(4)]
plot(rotated_imgs)
-####################################
+# %%
# RandomAffine
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAffine` transform
@@ -122,7 +122,7 @@ affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale
affine_imgs = [affine_transfomer(orig_img) for _ in range(4)]
plot(affine_imgs)
-####################################
+# %%
# ElasticTransform
# ~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.ElasticTransform` transform
@@ -133,7 +133,7 @@ elastic_transformer = T.ElasticTransform(alpha=250.0)
transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)]
plot(transformed_imgs)
-####################################
+# %%
# RandomCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomCrop` transform
@@ -143,7 +143,7 @@ cropper = T.RandomCrop(size=(128, 128))
crops = [cropper(orig_img) for _ in range(4)]
plot(crops)
-####################################
+# %%
# RandomResizedCrop
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomResizedCrop` transform
@@ -154,7 +154,7 @@ resize_cropper = T.RandomResizedCrop(size=(32, 32))
resized_crops = [resize_cropper(orig_img) for _ in range(4)]
plot(resized_crops)
-####################################
+# %%
# Photometric Transforms
# ----------------------
# Photometric image transformation refers to the process of modifying the photometric properties of an image,
@@ -174,7 +174,7 @@ plot(resized_crops)
gray_img = T.Grayscale()(orig_img)
plot([gray_img], cmap='gray')
-####################################
+# %%
# ColorJitter
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.ColorJitter` transform
@@ -183,7 +183,7 @@ jitter = T.ColorJitter(brightness=.5, hue=.3)
jitted_imgs = [jitter(orig_img) for _ in range(4)]
plot(jitted_imgs)
-####################################
+# %%
# GaussianBlur
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.GaussianBlur` transform
@@ -193,7 +193,7 @@ blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))
blurred_imgs = [blurrer(orig_img) for _ in range(4)]
plot(blurred_imgs)
-####################################
+# %%
# RandomInvert
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomInvert` transform
@@ -203,7 +203,7 @@ inverter = T.RandomInvert()
invertered_imgs = [inverter(orig_img) for _ in range(4)]
plot(invertered_imgs)
-####################################
+# %%
# RandomPosterize
# ~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPosterize` transform
@@ -214,7 +214,7 @@ posterizer = T.RandomPosterize(bits=2)
posterized_imgs = [posterizer(orig_img) for _ in range(4)]
plot(posterized_imgs)
-####################################
+# %%
# RandomSolarize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomSolarize` transform
@@ -225,7 +225,7 @@ solarizer = T.RandomSolarize(threshold=192.0)
solarized_imgs = [solarizer(orig_img) for _ in range(4)]
plot(solarized_imgs)
-####################################
+# %%
# RandomAdjustSharpness
# ~~~~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAdjustSharpness` transform
@@ -235,7 +235,7 @@ sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2)
sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)]
plot(sharpened_imgs)
-####################################
+# %%
# RandomAutocontrast
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAutocontrast` transform
@@ -245,7 +245,7 @@ autocontraster = T.RandomAutocontrast()
autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)]
plot(autocontrasted_imgs)
-####################################
+# %%
# RandomEqualize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomEqualize` transform
@@ -255,7 +255,7 @@ equalizer = T.RandomEqualize()
equalized_imgs = [equalizer(orig_img) for _ in range(4)]
plot(equalized_imgs)
-####################################
+# %%
# Augmentation Transforms
# -----------------------
# The following transforms are combinations of multiple transforms,
@@ -275,7 +275,7 @@ imgs = [
row_title = [str(policy).split('.')[-1] for policy in policies]
plot(imgs, row_title=row_title)
-####################################
+# %%
# RandAugment
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandAugment` is an alternate version of AutoAugment.
@@ -283,7 +283,7 @@ augmenter = T.RandAugment()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
-####################################
+# %%
# TrivialAugmentWide
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.TrivialAugmentWide` is an alternate implementation of AutoAugment.
@@ -293,7 +293,7 @@ augmenter = T.TrivialAugmentWide()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
-####################################
+# %%
# AugMix
# ~~~~~~
# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image.
@@ -301,7 +301,7 @@ augmenter = T.AugMix()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
-####################################
+# %%
# Randomly-applied Transforms
# ---------------------------
#
@@ -318,7 +318,7 @@ hflipper = T.RandomHorizontalFlip(p=0.5)
transformed_imgs = [hflipper(orig_img) for _ in range(4)]
plot(transformed_imgs)
-####################################
+# %%
# RandomVerticalFlip
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomVerticalFlip` transform
@@ -328,7 +328,7 @@ vflipper = T.RandomVerticalFlip(p=0.5)
transformed_imgs = [vflipper(orig_img) for _ in range(4)]
plot(transformed_imgs)
-####################################
+# %%
# RandomApply
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomApply` transform
...
@@ -36,7 +36,7 @@ def load_data():
return path, image, bounding_boxes, masks, labels
-########################################################################################################################
+# %%
# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation
# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object
# detection or instance and semantic segmentation. Still, the interface is the same, making
@@ -55,7 +55,7 @@ transform = transforms.Compose(
]
)
-########################################################################################################################
+# %%
# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that
# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or
# order.
@@ -70,7 +70,7 @@ new_image, new_bounding_boxes, new_masks, new_labels = transform(
) # Instance Segmentation
new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels})) # Arbitrary Structure
-########################################################################################################################
+# %%
# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the
# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note however, that as
# a regular user, you likely don't have to touch this yourself. See
@@ -84,7 +84,7 @@ new_sample = transform(sample)
assert new_sample["path"] is sample["path"]
-########################################################################################################################
+# %%
# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
# simple heuristic:
...
@@ -47,7 +47,7 @@ from torchvision import models, datasets
import torchvision.transforms.v2 as transforms
-########################################################################################################################
+# %%
# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
# returns, and we'll see how to convert it to a format that is compatible with our new transforms.
@@ -67,7 +67,7 @@ print(type(image))
print(type(target), type(target[0]), list(target[0].keys()))
-########################################################################################################################
+# %%
# The dataset returns a two-tuple with the first item being a :class:`PIL.Image.Image` and second one a list of
# dictionaries, each containing the annotations for a single object instance. As is, this format is not compatible
# with ``torchvision.transforms.v2``, nor with the models. To overcome that, we provide the
@@ -85,13 +85,13 @@ print(type(image))
print(type(target), list(target.keys()))
print(type(target["boxes"]), type(target["labels"]))
-########################################################################################################################
+# %%
# As a baseline, let's have a look at a sample without transformations:
show(sample)
-########################################################################################################################
+# %%
# With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in
# ``torchvision.transforms`` v1, but now handles bounding boxes and masks without any extra configuration.
@@ -107,7 +107,7 @@ transform = transforms.Compose(
]
)
-########################################################################################################################
+# %%
# .. note::
# Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` transform is a no-op in this example, it
# should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
@@ -126,7 +126,7 @@ sample = dataset[0]
show(sample)
-########################################################################################################################
+# %%
# We can see that the color of the image was distorted, we zoomed out on it (off center) and flipped it horizontally.
# In all of this, the bounding box was transformed accordingly. And without any further ado, we can start training.
...
@@ -7,14 +7,14 @@ This example illustrates some of the APIs that torchvision offers for
videos, together with the examples on how to build datasets and more.
"""
-####################################
+# %%
# 1. Introduction: building a new video object and examining the properties
# -------------------------------------------------------------------------
# First we select a video to test the object out. For the sake of argument
# we're using one from the kinetics400 dataset.
# To create it, we need to define the path and the stream we want to use.
-######################################
+# %%
# Chosen video statistics:
#
# - WUzgd7C1pWA.mp4
@@ -42,7 +42,7 @@ download_url(
)
video_path = "./WUzgd7C1pWA.mp4"
-######################################
+# %%
# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form
# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int.
# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered.
@@ -52,7 +52,7 @@ stream = "video"
video = torchvision.io.VideoReader(video_path, stream)
video.get_metadata()
-######################################
+# %%
# Here we can see that the video has two streams - a video and an audio stream.
# Currently available stream types include ['video', 'audio'].
# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id
@@ -61,7 +61,7 @@ video.get_metadata()
# users can access the one they want.
# If only stream type is passed, the decoder auto-detects the first stream of that type and returns it.
-######################################
+# %%
# Let's read all the frames from the video stream. By default, the return value of
# ``next(video_reader)`` is a dict containing the following fields.
#
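# Each item yielded by the reader is a dict; a minimal sketch of the fields used
# below, assuming the ``data``/``pts`` keys of :class:`torchvision.io.VideoReader`:
for item in video:
    frame = item['data']   # the decoded frame tensor
    pts = item['pts']      # presentation timestamp of the frame, in seconds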
@@ -85,7 +85,7 @@ approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0]
print("Approx total number of datapoints we can expect: ", approx_nf)
print("Read data size: ", frames[0].size(0) * len(frames))
-######################################
+# %%
# But what if we only want to read a certain time segment of the video?
# That can be done easily using the combination of our ``seek`` function, and the fact that each call
# to next returns the presentation timestamp of the returned frame in seconds.
@@ -107,7 +107,7 @@ for frame, pts in itertools.islice(video.seek(2), 10):
print("Total number of frames: ", len(frames))
-######################################
+# %%
# Or if we wanted to read from the 2nd to the 5th second,
# we seek into the 2nd second of the video,
# then we utilize ``itertools.takewhile`` to get the
@@ -125,7 +125,7 @@ approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
print("We can expect approx: ", approx_nf)
print("Tensor size: ", frames[0].size())
-####################################
+# %%
# 2. Building a sample read_video function
# ----------------------------------------------------------------------------------------
# We can utilize the methods above to build the read video function that follows
@@ -170,21 +170,21 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au
vf, af, info, meta = example_read_video(video)
print(vf.size(), af.size())
-####################################
+# %%
# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400)
# -------------------------------------------------------------------------------------------------------
# Cool, so now we can use the same principle to make the sample dataset.
# We suggest trying out an iterable dataset for this purpose.
# Here, we are going to build an example dataset that reads 10 randomly selected frames of video.
-####################################
+# %%
# Make sample dataset
import os
os.makedirs("./dataset", exist_ok=True)
os.makedirs("./dataset/1", exist_ok=True)
os.makedirs("./dataset/2", exist_ok=True)
-####################################
+# %%
# Download the videos
from torchvision.datasets.utils import download_url
download_url(
@@ -212,7 +212,7 @@ download_url(
"v_SoccerJuggling_g24_c01.avi"
)
-####################################
+# %%
# Housekeeping and utilities
import os
import random
@@ -232,7 +232,7 @@ def get_samples(root, extensions=(".mp4", ".avi")):
_, class_to_idx = _find_classes(root)
return make_dataset(root, class_to_idx, extensions=extensions)
-####################################
+# %%
# We are going to define the dataset and some basic arguments.
# We assume the structure of the FolderDataset, and add the following parameters:
#
@@ -287,7 +287,7 @@ class RandomDataset(torch.utils.data.IterableDataset):
'end': current_pts}
yield output
-####################################
+# %%
# Given a path of videos in a folder structure, i.e.:
#
# - dataset
@@ -309,7 +309,7 @@ frame_transform = t.Compose(transforms)
dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
-####################################
+# %%
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=12)
data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
@@ -321,7 +321,7 @@ for batch in loader:
data['tensorsize'].append(batch['video'][i].size())
print(data)
-####################################
+# %%
# 4. Data Visualization
# ----------------------------------
# Example of visualized video
@@ -334,7 +334,7 @@ for i in range(16):
plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
plt.axis("off")
-####################################
+# %%
# Clean up the video and dataset:
import os
import shutil
...
@@ -30,7 +30,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
-####################################
+# %%
# Visualizing a grid of images
# ----------------------------
# The :func:`~torchvision.utils.make_grid` function can be used to create a
@@ -48,7 +48,7 @@ dog_list = [dog1_int, dog2_int]
grid = make_grid(dog_list)
show(grid)
-####################################
+# %%
# Visualizing bounding boxes
# --------------------------
# We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
@@ -64,7 +64,7 @@ result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5)
show(result)
-#####################################
+# %%
# Naturally, we can also plot bounding boxes produced by torchvision detection
# models. Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
@@ -85,7 +85,7 @@ model = model.eval()
outputs = model(images)
print(outputs)
-#####################################
+# %%
# Let's plot the boxes detected by our model. We will only plot the boxes with a
# score greater than a given threshold.
@@ -96,7 +96,7 @@ dogs_with_boxes = [
]
show(dogs_with_boxes)
-#####################################
+# %%
# Visualizing segmentation masks
# ------------------------------
# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
@@ -125,7 +125,7 @@ batch = torch.stack([transforms(d) for d in dog_list])
output = model(batch)['out']
print(output.shape, output.min().item(), output.max().item())
-#####################################
+# %%
# As we can see above, the output of the segmentation model is a tensor of shape
# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and
# we can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
@@ -147,7 +147,7 @@ dog_and_boat_masks = [
show(dog_and_boat_masks)
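# A minimal sketch of the normalization described above, turning the raw scores
# into per-class probabilities:
normalized_masks = torch.nn.functional.softmax(output, dim=1)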
##################################### # %%
# As expected, the model is confident about the dog class, but not so much for # As expected, the model is confident about the dog class, but not so much for
# the boat class. # the boat class.
# #
...@@ -162,7 +162,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}") ...@@ -162,7 +162,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
show([m.float() for m in boolean_dog_masks]) show([m.float() for m in boolean_dog_masks])
##################################### # %%
# The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
# can read it as the following query: "For which pixels is 'dog' the most likely # can read it as the following query: "For which pixels is 'dog' the most likely
# class?" # class?"
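For reference, a sketch of what that line could look like, using the ``normalized_masks`` and ``sem_class_to_idx`` names from the sketch above (a reconstruction, not the elided source): the argmax over the class dimension gives the most likely class per pixel, which is then compared against the 'dog' index.

class_dim = 1
boolean_dog_masks = normalized_masks.argmax(class_dim) == sem_class_to_idx["dog"]  # (batch_size, H, W), bool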
...@@ -184,7 +184,7 @@ dogs_with_masks = [ ...@@ -184,7 +184,7 @@ dogs_with_masks = [
] ]
show(dogs_with_masks) show(dogs_with_masks)
##################################### # %%
# We can plot more than one mask per image! Remember that the model returned as # We can plot more than one mask per image! Remember that the model returned as
# many masks as there are classes. Let's ask the same query as above, but this # many masks as there are classes. Let's ask the same query as above, but this
# time for *all* classes, not just the dog class: "For each pixel and each class # time for *all* classes, not just the dog class: "For each pixel and each class
...@@ -204,7 +204,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a ...@@ -204,7 +204,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a
dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6) dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6)
show(dog_with_all_masks) show(dog_with_all_masks)
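A sketch of one way such an "all classes" boolean mask can be built for the first image (again a reconstruction under the same naming assumptions): broadcasting the per-pixel argmax against ``torch.arange`` yields one boolean mask per class.

num_classes = normalized_masks.shape[1]
dog1_masks = normalized_masks[0]  # masks for the first image, shape (num_classes, H, W)
dog1_all_classes_masks = dog1_masks.argmax(dim=0) == torch.arange(num_classes)[:, None, None]  # (num_classes, H, W), bool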
##################################### # %%
# We can see in the image above that only 2 masks were drawn: the mask for the # We can see in the image above that only 2 masks were drawn: the mask for the
# background and the mask for the dog. This is because the model thinks that # background and the mask for the dog. This is because the model thinks that
# only these 2 classes are the most likely ones across all the pixels. If the # only these 2 classes are the most likely ones across all the pixels. If the
...@@ -231,7 +231,7 @@ dogs_with_masks = [ ...@@ -231,7 +231,7 @@ dogs_with_masks = [
show(dogs_with_masks) show(dogs_with_masks)
##################################### # %%
# .. _instance_seg_output: # .. _instance_seg_output:
# #
# Instance segmentation models # Instance segmentation models
...@@ -265,7 +265,7 @@ model = model.eval() ...@@ -265,7 +265,7 @@ model = model.eval()
output = model(images) output = model(images)
print(output) print(output)
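A sketch of how the instance segmentation model above could be instantiated; as before, the weights enum and the preprocessing are assumptions based on the multi-weight API.

from torchvision.models.detection import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights

weights = MaskRCNN_ResNet50_FPN_Weights.DEFAULT
transforms = weights.transforms()
model = maskrcnn_resnet50_fpn(weights=weights, progress=False)
model = model.eval()

images = [transforms(d) for d in dog_list]
output = model(images)  # one dict with 'boxes', 'labels', 'scores' and 'masks' per image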
##################################### # %%
# Let's break this down. For each image in the batch, the model outputs some # Let's break this down. For each image in the batch, the model outputs some
# detections (or instances). The number of detections varies for each input # detections (or instances). The number of detections varies for each input
# image. Each instance is described by its bounding box, its label, its score # image. Each instance is described by its bounding box, its label, its score
...@@ -288,7 +288,7 @@ dog1_masks = dog1_output['masks'] ...@@ -288,7 +288,7 @@ dog1_masks = dog1_output['masks']
print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
f"min = {dog1_masks.min()}, max = {dog1_masks.max()}") f"min = {dog1_masks.min()}, max = {dog1_masks.max()}")
##################################### # %%
# Here the masks correspond to probabilities indicating, for each pixel, how # Here the masks correspond to probabilities indicating, for each pixel, how
# likely it is to belong to the predicted label of that instance. Those # likely it is to belong to the predicted label of that instance. Those
# predicted labels correspond to the 'labels' element in the same output dict. # predicted labels correspond to the 'labels' element in the same output dict.
...@@ -297,7 +297,7 @@ print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " ...@@ -297,7 +297,7 @@ print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
print("For the first dog, the following instances were detected:") print("For the first dog, the following instances were detected:")
print([weights.meta["categories"][label] for label in dog1_output['labels']]) print([weights.meta["categories"][label] for label in dog1_output['labels']])
##################################### # %%
# Interestingly, the model detects two persons in the image. Let's go ahead and # Interestingly, the model detects two persons in the image. Let's go ahead and
# plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks` # plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
# expects boolean masks, we need to convert those probabilities into boolean # expects boolean masks, we need to convert those probabilities into boolean
...@@ -315,14 +315,14 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1) ...@@ -315,14 +315,14 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1)
show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9)) show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9))
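A minimal sketch of the thresholding described above (the 0.5 probability threshold is our choice for illustration): probabilities above the threshold become ``True``, and the singleton channel dimension is removed so the masks have the ``(num_instances, H, W)`` shape that :func:`~torchvision.utils.draw_segmentation_masks` expects.

proba_threshold = 0.5
dog1_bool_masks = dog1_output['masks'] > proba_threshold  # (num_instances, 1, H, W), bool
dog1_bool_masks = dog1_bool_masks.squeeze(1)              # (num_instances, H, W)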
##################################### # %%
# The model seems to have properly detected the dog, but it also confused trees # The model seems to have properly detected the dog, but it also confused trees
# with people. Looking more closely at the scores will help us plot more # with people. Looking more closely at the scores will help us plot more
# relevant masks: # relevant masks:
print(dog1_output['scores']) print(dog1_output['scores'])
##################################### # %%
# Clearly the model is more confident about the dog detection than it is about # Clearly the model is more confident about the dog detection than it is about
# the people detections. That's good news. When plotting the masks, we can ask # the people detections. That's good news. When plotting the masks, we can ask
# for only those that have a good score. Let's use a score threshold of .75 # for only those that have a good score. Let's use a score threshold of .75
...@@ -341,12 +341,12 @@ dogs_with_masks = [ ...@@ -341,12 +341,12 @@ dogs_with_masks = [
] ]
show(dogs_with_masks) show(dogs_with_masks)
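A sketch of the score-based filtering (a reconstruction using the names introduced above, not the elided hunk): instances below the 0.75 score threshold are dropped before the probability masks are binarized and drawn.

score_threshold = .75
boolean_masks = [
    out['masks'][out['scores'] > score_threshold] > proba_threshold
    for out in output
]
dogs_with_masks = [
    draw_segmentation_masks(img, mask.squeeze(1))
    for img, mask in zip(dog_list, boolean_masks)
]
show(dogs_with_masks)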
##################################### # %%
# The two 'people' masks in the first image were not selected because they have # The two 'people' masks in the first image were not selected because they have
# a lower score than the score threshold. Similarly, in the second image, the # a lower score than the score threshold. Similarly, in the second image, the
# instance with class 15 (which corresponds to 'bench') was not selected. # instance with class 15 (which corresponds to 'bench') was not selected.
##################################### # %%
# .. _keypoint_output: # .. _keypoint_output:
# #
# Visualizing keypoints # Visualizing keypoints
...@@ -373,7 +373,7 @@ model = model.eval() ...@@ -373,7 +373,7 @@ model = model.eval()
outputs = model([person_float]) outputs = model([person_float])
print(outputs) print(outputs)
##################################### # %%
# As we can see, the output contains a list of dictionaries. # As we can see, the output contains a list of dictionaries.
# The output list is of length batch_size. # The output list is of length batch_size.
# We currently have just a single image, so the length of the list is 1. # We currently have just a single image, so the length of the list is 1.
...@@ -388,7 +388,7 @@ scores = outputs[0]['scores'] ...@@ -388,7 +388,7 @@ scores = outputs[0]['scores']
print(kpts) print(kpts)
print(scores) print(scores)
##################################### # %%
# The KeypointRCNN model detects two instances in the image. # The KeypointRCNN model detects two instances in the image.
# If you plot the boxes using :func:`~draw_bounding_boxes`, # If you plot the boxes using :func:`~draw_bounding_boxes`,
# you will recognize them as the person and the surfboard. # you will recognize them as the person and the surfboard.
...@@ -402,7 +402,7 @@ keypoints = kpts[idx] ...@@ -402,7 +402,7 @@ keypoints = kpts[idx]
print(keypoints) print(keypoints)
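A minimal sketch of how ``idx`` could be obtained (the 0.75 detection threshold is our choice here): keep only the instances whose score passes the threshold, then index the keypoints with it.

detect_threshold = 0.75
idx = torch.where(scores > detect_threshold)
keypoints = kpts[idx]  # keypoints of the confident detections only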
##################################### # %%
# Great, now we have the keypoints corresponding to the person. # Great, now we have the keypoints corresponding to the person.
# Each keypoint is represented by x, y coordinates and the visibility. # Each keypoint is represented by x, y coordinates and the visibility.
# We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints. # We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints.
...@@ -413,7 +413,7 @@ from torchvision.utils import draw_keypoints ...@@ -413,7 +413,7 @@ from torchvision.utils import draw_keypoints
res = draw_keypoints(person_int, keypoints, colors="blue", radius=3) res = draw_keypoints(person_int, keypoints, colors="blue", radius=3)
show(res) show(res)
##################################### # %%
# As we can see, the keypoints appear as colored circles over the image. # As we can see, the keypoints appear as colored circles over the image.
# The COCO keypoints for a person are ordered and correspond to the following list. # The COCO keypoints for a person are ordered and correspond to the following list.
...@@ -424,7 +424,7 @@ coco_keypoints = [ ...@@ -424,7 +424,7 @@ coco_keypoints = [
"left_knee", "right_knee", "left_ankle", "right_ankle", "left_knee", "right_knee", "left_ankle", "right_ankle",
] ]
##################################### # %%
# What if we are interested in joining the keypoints? # What if we are interested in joining the keypoints?
# This is especially useful for pose estimation or action recognition. # This is especially useful for pose estimation or action recognition.
# We can join the keypoints easily using the ``connectivity`` parameter. # We can join the keypoints easily using the ``connectivity`` parameter.
...@@ -450,7 +450,7 @@ connect_skeleton = [ ...@@ -450,7 +450,7 @@ connect_skeleton = [
(7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16) (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16)
] ]
##################################### # %%
# We pass the above list to the connectivity parameter to connect the keypoints. # We pass the above list to the connectivity parameter to connect the keypoints.
# #
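A minimal sketch of that call (the color, radius and width values are arbitrary choices for illustration): passing ``connect_skeleton`` as ``connectivity`` makes :func:`~torchvision.utils.draw_keypoints` draw a line between each listed pair of keypoints.

res = draw_keypoints(
    person_int, keypoints, connectivity=connect_skeleton, colors="blue", radius=4, width=3
)
show(res)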
......