Unverified Commit 408917d1 authored by Nicolas Hug, committed by GitHub

Use `# %%` syntax for gallery examples (#7793)

parent a893f313
......@@ -23,7 +23,7 @@ from torchvision import datapoints
from torchvision.transforms.v2 import functional as F
########################################################################################################################
# %%
# What are datapoints?
# --------------------
#
......@@ -36,7 +36,7 @@ assert isinstance(image, torch.Tensor)
assert image.data_ptr() == tensor.data_ptr()
########################################################################################################################
# %%
# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
# for the input data.
#
......@@ -59,7 +59,7 @@ image = datapoints.Image([[[[0, 1], [1, 0]]]])
print(image)
########################################################################################################################
# %%
# Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
# parameters.
......@@ -67,14 +67,14 @@ float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires
print(float_image)
########################################################################################################################
# %%
# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a
# :class:`PIL.Image.Image` directly:
image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg"))
print(image.shape, image.dtype)
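# :class:`~torchvision.datapoints.Mask` works the same way; a minimal sketch, assuming a hypothetical
# single-channel mask file is available on disk (the file name below is purely illustrative):
mask = datapoints.Mask(PIL.Image.open("assets/some_segmentation_mask.png"))
print(mask.shape, mask.dtype)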
########################################################################################################################
# %%
# In general, the datapoints can also store additional metadata that complements the underlying tensor. For example,
# :class:`~torchvision.datapoints.BoundingBoxes` stores the coordinate format as well as the spatial size of the
# corresponding image alongside the actual values:
......@@ -85,7 +85,7 @@ bounding_box = datapoints.BoundingBoxes(
print(bounding_box)
########################################################################################################################
# %%
# Do I have to wrap the output of the datasets myself?
# ----------------------------------------------------
#
......@@ -120,7 +120,7 @@ class PennFudanDataset(torch.utils.data.Dataset):
...
########################################################################################################################
# %%
# 2. Perform the wrapping inside a custom transformation at the beginning of your pipeline:
......@@ -144,7 +144,7 @@ def get_transform(train):
transforms.append(T.PILToTensor())
...
########################################################################################################################
# %%
# .. note::
#
# If both :class:`~torchvision.datapoints.BoundingBoxes`'es and :class:`~torchvision.datapoints.Mask`'s are included in
......@@ -171,7 +171,7 @@ new_image = image + 0
assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image)
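# If the datapoint type is needed again, the plain tensor can simply be re-wrapped; a minimal sketch
# reusing the constructor shown above:
new_image = datapoints.Image(new_image)
assert isinstance(new_image, datapoints.Image)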
########################################################################################################################
# %%
# .. note::
#
# This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you
......
......@@ -42,7 +42,7 @@ def plot(imgs, **imshow_kwargs):
plt.tight_layout()
###################################
# %%
# Reading Videos Using Torchvision
# --------------------------------
# We will first read a video using :func:`~torchvision.io.read_video`.
......@@ -62,7 +62,7 @@ video_url = "https://download.pytorch.org/tutorial/pexelscom_pavel_danilyuk_bask
video_path = Path(tempfile.mkdtemp()) / "basketball.mp4"
_ = urlretrieve(video_url, video_path)
#########################
# %%
# :func:`~torchvision.io.read_video` returns the video frames, audio frames and
# the metadata associated with the video. In our case, we only need the video
# frames.
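# A minimal sketch of what that call can look like; the exact invocation used in this example lives in
# the lines not shown here, and ``output_format="TCHW"`` is assumed from the documented
# :func:`~torchvision.io.read_video` signature:
from torchvision.io import read_video

frames, _, _ = read_video(str(video_path), output_format="TCHW")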
......@@ -79,7 +79,7 @@ img2_batch = torch.stack([frames[101], frames[151]])
plot(img1_batch)
#########################
# %%
# The RAFT model accepts RGB images. We first get the frames from
# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions
# are divisible by 8. Note that we explicitly use ``antialias=False``, because
......@@ -104,7 +104,7 @@ img1_batch, img2_batch = preprocess(img1_batch, img2_batch)
print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}")
####################################
# %%
# Estimating Optical flow using RAFT
# ----------------------------------
# We will use our RAFT implementation from
......@@ -125,7 +125,7 @@ list_of_flows = model(img1_batch.to(device), img2_batch.to(device))
print(f"type = {type(list_of_flows)}")
print(f"length = {len(list_of_flows)} = number of iterations of the model")
####################################
# %%
# The RAFT model outputs lists of predicted flows where each entry is a
# (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration"
# in the model. For more details on the iterative nature of the model, please
......@@ -144,7 +144,7 @@ print(f"shape = {predicted_flows.shape} = (N, 2, H, W)")
print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}")
####################################
# %%
# Visualizing predicted flows
# ---------------------------
# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to
......@@ -166,7 +166,7 @@ img1_batch = [(img1 + 1) / 2 for img1 in img1_batch]
grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)]
plot(grid)
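# The flow images in the grid above are presumably produced with :func:`~torchvision.utils.flow_to_image`;
# a minimal sketch of that call (it maps a batch of (N, 2, H, W) float flows to (N, 3, H, W) uint8 images):
from torchvision.utils import flow_to_image

flow_imgs = flow_to_image(predicted_flows)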
####################################
# %%
# Bonus: Creating GIFs of predicted flows
# ---------------------------------------
# In the example above we have only shown the predicted flows of 2 pairs of
......@@ -187,7 +187,7 @@ plot(grid)
# output_folder = "/tmp/" # Update this to the folder of your choice
# write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg")
####################################
# %%
# Once the .jpg flow images are saved, you can convert them into a video or a
# GIF using ffmpeg with e.g.:
#
......
......@@ -36,7 +36,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
####################################
# %%
# Masks
# -----
# In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package,
......@@ -53,7 +53,7 @@ def show(imgs):
# A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object
# localization tasks.
####################################
# %%
# Converting Masks to Bounding Boxes
# -----------------------------------------------
# For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to
......@@ -70,7 +70,7 @@ img = read_image(img_path)
mask = read_image(mask_path)
#########################
# %%
# Here the masks are represented as a PNG image, with floating point values.
# Each pixel is encoded with a different color, with 0 being the background.
# Notice that the spatial dimensions of image and mask match.
......@@ -79,7 +79,7 @@ print(mask.size())
print(img.size())
print(mask)
############################
# %%
# We get the unique colors, as these would be the object ids.
obj_ids = torch.unique(mask)
......@@ -91,7 +91,7 @@ obj_ids = obj_ids[1:]
# Note that this snippet would work as well if the masks were float values instead of ints.
masks = mask == obj_ids[:, None, None]
########################
# %%
# Now the masks are a boolean tensor.
# The first dimension is 3 in this case, denoting the number of instances: there are 3 people in the image.
# The other two dimensions are height and width, which are equal to the dimensions of the image.
......@@ -101,7 +101,7 @@ masks = mask == obj_ids[:, None, None]
print(masks.size())
print(masks)
####################################
# %%
# Let us visualize an image and plot its corresponding segmentation masks.
# We will use the :func:`~torchvision.utils.draw_segmentation_masks` utility to draw the segmentation masks.
......@@ -113,7 +113,7 @@ for mask in masks:
show(drawn_masks)
####################################
# %%
# To convert the boolean masks into bounding boxes, we will use the
# :func:`~torchvision.ops.masks_to_boxes` operation from the torchvision.ops module.
# It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format.
......@@ -124,7 +124,7 @@ boxes = masks_to_boxes(masks)
print(boxes.size())
print(boxes)
####################################
# %%
# As the shape denotes, there are 3 boxes, in ``(xmin, ymin, xmax, ymax)`` format.
# These can be visualized very easily with the :func:`~torchvision.utils.draw_bounding_boxes` utility
# provided in :ref:`torchvision.utils <utils>`.
......@@ -134,7 +134,7 @@ from torchvision.utils import draw_bounding_boxes
drawn_boxes = draw_bounding_boxes(img, boxes, colors="red")
show(drawn_boxes)
###################################
# %%
# These boxes can now directly be used by detection models in torchvision.
# Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
......@@ -153,7 +153,7 @@ target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64)
detection_outputs = model(img.unsqueeze(0), [target])
####################################
# %%
# Converting Segmentation Dataset to Detection Dataset
# ----------------------------------------------------
#
......
......@@ -45,7 +45,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
####################################
# %%
# The :func:`~torchvision.io.read_image` function allows us to read an image and
# directly load it as a tensor
......@@ -53,7 +53,7 @@ dog1 = read_image(str(Path('assets') / 'dog1.jpg'))
dog2 = read_image(str(Path('assets') / 'dog2.jpg'))
show([dog1, dog2])
####################################
# %%
# Transforming images on GPU
# --------------------------
# Most transforms natively support tensors on top of PIL images (to visualize
......@@ -76,7 +76,7 @@ transformed_dog1 = transforms(dog1)
transformed_dog2 = transforms(dog2)
show([transformed_dog1, transformed_dog2])
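# A sketch of what such a device-placed pipeline could look like; the actual ``transforms`` used above
# is defined in the lines not shown here, and the crop size / flip probability below are only illustrative:
import torch
import torch.nn as nn
import torchvision.transforms as T

sketch_device = "cuda" if torch.cuda.is_available() else "cpu"
gpu_transforms = nn.Sequential(
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.3),
).to(sketch_device)
sketch_out = gpu_transforms(dog1.to(sketch_device))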
####################################
# %%
# Scriptable transforms for easier deployment via torchscript
# -----------------------------------------------------------
# We now show how to combine image transformations and a model forward pass,
......@@ -103,7 +103,7 @@ class Predictor(nn.Module):
return y_pred.argmax(dim=1)
####################################
# %%
# Now, let's define scripted and non-scripted instances of ``Predictor`` and
# apply them to multiple tensor images of the same size
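# The two instances are presumably created along these lines; a minimal sketch, where ``Predictor`` is
# the module defined above and :func:`torch.jit.script` is the standard TorchScript entry point:
device = "cuda" if torch.cuda.is_available() else "cpu"
predictor = Predictor().to(device)
scripted_predictor = torch.jit.script(predictor).to(device)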
......@@ -115,7 +115,7 @@ batch = torch.stack([dog1, dog2]).to(device)
res = predictor(batch)
res_scripted = scripted_predictor(batch)
####################################
# %%
# We can verify that the predictions of the scripted and non-scripted models are
# the same:
......@@ -128,7 +128,7 @@ for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)):
assert pred == pred_scripted
print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}")
####################################
# %%
# Since the model is scripted, it can be easily dumped to disk and re-used
import tempfile
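# A minimal sketch of what dumping and re-loading could look like; ``save`` and :func:`torch.jit.load`
# are the standard TorchScript (de)serialization entry points, and the file name is only illustrative:
with tempfile.TemporaryDirectory() as tmp_dir:
    scripted_path = str(Path(tmp_dir) / "scripted_predictor.pt")
    scripted_predictor.save(scripted_path)
    reloaded_predictor = torch.jit.load(scripted_path)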
......
......@@ -50,7 +50,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
plt.tight_layout()
####################################
# %%
# Geometric Transforms
# --------------------
# Geometric image transformation refers to the process of altering the geometric properties of an image,
......@@ -65,7 +65,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)]
plot(padded_imgs)
####################################
# %%
# Resize
# ~~~~~~
# The :class:`~torchvision.transforms.Resize` transform
......@@ -74,7 +74,7 @@ plot(padded_imgs)
resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot(resized_imgs)
####################################
# %%
# CenterCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.CenterCrop` transform
......@@ -83,7 +83,7 @@ plot(resized_imgs)
center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot(center_crops)
####################################
# %%
# FiveCrop
# ~~~~~~~~
# The :class:`~torchvision.transforms.FiveCrop` transform
......@@ -92,7 +92,7 @@ plot(center_crops)
(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img)
plot([top_left, top_right, bottom_left, bottom_right, center])
####################################
# %%
# RandomPerspective
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPerspective` transform
......@@ -102,7 +102,7 @@ perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0)
perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)]
plot(perspective_imgs)
####################################
# %%
# RandomRotation
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomRotation` transform
......@@ -112,7 +112,7 @@ rotater = T.RandomRotation(degrees=(0, 180))
rotated_imgs = [rotater(orig_img) for _ in range(4)]
plot(rotated_imgs)
####################################
# %%
# RandomAffine
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAffine` transform
......@@ -122,7 +122,7 @@ affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale
affine_imgs = [affine_transfomer(orig_img) for _ in range(4)]
plot(affine_imgs)
####################################
# %%
# ElasticTransform
# ~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.ElasticTransform` transform
......@@ -133,7 +133,7 @@ elastic_transformer = T.ElasticTransform(alpha=250.0)
transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)]
plot(transformed_imgs)
####################################
# %%
# RandomCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomCrop` transform
......@@ -143,7 +143,7 @@ cropper = T.RandomCrop(size=(128, 128))
crops = [cropper(orig_img) for _ in range(4)]
plot(crops)
####################################
# %%
# RandomResizedCrop
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomResizedCrop` transform
......@@ -154,7 +154,7 @@ resize_cropper = T.RandomResizedCrop(size=(32, 32))
resized_crops = [resize_cropper(orig_img) for _ in range(4)]
plot(resized_crops)
####################################
# %%
# Photometric Transforms
# ----------------------
# Photometric image transformation refers to the process of modifying the photometric properties of an image,
......@@ -174,7 +174,7 @@ plot(resized_crops)
gray_img = T.Grayscale()(orig_img)
plot([gray_img], cmap='gray')
####################################
# %%
# ColorJitter
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.ColorJitter` transform
......@@ -183,7 +183,7 @@ jitter = T.ColorJitter(brightness=.5, hue=.3)
jitted_imgs = [jitter(orig_img) for _ in range(4)]
plot(jitted_imgs)
####################################
# %%
# GaussianBlur
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.GaussianBlur` transform
......@@ -193,7 +193,7 @@ blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))
blurred_imgs = [blurrer(orig_img) for _ in range(4)]
plot(blurred_imgs)
####################################
# %%
# RandomInvert
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomInvert` transform
......@@ -203,7 +203,7 @@ inverter = T.RandomInvert()
invertered_imgs = [inverter(orig_img) for _ in range(4)]
plot(invertered_imgs)
####################################
# %%
# RandomPosterize
# ~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPosterize` transform
......@@ -214,7 +214,7 @@ posterizer = T.RandomPosterize(bits=2)
posterized_imgs = [posterizer(orig_img) for _ in range(4)]
plot(posterized_imgs)
####################################
# %%
# RandomSolarize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomSolarize` transform
......@@ -225,7 +225,7 @@ solarizer = T.RandomSolarize(threshold=192.0)
solarized_imgs = [solarizer(orig_img) for _ in range(4)]
plot(solarized_imgs)
####################################
# %%
# RandomAdjustSharpness
# ~~~~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAdjustSharpness` transform
......@@ -235,7 +235,7 @@ sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2)
sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)]
plot(sharpened_imgs)
####################################
# %%
# RandomAutocontrast
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAutocontrast` transform
......@@ -245,7 +245,7 @@ autocontraster = T.RandomAutocontrast()
autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)]
plot(autocontrasted_imgs)
####################################
# %%
# RandomEqualize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomEqualize` transform
......@@ -255,7 +255,7 @@ equalizer = T.RandomEqualize()
equalized_imgs = [equalizer(orig_img) for _ in range(4)]
plot(equalized_imgs)
####################################
# %%
# Augmentation Transforms
# -----------------------
# The following transforms are combinations of multiple transforms,
......@@ -275,7 +275,7 @@ imgs = [
row_title = [str(policy).split('.')[-1] for policy in policies]
plot(imgs, row_title=row_title)
####################################
# %%
# RandAugment
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandAugment` transform is an alternative version of AutoAugment.
......@@ -283,7 +283,7 @@ augmenter = T.RandAugment()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
####################################
# %%
# TrivialAugmentWide
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.TrivialAugmentWide` transform is an alternative implementation of AutoAugment.
......@@ -293,7 +293,7 @@ augmenter = T.TrivialAugmentWide()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
####################################
# %%
# AugMix
# ~~~~~~
# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image.
......@@ -301,7 +301,7 @@ augmenter = T.AugMix()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
####################################
# %%
# Randomly-applied Transforms
# ---------------------------
#
......@@ -318,7 +318,7 @@ hflipper = T.RandomHorizontalFlip(p=0.5)
transformed_imgs = [hflipper(orig_img) for _ in range(4)]
plot(transformed_imgs)
####################################
# %%
# RandomVerticalFlip
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomVerticalFlip` transform
......@@ -328,7 +328,7 @@ vflipper = T.RandomVerticalFlip(p=0.5)
transformed_imgs = [vflipper(orig_img) for _ in range(4)]
plot(transformed_imgs)
####################################
# %%
# RandomApply
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomApply` transform
......
......@@ -36,7 +36,7 @@ def load_data():
return path, image, bounding_boxes, masks, labels
########################################################################################################################
# %%
# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation
# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object
# detection or instance and semantic segmentation. Still, the interface is the same, making
......@@ -55,7 +55,7 @@ transform = transforms.Compose(
]
)
########################################################################################################################
# %%
# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that
# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or
# order.
......@@ -70,7 +70,7 @@ new_image, new_bounding_boxes, new_masks, new_labels = transform(
) # Instance Segmentation
new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels})) # Arbitrary Structure
########################################################################################################################
# %%
# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the
# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note, however, that as a
# regular user, you likely don't have to touch this yourself. See
......@@ -84,7 +84,7 @@ new_sample = transform(sample)
assert new_sample["path"] is sample["path"]
########################################################################################################################
# %%
# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
# simple heuristic:
......
......@@ -47,7 +47,7 @@ from torchvision import models, datasets
import torchvision.transforms.v2 as transforms
########################################################################################################################
# %%
# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
# returns, and we'll see how to convert it to a format that is compatible with our new transforms.
......@@ -67,7 +67,7 @@ print(type(image))
print(type(target), type(target[0]), list(target[0].keys()))
########################################################################################################################
# %%
# The dataset returns a two-tuple with the first item being a :class:`PIL.Image.Image` and the second one a list of
# dictionaries, each containing the annotations for a single object instance. As is, this format is compatible neither
# with ``torchvision.transforms.v2`` nor with the models. To overcome that, we provide the
......@@ -85,13 +85,13 @@ print(type(image))
print(type(target), list(target.keys()))
print(type(target["boxes"]), type(target["labels"]))
########################################################################################################################
# %%
# As a baseline, let's have a look at a sample without transformations:
show(sample)
########################################################################################################################
# %%
# With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in
# ``torchvision.transforms`` v1, but now handles bounding boxes and masks without any extra configuration.
......@@ -107,7 +107,7 @@ transform = transforms.Compose(
]
)
########################################################################################################################
# %%
# .. note::
#    Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` transform is a no-op in this example, it
# should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
......@@ -126,7 +126,7 @@ sample = dataset[0]
show(sample)
########################################################################################################################
# %%
# We can see that the color of the image was distorted, that we zoomed out on it (off center), and that it was flipped horizontally.
# In all of this, the bounding box was transformed accordingly. And without any further ado, we can start training.
......
......@@ -7,14 +7,14 @@ This example illustrates some of the APIs that torchvision offers for
videos, together with the examples on how to build datasets and more.
"""
####################################
# %%
# 1. Introduction: building a new video object and examining the properties
# -------------------------------------------------------------------------
# First we select a video to test the object out. For the sake of argument
# we're using one from the kinetics400 dataset.
# To create it, we need to define the path and the stream we want to use.
######################################
# %%
# Chosen video statistics:
#
# - WUzgd7C1pWA.mp4
......@@ -42,7 +42,7 @@ download_url(
)
video_path = "./WUzgd7C1pWA.mp4"
######################################
# %%
# Streams are defined in a similar fashion to torch devices. We encode them as strings in the form
# ``stream_type:stream_id``, where ``stream_type`` is a string and ``stream_id`` a long int.
# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered.
......@@ -52,7 +52,7 @@ stream = "video"
video = torchvision.io.VideoReader(video_path, stream)
video.get_metadata()
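# A stream can also be addressed explicitly by its id; a minimal sketch following the
# ``stream_type:stream_id`` convention described above (``"video:0"`` selects the first video stream):
video_first_stream = torchvision.io.VideoReader(video_path, "video:0")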
######################################
# %%
# Here we can see that the video has two streams - a video and an audio stream.
# Currently available stream types include ['video', 'audio'].
# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id
......@@ -61,7 +61,7 @@ video.get_metadata()
# users can access the one they want.
# If only the stream type is passed, the decoder auto-detects the first stream of that type and returns it.
######################################
# %%
# Let's read all the frames from the video stream. By default, the return value of
# ``next(video_reader)`` is a dict containing the following fields.
#
......@@ -85,7 +85,7 @@ approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0]
print("Approx total number of datapoints we can expect: ", approx_nf)
print("Read data size: ", frames[0].size(0) * len(frames))
######################################
# %%
# But what if we only want to read a certain time segment of the video?
# That can be done easily by combining our ``seek`` function with the fact that each call
# to next returns the presentation timestamp of the returned frame in seconds.
......@@ -107,7 +107,7 @@ for frame, pts in itertools.islice(video.seek(2), 10):
print("Total number of frames: ", len(frames))
######################################
# %%
# Or, if we wanted to read from the 2nd to the 5th second:
# we seek into the 2nd second of the video,
# then we utilize ``itertools.takewhile`` to get the
......@@ -125,7 +125,7 @@ approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
print("We can expect approx: ", approx_nf)
print("Tensor size: ", frames[0].size())
####################################
# %%
# 2. Building a sample read_video function
# ----------------------------------------------------------------------------------------
# We can utilize the methods above to build the read video function that follows
......@@ -170,21 +170,21 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au
vf, af, info, meta = example_read_video(video)
print(vf.size(), af.size())
####################################
# %%
# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400)
# -------------------------------------------------------------------------------------------------------
# Cool, so now we can use the same principle to make the sample dataset.
# We suggest trying out an iterable dataset for this purpose.
# Here, we are going to build an example dataset that reads 10 randomly selected frames of video.
####################################
# %%
# Make sample dataset
import os
os.makedirs("./dataset", exist_ok=True)
os.makedirs("./dataset/1", exist_ok=True)
os.makedirs("./dataset/2", exist_ok=True)
####################################
# %%
# Download the videos
from torchvision.datasets.utils import download_url
download_url(
......@@ -212,7 +212,7 @@ download_url(
"v_SoccerJuggling_g24_c01.avi"
)
####################################
# %%
# Housekeeping and utilities
import os
import random
......@@ -232,7 +232,7 @@ def get_samples(root, extensions=(".mp4", ".avi")):
_, class_to_idx = _find_classes(root)
return make_dataset(root, class_to_idx, extensions=extensions)
####################################
# %%
# We are going to define the dataset and some basic arguments.
# We assume the structure of the FolderDataset, and add the following parameters:
#
......@@ -287,7 +287,7 @@ class RandomDataset(torch.utils.data.IterableDataset):
'end': current_pts}
yield output
####################################
# %%
# Given a path of videos in a folder structure, i.e.:
#
# - dataset
......@@ -309,7 +309,7 @@ frame_transform = t.Compose(transforms)
dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
####################################
# %%
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=12)
data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
......@@ -321,7 +321,7 @@ for batch in loader:
data['tensorsize'].append(batch['video'][i].size())
print(data)
####################################
# %%
# 4. Data Visualization
# ----------------------------------
# Example of visualized video
......@@ -334,7 +334,7 @@ for i in range(16):
plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
plt.axis("off")
####################################
# %%
# Clean up the video and dataset:
import os
import shutil
......
......@@ -30,7 +30,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
####################################
# %%
# Visualizing a grid of images
# ----------------------------
# The :func:`~torchvision.utils.make_grid` function can be used to create a
......@@ -48,7 +48,7 @@ dog_list = [dog1_int, dog2_int]
grid = make_grid(dog_list)
show(grid)
####################################
# %%
# Visualizing bounding boxes
# --------------------------
# We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
......@@ -64,7 +64,7 @@ result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5)
show(result)
#####################################
# %%
# Naturally, we can also plot bounding boxes produced by torchvision detection
# models. Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
......@@ -85,7 +85,7 @@ model = model.eval()
outputs = model(images)
print(outputs)
#####################################
# %%
# Let's plot the boxes detected by our model. We will only plot the boxes with a
# score greater than a given threshold.
......@@ -96,7 +96,7 @@ dogs_with_boxes = [
]
show(dogs_with_boxes)
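# A sketch of the thresholding described above; 0.8 is an arbitrary score threshold chosen for
# illustration, and ``outputs`` is the Faster R-CNN output from the previous cell:
score_threshold = .8
boxes_above_threshold = [
    draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > score_threshold], width=4)
    for dog_int, output in zip(dog_list, outputs)
]
show(boxes_above_threshold)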
#####################################
# %%
# Visualizing segmentation masks
# ------------------------------
# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
......@@ -125,7 +125,7 @@ batch = torch.stack([transforms(d) for d in dog_list])
output = model(batch)['out']
print(output.shape, output.min().item(), output.max().item())
#####################################
# %%
# As we can see above, the output of the segmentation model is a tensor of shape
# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and
# we can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
......@@ -147,7 +147,7 @@ dog_and_boat_masks = [
show(dog_and_boat_masks)
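# The normalization step described above presumably boils down to a softmax over the class dimension;
# a minimal sketch (``output`` is the segmentation output computed earlier):
normalized_masks = torch.nn.functional.softmax(output, dim=1)
print(normalized_masks.shape, normalized_masks.min().item(), normalized_masks.max().item())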
#####################################
# %%
# As expected, the model is confident about the dog class, but not so much for
# the boat class.
#
......@@ -162,7 +162,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
show([m.float() for m in boolean_dog_masks])
#####################################
# %%
# The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
# can read it as the following query: "For which pixels is 'dog' the most likely
# class?"
......@@ -184,7 +184,7 @@ dogs_with_masks = [
]
show(dogs_with_masks)
#####################################
# %%
# We can plot more than one mask per image! Remember that the model returned as
# many masks as there are classes. Let's ask the same query as above, but this
# time for *all* classes, not just the dog class: "For each pixel and each class
......@@ -204,7 +204,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a
dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6)
show(dog_with_all_masks)
#####################################
# %%
# We can see in the image above that only 2 masks were drawn: the mask for the
# background and the mask for the dog. This is because the model thinks that
# only these 2 classes are the most likely ones across all the pixels. If the
......@@ -231,7 +231,7 @@ dogs_with_masks = [
show(dogs_with_masks)
#####################################
# %%
# .. _instance_seg_output:
#
# Instance segmentation models
......@@ -265,7 +265,7 @@ model = model.eval()
output = model(images)
print(output)
#####################################
# %%
# Let's break this down. For each image in the batch, the model outputs some
# detections (or instances). The number of detections varies for each input
# image. Each instance is described by its bounding box, its label, its score
......@@ -288,7 +288,7 @@ dog1_masks = dog1_output['masks']
print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
f"min = {dog1_masks.min()}, max = {dog1_masks.max()}")
#####################################
# %%
# Here the masks correspond to probabilities indicating, for each pixel, how
# likely it is to belong to the predicted label of that instance. Those
# predicted labels correspond to the 'labels' element in the same output dict.
......@@ -297,7 +297,7 @@ print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
print("For the first dog, the following instances were detected:")
print([weights.meta["categories"][label] for label in dog1_output['labels']])
#####################################
# %%
# Interestingly, the model detects two persons in the image. Let's go ahead and
# plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
# expects boolean masks, we need to convert those probabilities into boolean
......@@ -315,14 +315,14 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1)
show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9))
#####################################
# %%
# The model seems to have properly detected the dog, but it also confused trees
# with people. Looking more closely at the scores will help us plot more
# relevant masks:
print(dog1_output['scores'])
#####################################
# %%
# Clearly the model is more confident about the dog detection than it is about
# the people detections. That's good news. When plotting the masks, we can ask
# for only those that have a good score. Let's use a score threshold of .75
......@@ -341,12 +341,12 @@ dogs_with_masks = [
]
show(dogs_with_masks)
#####################################
# %%
# The two 'people' masks in the first image were not selected because they have
# a lower score than the score threshold. Similarly, in the second image, the
# instance with class 15 (which corresponds to 'bench') was not selected.
#####################################
# %%
# .. _keypoint_output:
#
# Visualizing keypoints
......@@ -373,7 +373,7 @@ model = model.eval()
outputs = model([person_float])
print(outputs)
#####################################
# %%
# As we can see, the output contains a list of dictionaries.
# The output list is of length batch_size.
# We currently have just a single image, so the length of the list is 1.
......@@ -388,7 +388,7 @@ scores = outputs[0]['scores']
print(kpts)
print(scores)
#####################################
# %%
# The KeypointRCNN model detects that there are two instances in the image.
# If you plot the boxes using :func:`~draw_bounding_boxes`,
# you will recognize that they are the person and the surfboard.
......@@ -402,7 +402,7 @@ keypoints = kpts[idx]
print(keypoints)
#####################################
# %%
# Great, now we have the keypoints corresponding to the person.
# Each keypoint is represented by x, y coordinates and the visibility.
# We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints.
......@@ -413,7 +413,7 @@ from torchvision.utils import draw_keypoints
res = draw_keypoints(person_int, keypoints, colors="blue", radius=3)
show(res)
#####################################
# %%
# As we can see, the keypoints appear as colored circles over the image.
# The COCO keypoints for a person are ordered and represent the following list.\
......@@ -424,7 +424,7 @@ coco_keypoints = [
"left_knee", "right_knee", "left_ankle", "right_ankle",
]
#####################################
# %%
# What if we are interested in joining the keypoints?
# This is especially useful for pose detection or action recognition.
# We can join the keypoints easily using the `connectivity` parameter.
......@@ -450,7 +450,7 @@ connect_skeleton = [
(7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16)
]
#####################################
# %%
# We pass the above list to the connectivity parameter to connect the keypoints.
#
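# A minimal sketch of such a call; the ``radius`` and ``width`` values are only illustrative:
res = draw_keypoints(
    person_int, keypoints, connectivity=connect_skeleton, colors="blue", radius=4, width=3
)
show(res)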
......