Unverified Commit 408917d1 authored by Nicolas Hug, committed by GitHub

Use `# %%` syntax for gallery examples (#7793)

parent a893f313
......@@ -23,7 +23,7 @@ from torchvision import datapoints
from torchvision.transforms.v2 import functional as F
########################################################################################################################
# %%
# What are datapoints?
# --------------------
#
......@@ -36,7 +36,7 @@ assert isinstance(image, torch.Tensor)
assert image.data_ptr() == tensor.data_ptr()
########################################################################################################################
# %%
# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
# for the input data.
#
......@@ -59,7 +59,7 @@ image = datapoints.Image([[[[0, 1], [1, 0]]]])
print(image)
########################################################################################################################
# %%
# Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
# parameters.
......@@ -67,14 +67,14 @@ float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires
print(float_image)
########################################################################################################################
# %%
# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a
# :class:`PIL.Image.Image` directly:
image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg"))
print(image.shape, image.dtype)
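# :class:`~torchvision.datapoints.Mask` works the same way; a minimal sketch, assuming a hypothetical
# single-channel mask file is available on disk (the file name below is purely illustrative):
mask = datapoints.Mask(PIL.Image.open("assets/some_segmentation_mask.png"))
print(mask.shape, mask.dtype)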
########################################################################################################################
# %%
# In general, the datapoints can also store additional metadata that complements the underlying tensor. For example,
# :class:`~torchvision.datapoints.BoundingBoxes` stores the coordinate format as well as the spatial size of the
# corresponding image alongside the actual values:
......@@ -85,7 +85,7 @@ bounding_box = datapoints.BoundingBoxes(
print(bounding_box)
########################################################################################################################
# %%
# Do I have to wrap the output of the datasets myself?
# ----------------------------------------------------
#
......@@ -120,7 +120,7 @@ class PennFudanDataset(torch.utils.data.Dataset):
...
########################################################################################################################
# %%
# 2. Perform the wrapping inside a custom transformation at the beginning of your pipeline:
......@@ -144,7 +144,7 @@ def get_transform(train):
transforms.append(T.PILToTensor())
...
########################################################################################################################
# %%
# .. note::
#
# If both :class:`~torchvision.datapoints.BoundingBoxes`'es and :class:`~torchvision.datapoints.Mask`'s are included in
......@@ -171,7 +171,7 @@ new_image = image + 0
assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image)
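# If the datapoint type is needed again, the plain tensor can simply be re-wrapped; a minimal sketch
# reusing the constructor shown above:
new_image = datapoints.Image(new_image)
assert isinstance(new_image, datapoints.Image)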
########################################################################################################################
# %%
# .. note::
#
# This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you
......
......@@ -42,7 +42,7 @@ def plot(imgs, **imshow_kwargs):
plt.tight_layout()
###################################
# %%
# Reading Videos Using Torchvision
# --------------------------------
# We will first read a video using :func:`~torchvision.io.read_video`.
......@@ -62,7 +62,7 @@ video_url = "https://download.pytorch.org/tutorial/pexelscom_pavel_danilyuk_bask
video_path = Path(tempfile.mkdtemp()) / "basketball.mp4"
_ = urlretrieve(video_url, video_path)
#########################
# %%
# :func:`~torchvision.io.read_video` returns the video frames, audio frames and
# the metadata associated with the video. In our case, we only need the video
# frames.
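# A minimal sketch of what that call can look like; the exact invocation used in this example lives in
# the lines not shown here, and ``output_format="TCHW"`` is assumed from the documented
# :func:`~torchvision.io.read_video` signature:
from torchvision.io import read_video

frames, _, _ = read_video(str(video_path), output_format="TCHW")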
......@@ -79,7 +79,7 @@ img2_batch = torch.stack([frames[101], frames[151]])
plot(img1_batch)
#########################
# %%
# The RAFT model accepts RGB images. We first get the frames from
# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions
# are divisible by 8. Note that we explicitly use ``antialias=False``, because
......@@ -104,7 +104,7 @@ img1_batch, img2_batch = preprocess(img1_batch, img2_batch)
print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}")
####################################
# %%
# Estimating Optical flow using RAFT
# ----------------------------------
# We will use our RAFT implementation from
......@@ -125,7 +125,7 @@ list_of_flows = model(img1_batch.to(device), img2_batch.to(device))
print(f"type = {type(list_of_flows)}")
print(f"length = {len(list_of_flows)} = number of iterations of the model")
####################################
# %%
# The RAFT model outputs lists of predicted flows where each entry is a
# (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration"
# in the model. For more details on the iterative nature of the model, please
......@@ -144,7 +144,7 @@ print(f"shape = {predicted_flows.shape} = (N, 2, H, W)")
print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}")
####################################
# %%
# Visualizing predicted flows
# ---------------------------
# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to
......@@ -166,7 +166,7 @@ img1_batch = [(img1 + 1) / 2 for img1 in img1_batch]
grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)]
plot(grid)
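# The flow images in the grid above are presumably produced with :func:`~torchvision.utils.flow_to_image`;
# a minimal sketch of that call (it maps a batch of (N, 2, H, W) float flows to (N, 3, H, W) uint8 images):
from torchvision.utils import flow_to_image

flow_imgs = flow_to_image(predicted_flows)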
####################################
# %%
# Bonus: Creating GIFs of predicted flows
# ---------------------------------------
# In the example above we have only shown the predicted flows of 2 pairs of
......@@ -187,7 +187,7 @@ plot(grid)
# output_folder = "/tmp/" # Update this to the folder of your choice
# write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg")
####################################
# %%
# Once the .jpg flow images are saved, you can convert them into a video or a
# GIF using ffmpeg with e.g.:
#
......
......@@ -36,7 +36,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
####################################
# %%
# Masks
# -----
# In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package,
......@@ -53,7 +53,7 @@ def show(imgs):
# A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object
# localization tasks.
####################################
# %%
# Converting Masks to Bounding Boxes
# -----------------------------------------------
# For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to
......@@ -70,7 +70,7 @@ img = read_image(img_path)
mask = read_image(mask_path)
#########################
# %%
# Here the masks are represented as a PNG image, with floating point values.
# Each pixel is encoded with a different color, with 0 being the background.
# Notice that the spatial dimensions of image and mask match.
......@@ -79,7 +79,7 @@ print(mask.size())
print(img.size())
print(mask)
############################
# %%
# We get the unique colors, as these would be the object ids.
obj_ids = torch.unique(mask)
......@@ -91,7 +91,7 @@ obj_ids = obj_ids[1:]
# Note that this snippet would work as well if the masks were float values instead of ints.
masks = mask == obj_ids[:, None, None]
########################
# %%
# Now the masks are a boolean tensor.
# The first dimension is 3 in this case, denoting the number of instances: there are 3 people in the image.
# The other two dimensions are height and width, which are equal to the dimensions of the image.
......@@ -101,7 +101,7 @@ masks = mask == obj_ids[:, None, None]
print(masks.size())
print(masks)
####################################
# %%
# Let us visualize an image and plot its corresponding segmentation masks.
# We will use the :func:`~torchvision.utils.draw_segmentation_masks` utility to draw the segmentation masks.
......@@ -113,7 +113,7 @@ for mask in masks:
show(drawn_masks)
####################################
# %%
# To convert the boolean masks into bounding boxes, we will use the
# :func:`~torchvision.ops.masks_to_boxes` operation from the torchvision.ops module.
# It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format.
......@@ -124,7 +124,7 @@ boxes = masks_to_boxes(masks)
print(boxes.size())
print(boxes)
####################################
# %%
# As the shape denotes, there are 3 boxes, in ``(xmin, ymin, xmax, ymax)`` format.
# These can be visualized very easily with the :func:`~torchvision.utils.draw_bounding_boxes` utility
# provided in :ref:`torchvision.utils <utils>`.
......@@ -134,7 +134,7 @@ from torchvision.utils import draw_bounding_boxes
drawn_boxes = draw_bounding_boxes(img, boxes, colors="red")
show(drawn_boxes)
###################################
# %%
# These boxes can now directly be used by detection models in torchvision.
# Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
......@@ -153,7 +153,7 @@ target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64)
detection_outputs = model(img.unsqueeze(0), [target])
####################################
# %%
# Converting Segmentation Dataset to Detection Dataset
# ----------------------------------------------------
#
......
......@@ -45,7 +45,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
####################################
# %%
# The :func:`~torchvision.io.read_image` function allows us to read an image and
# directly load it as a tensor
......@@ -53,7 +53,7 @@ dog1 = read_image(str(Path('assets') / 'dog1.jpg'))
dog2 = read_image(str(Path('assets') / 'dog2.jpg'))
show([dog1, dog2])
####################################
# %%
# Transforming images on GPU
# --------------------------
# Most transforms natively support tensors on top of PIL images (to visualize
......@@ -76,7 +76,7 @@ transformed_dog1 = transforms(dog1)
transformed_dog2 = transforms(dog2)
show([transformed_dog1, transformed_dog2])
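# A sketch of what such a device-placed pipeline could look like; the actual ``transforms`` used above
# is defined in the lines not shown here, and the crop size / flip probability below are only illustrative:
import torch
import torch.nn as nn
import torchvision.transforms as T

sketch_device = "cuda" if torch.cuda.is_available() else "cpu"
gpu_transforms = nn.Sequential(
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.3),
).to(sketch_device)
sketch_out = gpu_transforms(dog1.to(sketch_device))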
####################################
# %%
# Scriptable transforms for easier deployment via torchscript
# -----------------------------------------------------------
# We now show how to combine image transformations and a model forward pass,
......@@ -103,7 +103,7 @@ class Predictor(nn.Module):
return y_pred.argmax(dim=1)
####################################
# %%
# Now, let's define scripted and non-scripted instances of ``Predictor`` and
# apply them to multiple tensor images of the same size
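# The two instances are presumably created along these lines; a minimal sketch, where ``Predictor`` is
# the module defined above and :func:`torch.jit.script` is the standard TorchScript entry point:
device = "cuda" if torch.cuda.is_available() else "cpu"
predictor = Predictor().to(device)
scripted_predictor = torch.jit.script(predictor).to(device)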
......@@ -115,7 +115,7 @@ batch = torch.stack([dog1, dog2]).to(device)
res = predictor(batch)
res_scripted = scripted_predictor(batch)
####################################
# %%
# We can verify that the predictions of the scripted and non-scripted models are
# the same:
......@@ -128,7 +128,7 @@ for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)):
assert pred == pred_scripted
print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}")
####################################
# %%
# Since the model is scripted, it can be easily dumped to disk and re-used
import tempfile
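# A minimal sketch of what dumping and re-loading could look like; ``save`` and :func:`torch.jit.load`
# are the standard TorchScript (de)serialization entry points, and the file name is only illustrative:
with tempfile.TemporaryDirectory() as tmp_dir:
    scripted_path = str(Path(tmp_dir) / "scripted_predictor.pt")
    scripted_predictor.save(scripted_path)
    reloaded_predictor = torch.jit.load(scripted_path)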
......
......@@ -50,7 +50,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
plt.tight_layout()
####################################
# %%
# Geometric Transforms
# --------------------
# Geometric image transformation refers to the process of altering the geometric properties of an image,
......@@ -65,7 +65,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)]
plot(padded_imgs)
####################################
# %%
# Resize
# ~~~~~~
# The :class:`~torchvision.transforms.Resize` transform
......@@ -74,7 +74,7 @@ plot(padded_imgs)
resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot(resized_imgs)
####################################
# %%
# CenterCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.CenterCrop` transform
......@@ -83,7 +83,7 @@ plot(resized_imgs)
center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot(center_crops)
####################################
# %%
# FiveCrop
# ~~~~~~~~
# The :class:`~torchvision.transforms.FiveCrop` transform
......@@ -92,7 +92,7 @@ plot(center_crops)
(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img)
plot([top_left, top_right, bottom_left, bottom_right, center])
####################################
# %%
# RandomPerspective
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPerspective` transform
......@@ -102,7 +102,7 @@ perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0)
perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)]
plot(perspective_imgs)
####################################
# %%
# RandomRotation
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomRotation` transform
......@@ -112,7 +112,7 @@ rotater = T.RandomRotation(degrees=(0, 180))
rotated_imgs = [rotater(orig_img) for _ in range(4)]
plot(rotated_imgs)
####################################
# %%
# RandomAffine
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAffine` transform
......@@ -122,7 +122,7 @@ affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale
affine_imgs = [affine_transfomer(orig_img) for _ in range(4)]
plot(affine_imgs)
####################################
# %%
# ElasticTransform
# ~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.ElasticTransform` transform
......@@ -133,7 +133,7 @@ elastic_transformer = T.ElasticTransform(alpha=250.0)
transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)]
plot(transformed_imgs)
####################################
# %%
# RandomCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomCrop` transform
......@@ -143,7 +143,7 @@ cropper = T.RandomCrop(size=(128, 128))
crops = [cropper(orig_img) for _ in range(4)]
plot(crops)
####################################
# %%
# RandomResizedCrop
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomResizedCrop` transform
......@@ -154,7 +154,7 @@ resize_cropper = T.RandomResizedCrop(size=(32, 32))
resized_crops = [resize_cropper(orig_img) for _ in range(4)]
plot(resized_crops)
####################################
# %%
# Photometric Transforms
# ----------------------
# Photometric image transformation refers to the process of modifying the photometric properties of an image,
......@@ -174,7 +174,7 @@ plot(resized_crops)
gray_img = T.Grayscale()(orig_img)
plot([gray_img], cmap='gray')
####################################
# %%
# ColorJitter
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.ColorJitter` transform
......@@ -183,7 +183,7 @@ jitter = T.ColorJitter(brightness=.5, hue=.3)
jitted_imgs = [jitter(orig_img) for _ in range(4)]
plot(jitted_imgs)
####################################
# %%
# GaussianBlur
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.GaussianBlur` transform
......@@ -193,7 +193,7 @@ blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))
blurred_imgs = [blurrer(orig_img) for _ in range(4)]
plot(blurred_imgs)
####################################
# %%
# RandomInvert
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomInvert` transform
......@@ -203,7 +203,7 @@ inverter = T.RandomInvert()
invertered_imgs = [inverter(orig_img) for _ in range(4)]
plot(invertered_imgs)
####################################
# %%
# RandomPosterize
# ~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPosterize` transform
......@@ -214,7 +214,7 @@ posterizer = T.RandomPosterize(bits=2)
posterized_imgs = [posterizer(orig_img) for _ in range(4)]
plot(posterized_imgs)
####################################
# %%
# RandomSolarize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomSolarize` transform
......@@ -225,7 +225,7 @@ solarizer = T.RandomSolarize(threshold=192.0)
solarized_imgs = [solarizer(orig_img) for _ in range(4)]
plot(solarized_imgs)
####################################
# %%
# RandomAdjustSharpness
# ~~~~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAdjustSharpness` transform
......@@ -235,7 +235,7 @@ sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2)
sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)]
plot(sharpened_imgs)
####################################
# %%
# RandomAutocontrast
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAutocontrast` transform
......@@ -245,7 +245,7 @@ autocontraster = T.RandomAutocontrast()
autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)]
plot(autocontrasted_imgs)
####################################
# %%
# RandomEqualize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomEqualize` transform
......@@ -255,7 +255,7 @@ equalizer = T.RandomEqualize()
equalized_imgs = [equalizer(orig_img) for _ in range(4)]
plot(equalized_imgs)
####################################
# %%
# Augmentation Transforms
# -----------------------
# The following transforms are combinations of multiple transforms,
......@@ -275,7 +275,7 @@ imgs = [
row_title = [str(policy).split('.')[-1] for policy in policies]
plot(imgs, row_title=row_title)
####################################
# %%
# RandAugment
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandAugment` transform is an alternative version of AutoAugment.
......@@ -283,7 +283,7 @@ augmenter = T.RandAugment()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
####################################
# %%
# TrivialAugmentWide
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.TrivialAugmentWide` transform is an alternative implementation of AutoAugment.
......@@ -293,7 +293,7 @@ augmenter = T.TrivialAugmentWide()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
####################################
# %%
# AugMix
# ~~~~~~
# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image.
......@@ -301,7 +301,7 @@ augmenter = T.AugMix()
imgs = [augmenter(orig_img) for _ in range(4)]
plot(imgs)
####################################
# %%
# Randomly-applied Transforms
# ---------------------------
#
......@@ -318,7 +318,7 @@ hflipper = T.RandomHorizontalFlip(p=0.5)
transformed_imgs = [hflipper(orig_img) for _ in range(4)]
plot(transformed_imgs)
####################################
# %%
# RandomVerticalFlip
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomVerticalFlip` transform
......@@ -328,7 +328,7 @@ vflipper = T.RandomVerticalFlip(p=0.5)
transformed_imgs = [vflipper(orig_img) for _ in range(4)]
plot(transformed_imgs)
####################################
# %%
# RandomApply
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomApply` transform
......
......@@ -36,7 +36,7 @@ def load_data():
return path, image, bounding_boxes, masks, labels
########################################################################################################################
# %%
# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation
# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object
# detection or instance and semantic segmentation. Still, the interface is the same, making
......@@ -55,7 +55,7 @@ transform = transforms.Compose(
]
)
########################################################################################################################
# %%
# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that
# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or
# order.
......@@ -70,7 +70,7 @@ new_image, new_bounding_boxes, new_masks, new_labels = transform(
) # Instance Segmentation
new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels})) # Arbitrary Structure
########################################################################################################################
# %%
# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the
# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note, however, that as a
# regular user, you likely don't have to touch this yourself. See
......@@ -84,7 +84,7 @@ new_sample = transform(sample)
assert new_sample["path"] is sample["path"]
########################################################################################################################
# %%
# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
# simple heuristic:
......
......@@ -47,7 +47,7 @@ from torchvision import models, datasets
import torchvision.transforms.v2 as transforms
########################################################################################################################
# %%
# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
# returns, and we'll see how to convert it to a format that is compatible with our new transforms.
......@@ -67,7 +67,7 @@ print(type(image))
print(type(target), type(target[0]), list(target[0].keys()))
########################################################################################################################
# %%
# The dataset returns a two-tuple with the first item being a :class:`PIL.Image.Image` and the second one a list of
# dictionaries, each containing the annotations for a single object instance. As is, this format is compatible neither
# with ``torchvision.transforms.v2`` nor with the models. To overcome that, we provide the
......@@ -85,13 +85,13 @@ print(type(image))
print(type(target), list(target.keys()))
print(type(target["boxes"]), type(target["labels"]))
########################################################################################################################
# %%
# As a baseline, let's have a look at a sample without transformations:
show(sample)
########################################################################################################################
# %%
# With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in
# ``torchvision.transforms`` v1, but now handles bounding boxes and masks without any extra configuration.
......@@ -107,7 +107,7 @@ transform = transforms.Compose(
]
)
########################################################################################################################
# %%
# .. note::
#    Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` transform is a no-op in this example, it
# should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
......@@ -126,7 +126,7 @@ sample = dataset[0]
show(sample)
########################################################################################################################
# %%
# We can see that the color of the image was distorted, that we zoomed out on it (off center), and that it was flipped horizontally.
# In all of this, the bounding box was transformed accordingly. And without any further ado, we can start training.
......
......@@ -7,14 +7,14 @@ This example illustrates some of the APIs that torchvision offers for
videos, together with the examples on how to build datasets and more.
"""
####################################
# %%
# 1. Introduction: building a new video object and examining the properties
# -------------------------------------------------------------------------
# First we select a video to test the object out. For the sake of argument
# we're using one from the kinetics400 dataset.
# To create it, we need to define the path and the stream we want to use.
######################################
# %%
# Chosen video statistics:
#
# - WUzgd7C1pWA.mp4
......@@ -42,7 +42,7 @@ download_url(
)
video_path = "./WUzgd7C1pWA.mp4"
######################################
# %%
# Streams are defined in a similar fashion to torch devices. We encode them as strings in the form
# ``stream_type:stream_id``, where ``stream_type`` is a string and ``stream_id`` a long int.
# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered.
......@@ -52,7 +52,7 @@ stream = "video"
video = torchvision.io.VideoReader(video_path, stream)
video.get_metadata()
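# A stream can also be addressed explicitly by its id; a minimal sketch following the
# ``stream_type:stream_id`` convention described above (``"video:0"`` selects the first video stream):
video_first_stream = torchvision.io.VideoReader(video_path, "video:0")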
######################################
# %%
# Here we can see that the video has two streams - a video and an audio stream.
# Currently available stream types include ['video', 'audio'].
# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id
......@@ -61,7 +61,7 @@ video.get_metadata()
# users can access the one they want.
# If only the stream type is passed, the decoder auto-detects the first stream of that type and returns it.
######################################
# %%
# Let's read all the frames from the video stream. By default, the return value of
# ``next(video_reader)`` is a dict containing the following fields.
#
......@@ -85,7 +85,7 @@ approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0]
print("Approx total number of datapoints we can expect: ", approx_nf)
print("Read data size: ", frames[0].size(0) * len(frames))
######################################
# %%
# But what if we only want to read a certain time segment of the video?
# That can be done easily by combining our ``seek`` function with the fact that each call
# to next returns the presentation timestamp of the returned frame in seconds.
......@@ -107,7 +107,7 @@ for frame, pts in itertools.islice(video.seek(2), 10):
print("Total number of frames: ", len(frames))
######################################
# %%
# Or, if we wanted to read from the 2nd to the 5th second:
# we seek into the 2nd second of the video,
# then we utilize ``itertools.takewhile`` to get the
......@@ -125,7 +125,7 @@ approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
print("We can expect approx: ", approx_nf)
print("Tensor size: ", frames[0].size())
####################################
# %%
# 2. Building a sample read_video function
# ----------------------------------------------------------------------------------------
# We can utilize the methods above to build the read video function that follows
......@@ -170,21 +170,21 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au
vf, af, info, meta = example_read_video(video)
print(vf.size(), af.size())
####################################
# %%
# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400)
# -------------------------------------------------------------------------------------------------------
# Cool, so now we can use the same principle to make the sample dataset.
# We suggest trying out an iterable dataset for this purpose.
# Here, we are going to build an example dataset that reads 10 randomly selected frames of video.
####################################
# %%
# Make sample dataset
import os
os.makedirs("./dataset", exist_ok=True)
os.makedirs("./dataset/1", exist_ok=True)
os.makedirs("./dataset/2", exist_ok=True)
####################################
# %%
# Download the videos
from torchvision.datasets.utils import download_url
download_url(
......@@ -212,7 +212,7 @@ download_url(
"v_SoccerJuggling_g24_c01.avi"
)
####################################
# %%
# Housekeeping and utilities
import os
import random
......@@ -232,7 +232,7 @@ def get_samples(root, extensions=(".mp4", ".avi")):
_, class_to_idx = _find_classes(root)
return make_dataset(root, class_to_idx, extensions=extensions)
####################################
# %%
# We are going to define the dataset and some basic arguments.
# We assume the structure of the FolderDataset, and add the following parameters:
#
......@@ -287,7 +287,7 @@ class RandomDataset(torch.utils.data.IterableDataset):
'end': current_pts}
yield output
####################################
# %%
# Given a path of videos in a folder structure, i.e.:
#
# - dataset
......@@ -309,7 +309,7 @@ frame_transform = t.Compose(transforms)
dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
####################################
# %%
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=12)
data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
......@@ -321,7 +321,7 @@ for batch in loader:
data['tensorsize'].append(batch['video'][i].size())
print(data)
####################################
# %%
# 4. Data Visualization
# ----------------------------------
# Example of visualized video
......@@ -334,7 +334,7 @@ for i in range(16):
plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
plt.axis("off")
####################################
# %%
# Clean up the video and dataset:
import os
import shutil
......
......@@ -30,7 +30,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
####################################
# %%
# Visualizing a grid of images
# ----------------------------
# The :func:`~torchvision.utils.make_grid` function can be used to create a
......@@ -48,7 +48,7 @@ dog_list = [dog1_int, dog2_int]
grid = make_grid(dog_list)
show(grid)
####################################
# %%
# Visualizing bounding boxes
# --------------------------
# We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
......@@ -64,7 +64,7 @@ result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5)
show(result)
#####################################
# %%
# Naturally, we can also plot bounding boxes produced by torchvision detection
# models. Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
......@@ -85,7 +85,7 @@ model = model.eval()
outputs = model(images)
print(outputs)
#####################################
# %%
# Let's plot the boxes detected by our model. We will only plot the boxes with a
# score greater than a given threshold.
......@@ -96,7 +96,7 @@ dogs_with_boxes = [
]
show(dogs_with_boxes)
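# A sketch of the thresholding described above; 0.8 is an arbitrary score threshold chosen for
# illustration, and ``outputs`` is the Faster R-CNN output from the previous cell:
score_threshold = .8
boxes_above_threshold = [
    draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > score_threshold], width=4)
    for dog_int, output in zip(dog_list, outputs)
]
show(boxes_above_threshold)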
#####################################
# %%
# Visualizing segmentation masks
# ------------------------------
# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
......@@ -125,7 +125,7 @@ batch = torch.stack([transforms(d) for d in dog_list])
output = model(batch)['out']
print(output.shape, output.min().item(), output.max().item())
#####################################
# %%
# As we can see above, the output of the segmentation model is a tensor of shape
# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and
# we can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
......@@ -147,7 +147,7 @@ dog_and_boat_masks = [
show(dog_and_boat_masks)
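# The normalization step described above presumably boils down to a softmax over the class dimension;
# a minimal sketch (``output`` is the segmentation output computed earlier):
normalized_masks = torch.nn.functional.softmax(output, dim=1)
print(normalized_masks.shape, normalized_masks.min().item(), normalized_masks.max().item())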
#####################################
# %%
# As expected, the model is confident about the dog class, but not so much for
# the boat class.
#
......@@ -162,7 +162,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
show([m.float() for m in boolean_dog_masks])
#####################################
# %%
# The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
# can read it as the following query: "For which pixels is 'dog' the most likely
# class?"
......@@ -184,7 +184,7 @@ dogs_with_masks = [
]
show(dogs_with_masks)
#####################################
# %%
# We can plot more than one mask per image! Remember that the model returned as
# many masks as there are classes. Let's ask the same query as above, but this
# time for *all* classes, not just the dog class: "For each pixel and each class
......@@ -204,7 +204,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a
dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6)
show(dog_with_all_masks)
#####################################
# %%
# We can see in the image above that only 2 masks were drawn: the mask for the
# background and the mask for the dog. This is because the model thinks that
# only these 2 classes are the most likely ones across all the pixels. If the
......@@ -231,7 +231,7 @@ dogs_with_masks = [
show(dogs_with_masks)
#####################################
# %%
# .. _instance_seg_output:
#
# Instance segmentation models
......@@ -265,7 +265,7 @@ model = model.eval()
output = model(images)
print(output)
#####################################
# %%
# Let's break this down. For each image in the batch, the model outputs some
# detections (or instances). The number of detections varies for each input
# image. Each instance is described by its bounding box, its label, its score
......@@ -288,7 +288,7 @@ dog1_masks = dog1_output['masks']
print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
f"min = {dog1_masks.min()}, max = {dog1_masks.max()}")
#####################################
# %%
# Here the masks correspond to probabilities indicating, for each pixel, how
# likely it is to belong to the predicted label of that instance. Those
# predicted labels correspond to the 'labels' element in the same output dict.
......@@ -297,7 +297,7 @@ print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
print("For the first dog, the following instances were detected:")
print([weights.meta["categories"][label] for label in dog1_output['labels']])
#####################################
# %%
# Interestingly, the model detects two persons in the image. Let's go ahead and
# plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
# expects boolean masks, we need to convert those probabilities into boolean
......@@ -315,14 +315,14 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1)
show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9))
#####################################
# %%
# The model seems to have properly detected the dog, but it also confused trees
# with people. Looking more closely at the scores will help us plot more
# relevant masks:
print(dog1_output['scores'])
#####################################
# %%
# Clearly the model is more confident about the dog detection than it is about
# the people detections. That's good news. When plotting the masks, we can ask
# for only those that have a good score. Let's use a score threshold of .75
......@@ -341,12 +341,12 @@ dogs_with_masks = [
]
show(dogs_with_masks)
#####################################
# %%
# The two 'people' masks in the first image were not selected because they have
# a lower score than the score threshold. Similarly, in the second image, the
# instance with class 15 (which corresponds to 'bench') was not selected.
#####################################
# %%
# .. _keypoint_output:
#
# Visualizing keypoints
......@@ -373,7 +373,7 @@ model = model.eval()
outputs = model([person_float])
print(outputs)
#####################################
# %%
# As we can see, the output contains a list of dictionaries.
# The output list is of length batch_size.
# We currently have just a single image, so the length of the list is 1.
......@@ -388,7 +388,7 @@ scores = outputs[0]['scores']
print(kpts)
print(scores)
#####################################
# %%
# The KeypointRCNN model detects that there are two instances in the image.
# If you plot the boxes using :func:`~draw_bounding_boxes`,
# you will recognize that they are the person and the surfboard.
......@@ -402,7 +402,7 @@ keypoints = kpts[idx]
print(keypoints)
#####################################
# %%
# Great, now we have the keypoints corresponding to the person.
# Each keypoint is represented by x, y coordinates and the visibility.
# We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints.
......@@ -413,7 +413,7 @@ from torchvision.utils import draw_keypoints
res = draw_keypoints(person_int, keypoints, colors="blue", radius=3)
show(res)
#####################################
# %%
# As we can see, the keypoints appear as colored circles over the image.
# The COCO keypoints for a person are ordered and represent the following list.\
......@@ -424,7 +424,7 @@ coco_keypoints = [
"left_knee", "right_knee", "left_ankle", "right_ankle",
]
#####################################
# %%
# What if we are interested in joining the keypoints?
# This is especially useful for pose detection or action recognition.
# We can join the keypoints easily using the `connectivity` parameter.
......@@ -450,7 +450,7 @@ connect_skeleton = [
(7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16)
]
#####################################
# %%
# We pass the above list to the connectivity parameter to connect the keypoints.
#
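# A minimal sketch of such a call; the ``radius`` and ``width`` values are only illustrative:
res = draw_keypoints(
    person_int, keypoints, connectivity=connect_skeleton, colors="blue", radius=4, width=3
)
show(res)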
......