OpenDAS / vision · Commits · Commit 408917d1 (Unverified)

Use `# %%` syntax for gallery examples (#7793)

Authored Aug 02, 2023 by Nicolas Hug; committed by GitHub on Aug 02, 2023.
Parent: a893f313

Showing 9 changed files with 110 additions and 110 deletions (+110 -110).
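The change is purely mechanical. sphinx-gallery splits an example script into notebook-style cells at separator lines, and it accepts either a long rule of `#` characters or the `# %%` marker; the latter is also recognized as a cell boundary by Jupyter, VS Code, Spyder, and PyCharm, so the examples can be run cell by cell in an editor. Every hunk in this commit is the same one-line substitution, sketched here:

    # Before: a hash rule separates two gallery cells
    ####################################
    # Reading Videos Using Torchvision
    # --------------------------------

    # After: the equivalent, editor-friendly cell marker
    # %%
    # Reading Videos Using Torchvision
    # --------------------------------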
Files changed:

    gallery/plot_datapoints.py                  +9   -9
    gallery/plot_optical_flow.py                +8   -8
    gallery/plot_repurposing_annotations.py     +10  -10
    gallery/plot_scripted_tensor_transforms.py  +6   -6
    gallery/plot_transforms.py                  +26  -26
    gallery/plot_transforms_v2.py               +4   -4
    gallery/plot_transforms_v2_e2e.py           +6   -6
    gallery/plot_video_api.py                   +17  -17
    gallery/plot_visualization_utils.py         +24  -24
gallery/plot_datapoints.py

@@ -23,7 +23,7 @@ from torchvision import datapoints
 from torchvision.transforms.v2 import functional as F


-########################################################################################################################
+# %%
 # What are datapoints?
 # --------------------
 #

@@ -36,7 +36,7 @@ assert isinstance(image, torch.Tensor)
 assert image.data_ptr() == tensor.data_ptr()


-########################################################################################################################
+# %%
 # Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
 # for the input data.
 #

@@ -59,7 +59,7 @@ image = datapoints.Image([[[[0, 1], [1, 0]]]])
 print(image)


-########################################################################################################################
+# %%
 # Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
 # parameters.

@@ -67,14 +67,14 @@ float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires
 print(float_image)


-########################################################################################################################
+# %%
 # In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a
 # :class:`PIL.Image.Image` directly:

 image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg"))
 print(image.shape, image.dtype)


-########################################################################################################################
+# %%
 # In general, the datapoints can also store additional metadata that complements the underlying tensor. For example,
 # :class:`~torchvision.datapoints.BoundingBoxes` stores the coordinate format as well as the spatial size of the
 # corresponding image alongside the actual values:
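For context, a minimal sketch of the kind of cell this hunk touches (API as of this commit; in later releases ``datapoints`` became ``tv_tensors`` and the ``spatial_size`` keyword became ``canvas_size``, so treat the exact keyword as an assumption):

    import torch
    from torchvision import datapoints

    bounding_box = datapoints.BoundingBoxes(
        [[17, 16, 344, 495]],
        format=datapoints.BoundingBoxFormat.XYXY,
        spatial_size=(512, 512),  # assumed keyword: (height, width) of the image
    )
    print(bounding_box)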
@@ -85,7 +85,7 @@ bounding_box = datapoints.BoundingBoxes(
 print(bounding_box)


-########################################################################################################################
+# %%
 # Do I have to wrap the output of the datasets myself?
 # ----------------------------------------------------
 #

@@ -120,7 +120,7 @@ class PennFudanDataset(torch.utils.data.Dataset):
         ...


-########################################################################################################################
+# %%
 # 2. Perform the wrapping inside a custom transformation at the beginning of your pipeline:

@@ -144,7 +144,7 @@ def get_transform(train):
     transforms.append(T.PILToTensor())
     ...


-########################################################################################################################
+# %%
 # .. note::
 #
 #    If both :class:`~torchvision.datapoints.BoundingBoxes`'es and :class:`~torchvision.datapoints.Mask`'s are included in

@@ -171,7 +171,7 @@ new_image = image + 0
 assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image)


-########################################################################################################################
+# %%
 # .. note::
 #
 #    This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you
gallery/plot_optical_flow.py

@@ -42,7 +42,7 @@ def plot(imgs, **imshow_kwargs):
     plt.tight_layout()


-###################################
+# %%
 # Reading Videos Using Torchvision
 # --------------------------------
 # We will first read a video using :func:`~torchvision.io.read_video`.

@@ -62,7 +62,7 @@ video_url = "https://download.pytorch.org/tutorial/pexelscom_pavel_danilyuk_bask
 video_path = Path(tempfile.mkdtemp()) / "basketball.mp4"
 _ = urlretrieve(video_url, video_path)


-#########################
+# %%
 # :func:`~torchvision.io.read_video` returns the video frames, audio frames and
 # the metadata associated with the video. In our case, we only need the video
 # frames.

@@ -79,7 +79,7 @@ img2_batch = torch.stack([frames[101], frames[151]])
 plot(img1_batch)


-#########################
+# %%
 # The RAFT model accepts RGB images. We first get the frames from
 # :func:`~torchvision.io.read_video` and resize them to ensure their dimensions
 # are divisible by 8. Note that we explicitly use ``antialias=False``, because
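The ``preprocess`` call in the next hunk is defined in elided lines; based on the description above, it plausibly looks like this sketch (the exact target size is an assumption; 520x960 is divisible by 8 as RAFT requires):

    import torchvision.transforms.functional as F
    from torchvision.models.optical_flow import Raft_Large_Weights

    weights = Raft_Large_Weights.DEFAULT
    transforms = weights.transforms()  # the preprocessing bundled with the weights

    def preprocess(img1_batch, img2_batch):
        # resize so that H and W are divisible by 8, without antialiasing
        img1_batch = F.resize(img1_batch, size=[520, 960], antialias=False)
        img2_batch = F.resize(img2_batch, size=[520, 960], antialias=False)
        return transforms(img1_batch, img2_batch)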
@@ -104,7 +104,7 @@ img1_batch, img2_batch = preprocess(img1_batch, img2_batch)
 print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}")


-####################################
+# %%
 # Estimating Optical flow using RAFT
 # ----------------------------------
 # We will use our RAFT implementation from

@@ -125,7 +125,7 @@ list_of_flows = model(img1_batch.to(device), img2_batch.to(device))
 print(f"type = {type(list_of_flows)}")
 print(f"length = {len(list_of_flows)} = number of iterations of the model")

-####################################
+# %%
 # The RAFT model outputs lists of predicted flows where each entry is a
 # (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration"
 # in the model. For more details on the iterative nature of the model, please

@@ -144,7 +144,7 @@ print(f"shape = {predicted_flows.shape} = (N, 2, H, W)")
 print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}")


-####################################
+# %%
 # Visualizing predicted flows
 # ---------------------------
 # Torchvision provides the :func:`~torchvision.utils.flow_to_image` utlity to

@@ -166,7 +166,7 @@ img1_batch = [(img1 + 1) / 2 for img1 in img1_batch]
 grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)]
 plot(grid)

-####################################
+# %%
 # Bonus: Creating GIFs of predicted flows
 # ---------------------------------------
 # In the example above we have only shown the predicted flows of 2 pairs of
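The ``flow_imgs`` zipped into the grid above come from :func:`~torchvision.utils.flow_to_image`; the elided call is roughly this sketch:

    from torchvision.utils import flow_to_image

    # (N, 2, H, W) float flows -> (N, 3, H, W) uint8 RGB visualizations
    flow_imgs = flow_to_image(predicted_flows)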
@@ -187,7 +187,7 @@ plot(grid)
 #     output_folder = "/tmp/"  # Update this to the folder of your choice
 #     write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg")

-####################################
+# %%
 # Once the .jpg flow images are saved, you can convert them into a video or a
 # GIF using ffmpeg with e.g.:
 #
gallery/plot_repurposing_annotations.py

@@ -36,7 +36,7 @@ def show(imgs):
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])


-####################################
+# %%
 # Masks
 # -----
 # In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package,

@@ -53,7 +53,7 @@ def show(imgs):
 # A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object
 # localization tasks.

-####################################
+# %%
 # Converting Masks to Bounding Boxes
 # -----------------------------------------------
 # For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to
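As a self-contained illustration of the operation this section describes (hypothetical masks, not the tutorial's data):

    import torch
    from torchvision.ops import masks_to_boxes

    masks = torch.zeros(3, 200, 300, dtype=torch.bool)  # three instance masks
    masks[0, 10:50, 20:80] = True
    masks[1, 60:120, 40:90] = True
    masks[2, 100:180, 150:260] = True

    boxes = masks_to_boxes(masks)  # shape (3, 4), in (xmin, ymin, xmax, ymax)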
@@ -70,7 +70,7 @@ img = read_image(img_path)
 mask = read_image(mask_path)


-#########################
+# %%
 # Here the masks are represented as a PNG Image, with floating point values.
 # Each pixel is encoded as different colors, with 0 being background.
 # Notice that the spatial dimensions of image and mask match.

@@ -79,7 +79,7 @@ print(mask.size())
 print(img.size())
 print(mask)

-############################
+# %%
 # We get the unique colors, as these would be the object ids.
 obj_ids = torch.unique(mask)

@@ -91,7 +91,7 @@ obj_ids = obj_ids[1:]
 # Note that this snippet would work as well if the masks were float values instead of ints.
 masks = mask == obj_ids[:, None, None]

-########################
+# %%
 # Now the masks are a boolean tensor.
 # The first dimension in this case 3 and denotes the number of instances: there are 3 people in the image.
 # The other two dimensions are height and width, which are equal to the dimensions of the image.
@@ -101,7 +101,7 @@ masks = mask == obj_ids[:, None, None]
 print(masks.size())
 print(masks)

-####################################
+# %%
 # Let us visualize an image and plot its corresponding segmentation masks.
 # We will use the :func:`~torchvision.utils.draw_segmentation_masks` to draw the segmentation masks.

@@ -113,7 +113,7 @@ for mask in masks:
 show(drawn_masks)

-####################################
+# %%
 # To convert the boolean masks into bounding boxes.
 # We will use the :func:`~torchvision.ops.masks_to_boxes` from the torchvision.ops module
 # It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format.

@@ -124,7 +124,7 @@ boxes = masks_to_boxes(masks)
 print(boxes.size())
 print(boxes)

-####################################
+# %%
 # As the shape denotes, there are 3 boxes and in ``(xmin, ymin, xmax, ymax)`` format.
 # These can be visualized very easily with :func:`~torchvision.utils.draw_bounding_boxes` utility
 # provided in :ref:`torchvision.utils <utils>`.

@@ -134,7 +134,7 @@ from torchvision.utils import draw_bounding_boxes
 drawn_boxes = draw_bounding_boxes(img, boxes, colors="red")
 show(drawn_boxes)

-###################################
+# %%
 # These boxes can now directly be used by detection models in torchvision.
 # Here is demo with a Faster R-CNN model loaded from
 # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`

@@ -153,7 +153,7 @@ target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64)
 detection_outputs = model(img.unsqueeze(0), [target])


-####################################
+# %%
 # Converting Segmentation Dataset to Detection Dataset
 # ----------------------------------------------------
 #
gallery/plot_scripted_tensor_transforms.py

@@ -45,7 +45,7 @@ def show(imgs):
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])


-###################################
+# %%
 # The :func:`~torchvision.io.read_image` function allows to read an image and
 # directly load it as a tensor

@@ -53,7 +53,7 @@ dog1 = read_image(str(Path('assets') / 'dog1.jpg'))
 dog2 = read_image(str(Path('assets') / 'dog2.jpg'))
 show([dog1, dog2])

-###################################
+# %%
 # Transforming images on GPU
 # --------------------------
 # Most transforms natively support tensors on top of PIL images (to visualize

@@ -76,7 +76,7 @@ transformed_dog1 = transforms(dog1)
 transformed_dog2 = transforms(dog2)
 show([transformed_dog1, transformed_dog2])

-###################################
+# %%
 # Scriptable transforms for easier deployment via torchscript
 # -----------------------------------------------------------
 # We now show how to combine image transformations and a model forward pass,
@@ -103,7 +103,7 @@ class Predictor(nn.Module):
         return y_pred.argmax(dim=1)


-###################################
+# %%
 # Now, let's define scripted and non-scripted instances of ``Predictor`` and
 # apply it on multiple tensor images of the same size
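The instantiation itself sits in elided lines; it plausibly looks like this sketch (device handling assumed; ``Predictor`` is the module defined above):

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    predictor = Predictor().to(device)
    scripted_predictor = torch.jit.script(predictor).to(device)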
@@ -115,7 +115,7 @@ batch = torch.stack([dog1, dog2]).to(device)
 res = predictor(batch)
 res_scripted = scripted_predictor(batch)

-###################################
+# %%
 # We can verify that the prediction of the scripted and non-scripted models are
 # the same:

@@ -128,7 +128,7 @@ for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)):
     assert pred == pred_scripted
     print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}")

-###################################
+# %%
 # Since the model is scripted, it can be easily dumped on disk and re-used
 import tempfile
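The save/reload cell that follows is elided; a sketch of the usual pattern (file handling assumed):

    import tempfile
    import torch

    with tempfile.NamedTemporaryFile(suffix=".pt") as f:
        scripted_predictor.save(f.name)  # serializes code and weights together
        dumped_predictor = torch.jit.load(f.name)
        assert (dumped_predictor(batch) == scripted_predictor(batch)).all()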
gallery/plot_transforms.py

@@ -50,7 +50,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
     plt.tight_layout()


-###################################
+# %%
 # Geometric Transforms
 # --------------------
 # Geometric image transformation refers to the process of altering the geometric properties of an image,

@@ -65,7 +65,7 @@ def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
 padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)]
 plot(padded_imgs)

-###################################
+# %%
 # Resize
 # ~~~~~~
 # The :class:`~torchvision.transforms.Resize` transform

@@ -74,7 +74,7 @@ plot(padded_imgs)
 resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
 plot(resized_imgs)

-###################################
+# %%
 # CenterCrop
 # ~~~~~~~~~~
 # The :class:`~torchvision.transforms.CenterCrop` transform

@@ -83,7 +83,7 @@ plot(resized_imgs)
 center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
 plot(center_crops)

-###################################
+# %%
 # FiveCrop
 # ~~~~~~~~
 # The :class:`~torchvision.transforms.FiveCrop` transform

@@ -92,7 +92,7 @@ plot(center_crops)
 (top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img)
 plot([top_left, top_right, bottom_left, bottom_right, center])

-###################################
+# %%
 # RandomPerspective
 # ~~~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomPerspective` transform

@@ -102,7 +102,7 @@ perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0)
 perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)]
 plot(perspective_imgs)

-###################################
+# %%
 # RandomRotation
 # ~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomRotation` transform

@@ -112,7 +112,7 @@ rotater = T.RandomRotation(degrees=(0, 180))
 rotated_imgs = [rotater(orig_img) for _ in range(4)]
 plot(rotated_imgs)

-###################################
+# %%
 # RandomAffine
 # ~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomAffine` transform

@@ -122,7 +122,7 @@ affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale
 affine_imgs = [affine_transfomer(orig_img) for _ in range(4)]
 plot(affine_imgs)

-###################################
+# %%
 # ElasticTransform
 # ~~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.ElasticTransform` transform

@@ -133,7 +133,7 @@ elastic_transformer = T.ElasticTransform(alpha=250.0)
 transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)]
 plot(transformed_imgs)

-###################################
+# %%
 # RandomCrop
 # ~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomCrop` transform

@@ -143,7 +143,7 @@ cropper = T.RandomCrop(size=(128, 128))
 crops = [cropper(orig_img) for _ in range(4)]
 plot(crops)

-###################################
+# %%
 # RandomResizedCrop
 # ~~~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomResizedCrop` transform

@@ -154,7 +154,7 @@ resize_cropper = T.RandomResizedCrop(size=(32, 32))
 resized_crops = [resize_cropper(orig_img) for _ in range(4)]
 plot(resized_crops)

-###################################
+# %%
 # Photometric Transforms
 # ----------------------
 # Photometric image transformation refers to the process of modifying the photometric properties of an image,
@@ -174,7 +174,7 @@ plot(resized_crops)
 gray_img = T.Grayscale()(orig_img)
 plot([gray_img], cmap='gray')

-###################################
+# %%
 # ColorJitter
 # ~~~~~~~~~~~
 # The :class:`~torchvision.transforms.ColorJitter` transform

@@ -183,7 +183,7 @@ jitter = T.ColorJitter(brightness=.5, hue=.3)
 jitted_imgs = [jitter(orig_img) for _ in range(4)]
 plot(jitted_imgs)

-###################################
+# %%
 # GaussianBlur
 # ~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.GaussianBlur` transform

@@ -193,7 +193,7 @@ blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))
 blurred_imgs = [blurrer(orig_img) for _ in range(4)]
 plot(blurred_imgs)

-###################################
+# %%
 # RandomInvert
 # ~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomInvert` transform

@@ -203,7 +203,7 @@ inverter = T.RandomInvert()
 invertered_imgs = [inverter(orig_img) for _ in range(4)]
 plot(invertered_imgs)

-###################################
+# %%
 # RandomPosterize
 # ~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomPosterize` transform

@@ -214,7 +214,7 @@ posterizer = T.RandomPosterize(bits=2)
 posterized_imgs = [posterizer(orig_img) for _ in range(4)]
 plot(posterized_imgs)

-###################################
+# %%
 # RandomSolarize
 # ~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomSolarize` transform

@@ -225,7 +225,7 @@ solarizer = T.RandomSolarize(threshold=192.0)
 solarized_imgs = [solarizer(orig_img) for _ in range(4)]
 plot(solarized_imgs)

-###################################
+# %%
 # RandomAdjustSharpness
 # ~~~~~~~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomAdjustSharpness` transform

@@ -235,7 +235,7 @@ sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2)
 sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)]
 plot(sharpened_imgs)

-###################################
+# %%
 # RandomAutocontrast
 # ~~~~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomAutocontrast` transform

@@ -245,7 +245,7 @@ autocontraster = T.RandomAutocontrast()
 autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)]
 plot(autocontrasted_imgs)

-###################################
+# %%
 # RandomEqualize
 # ~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomEqualize` transform

@@ -255,7 +255,7 @@ equalizer = T.RandomEqualize()
 equalized_imgs = [equalizer(orig_img) for _ in range(4)]
 plot(equalized_imgs)

-###################################
+# %%
 # Augmentation Transforms
 # -----------------------
 # The following transforms are combinations of multiple transforms,

@@ -275,7 +275,7 @@ imgs = [
 row_title = [str(policy).split('.')[-1] for policy in policies]
 plot(imgs, row_title=row_title)

-###################################
+# %%
 # RandAugment
 # ~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandAugment` is an alternate version of AutoAugment.

@@ -283,7 +283,7 @@ augmenter = T.RandAugment()
 imgs = [augmenter(orig_img) for _ in range(4)]
 plot(imgs)

-###################################
+# %%
 # TrivialAugmentWide
 # ~~~~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.TrivialAugmentWide` is an alternate implementation of AutoAugment.

@@ -293,7 +293,7 @@ augmenter = T.TrivialAugmentWide()
 imgs = [augmenter(orig_img) for _ in range(4)]
 plot(imgs)

-###################################
+# %%
 # AugMix
 # ~~~~~~
 # The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image.

@@ -301,7 +301,7 @@ augmenter = T.AugMix()
 imgs = [augmenter(orig_img) for _ in range(4)]
 plot(imgs)

-###################################
+# %%
 # Randomly-applied Transforms
 # ---------------------------
 #

@@ -318,7 +318,7 @@ hflipper = T.RandomHorizontalFlip(p=0.5)
 transformed_imgs = [hflipper(orig_img) for _ in range(4)]
 plot(transformed_imgs)

-###################################
+# %%
 # RandomVerticalFlip
 # ~~~~~~~~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomVerticalFlip` transform

@@ -328,7 +328,7 @@ vflipper = T.RandomVerticalFlip(p=0.5)
 transformed_imgs = [vflipper(orig_img) for _ in range(4)]
 plot(transformed_imgs)

-###################################
+# %%
 # RandomApply
 # ~~~~~~~~~~~
 # The :class:`~torchvision.transforms.RandomApply` transform
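For reference, :class:`~torchvision.transforms.RandomApply` wraps a list of transforms and applies the whole list with probability ``p``; a sketch following this file's own pattern (parameters chosen arbitrarily):

    applier = T.RandomApply(transforms=[T.RandomCrop(size=(64, 64))], p=0.5)
    transformed_imgs = [applier(orig_img) for _ in range(4)]
    plot(transformed_imgs)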
gallery/plot_transforms_v2.py

@@ -36,7 +36,7 @@ def load_data():
     return path, image, bounding_boxes, masks, labels


-########################################################################################################################
+# %%
 # The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation
 # masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object
 # detection or instance and semantic segmentation. Still, the interface is the same, making

@@ -55,7 +55,7 @@ transform = transforms.Compose(
     ]
 )

-########################################################################################################################
+# %%
 # :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that
 # potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or
 # order.
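To make "jointly transforming" concrete, a sketch using the datapoints-era API (the ``spatial_size`` keyword is an assumption for this commit's vintage; it became ``canvas_size`` in later releases):

    import torch
    import torchvision.transforms.v2 as transforms
    from torchvision import datapoints

    image = datapoints.Image(torch.randint(0, 256, (3, 100, 100), dtype=torch.uint8))
    boxes = datapoints.BoundingBoxes(
        [[10, 10, 30, 30]], format="XYXY", spatial_size=(100, 100)
    )

    # one set of random parameters is sampled and applied to both inputs
    flip = transforms.RandomHorizontalFlip(p=1.0)
    new_image, new_boxes = flip(image, boxes)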
@@ -70,7 +70,7 @@ new_image, new_bounding_boxes, new_masks, new_labels = transform(
 )

 # Instance Segmentation
 new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels}))

 # Arbitrary Structure
-########################################################################################################################
+# %%
 # Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the
 # appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note however, that as
 # regular user, you likely don't have to touch this yourself. See

@@ -84,7 +84,7 @@ new_sample = transform(sample)
 assert new_sample["path"] is sample["path"]

-########################################################################################################################
+# %%
 # As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
 # also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
 # simple heuristic:
gallery/plot_transforms_v2_e2e.py

@@ -47,7 +47,7 @@ from torchvision import models, datasets
 import torchvision.transforms.v2 as transforms


-########################################################################################################################
+# %%
 # We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
 # returns, and we'll see how to convert it to a format that is compatible with our new transforms.

@@ -67,7 +67,7 @@ print(type(image))
 print(type(target), type(target[0]), list(target[0].keys()))

-########################################################################################################################
+# %%
 # The dataset returns a two-tuple with the first item being a :class:`PIL.Image.Image` and second one a list of
 # dictionaries, which each containing the annotations for a single object instance. As is, this format is not compatible
 # with the ``torchvision.transforms.v2``, nor with the models. To overcome that, we provide the
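The sentence is completed in elided lines; the helper it introduces is :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2`, used roughly like this sketch (the paths are placeholders):

    from torchvision import datasets

    dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH)  # placeholder paths
    dataset = datasets.wrap_dataset_for_transforms_v2(dataset)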
@@ -85,13 +85,13 @@ print(type(image))
 print(type(target), list(target.keys()))
 print(type(target["boxes"]), type(target["labels"]))

-########################################################################################################################
+# %%
 # As baseline, let's have a look at a sample without transformations:

 show(sample)

-########################################################################################################################
+# %%
 # With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in
 # ``torchvision.transforms`` v1, but now handles bounding boxes and masks without any extra configuration.

@@ -107,7 +107,7 @@ transform = transforms.Compose(
     ]
 )

-########################################################################################################################
+# %%
 # .. note::
 #    Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` transform is a no-op in this example, but it
 #    should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as

@@ -126,7 +126,7 @@ sample = dataset[0]
 show(sample)

-########################################################################################################################
+# %%
 # We can see that the color of the image was distorted, we zoomed out on it (off center) and flipped it horizontally.
 # In all of this, the bounding box was transformed accordingly. And without any further ado, we can start training.
gallery/plot_video_api.py

@@ -7,14 +7,14 @@ This example illustrates some of the APIs that torchvision offers for
 videos, together with the examples on how to build datasets and more.
 """

-###################################
+# %%
 # 1. Introduction: building a new video object and examining the properties
 # -------------------------------------------------------------------------
 # First we select a video to test the object out. For the sake of argument
 # we're using one from kinetics400 dataset.
 # To create it, we need to define the path and the stream we want to use.

-#####################################
+# %%
 # Chosen video statistics:
 #
 # - WUzgd7C1pWA.mp4

@@ -42,7 +42,7 @@ download_url(
 )
 video_path = "./WUzgd7C1pWA.mp4"

-#####################################
+# %%
 # Streams are defined in a similar fashion as torch devices. We encode them as strings in a form
 # of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int.
 # The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered.
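A short sketch of the two equivalent ways to address a stream described here:

    import torchvision

    # auto-discover the first video stream ...
    video = torchvision.io.VideoReader(video_path, "video")
    # ... or address one explicitly as stream_type:stream_id
    video = torchvision.io.VideoReader(video_path, "video:0")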
@@ -52,7 +52,7 @@ stream = "video"
 video = torchvision.io.VideoReader(video_path, stream)
 video.get_metadata()

-#####################################
+# %%
 # Here we can see that video has two streams - a video and an audio stream.
 # Currently available stream types include ['video', 'audio'].
 # Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id

@@ -61,7 +61,7 @@ video.get_metadata()
 # users can access the one they want.
 # If only stream type is passed, the decoder auto-detects first stream of that type and returns it.

-#####################################
+# %%
 # Let's read all the frames from the video stream. By default, the return value of
 # ``next(video_reader)`` is a dict containing the following fields.
 #

@@ -85,7 +85,7 @@ approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0]
 print("Approx total number of datapoints we can expect: ", approx_nf)
 print("Read data size: ", frames[0].size(0) * len(frames))

-#####################################
+# %%
 # But what if we only want to read certain time segment of the video?
 # That can be done easily using the combination of our ``seek`` function, and the fact that each call
 # to next returns the presentation timestamp of the returned frame in seconds.

@@ -107,7 +107,7 @@ for frame, pts in itertools.islice(video.seek(2), 10):
 print("Total number of frames: ", len(frames))

-#####################################
+# %%
 # Or if we wanted to read from 2nd to 5th second,
 # We seek into a second second of the video,
 # then we utilize the itertools takewhile to get the
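The code this paragraph introduces is elided; the pattern it describes is roughly this sketch:

    import itertools

    # seek to the 2-second mark, then keep frames while pts <= 5 seconds
    frames = [
        frame["data"]
        for frame in itertools.takewhile(lambda x: x["pts"] <= 5, video.seek(2))
    ]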
@@ -125,7 +125,7 @@ approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
 print("We can expect approx: ", approx_nf)
 print("Tensor size: ", frames[0].size())

-###################################
+# %%
 # 2. Building a sample read_video function
 # ----------------------------------------------------------------------------------------
 # We can utilize the methods above to build the read video function that follows

@@ -170,21 +170,21 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au
 vf, af, info, meta = example_read_video(video)
 print(vf.size(), af.size())

-###################################
+# %%
 # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400)
 # -------------------------------------------------------------------------------------------------------
 # Cool, so now we can use the same principle to make the sample dataset.
 # We suggest trying out iterable dataset for this purpose.
 # Here, we are going to build an example dataset that reads randomly selected 10 frames of video.

-###################################
+# %%
 # Make sample dataset
 import os
 os.makedirs("./dataset", exist_ok=True)
 os.makedirs("./dataset/1", exist_ok=True)
 os.makedirs("./dataset/2", exist_ok=True)

-###################################
+# %%
 # Download the videos
 from torchvision.datasets.utils import download_url
 download_url(

@@ -212,7 +212,7 @@ download_url(
     "v_SoccerJuggling_g24_c01.avi"
 )

-###################################
+# %%
 # Housekeeping and utilities
 import os
 import random

@@ -232,7 +232,7 @@ def get_samples(root, extensions=(".mp4", ".avi")):
     _, class_to_idx = _find_classes(root)
     return make_dataset(root, class_to_idx, extensions=extensions)

-###################################
+# %%
 # We are going to define the dataset and some basic arguments.
 # We assume the structure of the FolderDataset, and add the following parameters:
 #

@@ -287,7 +287,7 @@ class RandomDataset(torch.utils.data.IterableDataset):
                 'end': current_pts}
             yield output

-###################################
+# %%
 # Given a path of videos in a folder structure, i.e:
 #
 # - dataset

@@ -309,7 +309,7 @@ frame_transform = t.Compose(transforms)
 dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)

-###################################
+# %%
 from torch.utils.data import DataLoader
 loader = DataLoader(dataset, batch_size=12)
 data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}

@@ -321,7 +321,7 @@ for batch in loader:
         data['tensorsize'].append(batch['video'][i].size())
 print(data)

-###################################
+# %%
 # 4. Data Visualization
 # ----------------------------------
 # Example of visualized video

@@ -334,7 +334,7 @@ for i in range(16):
     plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
     plt.axis("off")

-###################################
+# %%
 # Cleanup the video and dataset:
 import os
 import shutil
gallery/plot_visualization_utils.py

@@ -30,7 +30,7 @@ def show(imgs):
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])


-###################################
+# %%
 # Visualizing a grid of images
 # ----------------------------
 # The :func:`~torchvision.utils.make_grid` function can be used to create a

@@ -48,7 +48,7 @@ dog_list = [dog1_int, dog2_int]
 grid = make_grid(dog_list)
 show(grid)

-###################################
+# %%
 # Visualizing bounding boxes
 # --------------------------
 # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an

@@ -64,7 +64,7 @@ result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5)
 show(result)

-####################################
+# %%
 # Naturally, we can also plot bounding boxes produced by torchvision detection
 # models. Here is a demo with a Faster R-CNN model loaded from
 # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`

@@ -85,7 +85,7 @@ model = model.eval()
 outputs = model(images)
 print(outputs)

-####################################
+# %%
 # Let's plot the boxes detected by our model. We will only plot the boxes with a
 # score greater than a given threshold.

@@ -96,7 +96,7 @@ dogs_with_boxes = [
 ]
 show(dogs_with_boxes)

-####################################
+# %%
 # Visualizing segmentation masks
 # ------------------------------
 # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to

@@ -125,7 +125,7 @@ batch = torch.stack([transforms(d) for d in dog_list])
 output = model(batch)['out']
 print(output.shape, output.min().item(), output.max().item())

-####################################
+# %%
 # As we can see above, the output of the segmentation model is a tensor of shape
 # ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and
 # we can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
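The normalization step the text describes is a softmax over the class dimension:

    # each pixel's num_classes scores now sum to 1
    normalized_masks = torch.nn.functional.softmax(output, dim=1)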
@@ -147,7 +147,7 @@ dog_and_boat_masks = [
 show(dog_and_boat_masks)

-####################################
+# %%
 # As expected, the model is confident about the dog class, but not so much for
 # the boat class.
 #

@@ -162,7 +162,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
 show([m.float() for m in boolean_dog_masks])

-####################################
+# %%
 # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
 # can read it as the following query: "For which pixels is 'dog' the most likely
 # class?"

@@ -184,7 +184,7 @@ dogs_with_masks = [
 ]
 show(dogs_with_masks)

-####################################
+# %%
 # We can plot more than one mask per image! Remember that the model returned as
 # many masks as there are classes. Let's ask the same query as above, but this
 # time for *all* classes, not just the dog class: "For each pixel and each class

@@ -204,7 +204,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a
 dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6)
 show(dog_with_all_masks)

-####################################
+# %%
 # We can see in the image above that only 2 masks were drawn: the mask for the
 # background and the mask for the dog. This is because the model thinks that
 # only these 2 classes are the most likely ones across all the pixels. If the

@@ -231,7 +231,7 @@ dogs_with_masks = [
 show(dogs_with_masks)

-####################################
+# %%
 # .. _instance_seg_output:
 #
 # Instance segmentation models

@@ -265,7 +265,7 @@ model = model.eval()
 output = model(images)
 print(output)

-####################################
+# %%
 # Let's break this down. For each image in the batch, the model outputs some
 # detections (or instances). The number of detections varies for each input
 # image. Each instance is described by its bounding box, its label, its score

@@ -288,7 +288,7 @@ dog1_masks = dog1_output['masks']
 print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
       f"min = {dog1_masks.min()}, max = {dog1_masks.max()}")

-####################################
+# %%
 # Here the masks correspond to probabilities indicating, for each pixel, how
 # likely it is to belong to the predicted label of that instance. Those
 # predicted labels correspond to the 'labels' element in the same output dict.

@@ -297,7 +297,7 @@ print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
 print("For the first dog, the following instances were detected:")
 print([weights.meta["categories"][label] for label in dog1_output['labels']])

-####################################
+# %%
 # Interestingly, the model detects two persons in the image. Let's go ahead and
 # plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
 # expects boolean masks, we need to convert those probabilities into boolean
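The conversion performed in the elided lines is a threshold on the probability masks; a sketch (0.5 is a typical choice of threshold):

    proba_threshold = 0.5
    dog1_bool_masks = dog1_output["masks"] > proba_threshold  # (N, 1, H, W) bool
    dog1_bool_masks = dog1_bool_masks.squeeze(1)              # -> (N, H, W)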
@@ -315,14 +315,14 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1)
 show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9))

-####################################
+# %%
 # The model seems to have properly detected the dog, but it also confused trees
 # with people. Looking more closely at the scores will help us plot more
 # relevant masks:

 print(dog1_output['scores'])

-####################################
+# %%
 # Clearly the model is more confident about the dog detection than it is about
 # the people detections. That's good news. When plotting the masks, we can ask
 # for only those that have a good score. Let's use a score threshold of .75

@@ -341,12 +341,12 @@ dogs_with_masks = [
 ]
 show(dogs_with_masks)

-####################################
+# %%
 # The two 'people' masks in the first image where not selected because they have
 # a lower score than the score threshold. Similarly, in the second image, the
 # instance with class 15 (which corresponds to 'bench') was not selected.

-####################################
+# %%
 # .. _keypoint_output:
 #
 # Visualizing keypoints

@@ -373,7 +373,7 @@ model = model.eval()
 outputs = model([person_float])
 print(outputs)

-####################################
+# %%
 # As we see the output contains a list of dictionaries.
 # The output list is of length batch_size.
 # We currently have just a single image so length of list is 1.

@@ -388,7 +388,7 @@ scores = outputs[0]['scores']
 print(kpts)
 print(scores)

-####################################
+# %%
 # The KeypointRCNN model detects there are two instances in the image.
 # If you plot the boxes by using :func:`~draw_bounding_boxes`
 # you would recognize they are the person and the surfboard.

@@ -402,7 +402,7 @@ keypoints = kpts[idx]
 print(keypoints)

-####################################
+# %%
 # Great, now we have the keypoints corresponding to the person.
 # Each keypoint is represented by x, y coordinates and the visibility.
 # We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints.

@@ -413,7 +413,7 @@ from torchvision.utils import draw_keypoints
 res = draw_keypoints(person_int, keypoints, colors="blue", radius=3)
 show(res)

-####################################
+# %%
 # As we see the keypoints appear as colored circles over the image.
 # The coco keypoints for a person are ordered and represent the following list.\

@@ -424,7 +424,7 @@ coco_keypoints = [
     "left_knee", "right_knee",
     "left_ankle", "right_ankle",
 ]

-####################################
+# %%
 # What if we are interested in joining the keypoints?
 # This is especially useful in creating pose detection or action recognition.
 # We can join the keypoints easily using the `connectivity` parameter.

@@ -450,7 +450,7 @@ connect_skeleton = [
     (7, 9), (8, 10), (5, 11), (6, 12),
     (11, 13), (12, 14), (13, 15), (14, 16)
 ]

-####################################
+# %%
 # We pass the above list to the connectivity parameter to connect the keypoints.
 #
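The elided call presumably mirrors the earlier :func:`~torchvision.utils.draw_keypoints` call, now with the skeleton; a sketch:

    res = draw_keypoints(
        person_int, keypoints, connectivity=connect_skeleton,
        colors="blue", radius=4, width=3,
    )
    show(res)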