OpenDAS / vision · Commits · cc26cd81

Commit cc26cd81, authored Nov 27, 2023 by panning
Commit message: merge v0.16.0
Parents: f78f29f5, fbb4cc54

Changes (370): showing 10 changed files with 2835 additions and 0 deletions (+2835 / -0).
torchvision/transforms/v2/_container.py             +180   -0
torchvision/transforms/v2/_deprecated.py             +50    -0
torchvision/transforms/v2/_geometry.py               +1447  -0
torchvision/transforms/v2/_meta.py                   +42    -0
torchvision/transforms/v2/_misc.py                   +421   -0
torchvision/transforms/v2/_temporal.py               +28    -0
torchvision/transforms/v2/_transform.py              +176   -0
torchvision/transforms/v2/_type_conversion.py        +92    -0
torchvision/transforms/v2/_utils.py                  +222   -0
torchvision/transforms/v2/functional/__init__.py     +177   -0
Too many changes to show: to preserve performance, only 370 of 370+ files are displayed.

torchvision/transforms/v2/_container.py (new file, mode 100644)

from typing import Any, Callable, Dict, List, Optional, Sequence, Union

import torch

from torch import nn
from torchvision import transforms as _transforms
from torchvision.transforms.v2 import Transform


class Compose(Transform):
    """[BETA] Composes several transforms together.

    .. v2betastatus:: Compose transform

    This transform does not support torchscript. Please see the note below.

    Args:
        transforms (list of ``Transform`` objects): list of transforms to compose.

    Example:
        >>> transforms.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.PILToTensor(),
        >>>     transforms.ConvertImageDtype(torch.float),
        >>> ])

    .. note::
        In order to script the transformations, please use ``torch.nn.Sequential`` as below.

        >>> transforms = torch.nn.Sequential(
        >>>     transforms.CenterCrop(10),
        >>>     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        >>> )
        >>> scripted_transforms = torch.jit.script(transforms)

        Make sure to use only scriptable transformations, i.e. those that work with ``torch.Tensor`` and do not
        require `lambda` functions or ``PIL.Image``.
    """

    def __init__(self, transforms: Sequence[Callable]) -> None:
        super().__init__()
        if not isinstance(transforms, Sequence):
            raise TypeError("Argument transforms should be a sequence of callables")
        elif not transforms:
            raise ValueError("Pass at least one transform")
        self.transforms = transforms

    def forward(self, *inputs: Any) -> Any:
        needs_unpacking = len(inputs) > 1
        for transform in self.transforms:
            outputs = transform(*inputs)
            inputs = outputs if needs_unpacking else (outputs,)
        return outputs

    def extra_repr(self) -> str:
        format_string = []
        for t in self.transforms:
            format_string.append(f"    {t}")
        return "\n".join(format_string)


class RandomApply(Transform):
    """[BETA] Apply randomly a list of transformations with a given probability.

    .. v2betastatus:: RandomApply transform

    .. note::
        In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of a list/tuple
        of transforms as shown below:

        >>> transforms = transforms.RandomApply(torch.nn.ModuleList([
        >>>     transforms.ColorJitter(),
        >>> ]), p=0.3)
        >>> scripted_transforms = torch.jit.script(transforms)

        Make sure to use only scriptable transformations, i.e. those that work with ``torch.Tensor`` and do not
        require `lambda` functions or ``PIL.Image``.

    Args:
        transforms (sequence or torch.nn.Module): list of transformations
        p (float): probability of applying the list of transforms
    """

    _v1_transform_cls = _transforms.RandomApply

    def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None:
        super().__init__()

        if not isinstance(transforms, (Sequence, nn.ModuleList)):
            raise TypeError("Argument transforms should be a sequence of callables or a `nn.ModuleList`")
        self.transforms = transforms

        if not (0.0 <= p <= 1.0):
            raise ValueError("`p` should be a floating point value in the interval [0.0, 1.0].")
        self.p = p

    def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
        return {"transforms": self.transforms, "p": self.p}

    def forward(self, *inputs: Any) -> Any:
        sample = inputs if len(inputs) > 1 else inputs[0]

        if torch.rand(1) >= self.p:
            return sample

        for transform in self.transforms:
            sample = transform(sample)

        return sample

    def extra_repr(self) -> str:
        format_string = []
        for t in self.transforms:
            format_string.append(f"    {t}")
        return "\n".join(format_string)


class RandomChoice(Transform):
    """[BETA] Apply a single transformation randomly picked from a list.

    .. v2betastatus:: RandomChoice transform

    This transform does not support torchscript.

    Args:
        transforms (sequence or torch.nn.Module): list of transformations
        p (list of floats or None, optional): probability of each transform being picked.
            If ``p`` doesn't sum to 1, it is automatically normalized. If ``None``
            (default), all transforms have the same probability.
    """

    def __init__(
        self,
        transforms: Sequence[Callable],
        p: Optional[List[float]] = None,
    ) -> None:
        if not isinstance(transforms, Sequence):
            raise TypeError("Argument transforms should be a sequence of callables")

        if p is None:
            p = [1] * len(transforms)
        elif len(p) != len(transforms):
            raise ValueError(f"Length of p doesn't match the number of transforms: {len(p)} != {len(transforms)}")

        super().__init__()

        self.transforms = transforms
        total = sum(p)
        self.p = [prob / total for prob in p]

    def forward(self, *inputs: Any) -> Any:
        idx = int(torch.multinomial(torch.tensor(self.p), 1))
        transform = self.transforms[idx]
        return transform(*inputs)


class RandomOrder(Transform):
    """[BETA] Apply a list of transformations in a random order.

    .. v2betastatus:: RandomOrder transform

    This transform does not support torchscript.

    Args:
        transforms (sequence or torch.nn.Module): list of transformations
    """

    def __init__(self, transforms: Sequence[Callable]) -> None:
        if not isinstance(transforms, Sequence):
            raise TypeError("Argument transforms should be a sequence of callables")
        super().__init__()
        self.transforms = transforms

    def forward(self, *inputs: Any) -> Any:
        sample = inputs if len(inputs) > 1 else inputs[0]
        for idx in torch.randperm(len(self.transforms)):
            transform = self.transforms[idx]
            sample = transform(sample)
        return sample
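

# ---------------------------------------------------------------------------
# Editor's note: a minimal usage sketch of the container transforms above
# (illustrative only, not part of the committed file). Assumes a torchvision
# build that exposes the v2 namespace, as this commit provides.
import torch
from torchvision.transforms import v2

_pipeline = v2.Compose(
    [
        v2.RandomApply([v2.RandomRotation(degrees=10)], p=0.3),  # rotation applied 30% of the time
        v2.RandomHorizontalFlip(p=0.5),
    ]
)
_example_output = _pipeline(torch.rand(3, 224, 224))  # a plain tensor is treated as an image
# ---------------------------------------------------------------------------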

torchvision/transforms/v2/_deprecated.py (new file, mode 100644)

import warnings
from typing import Any, Dict, Union

import numpy as np
import PIL.Image
import torch

from torchvision.transforms import functional as _F
from torchvision.transforms.v2 import Transform


class ToTensor(Transform):
    """[BETA] [DEPRECATED] Use ``v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])`` instead.

    Convert a PIL Image or ndarray to tensor and scale the values accordingly.

    .. v2betastatus:: ToTensor transform

    .. warning::
        :class:`v2.ToTensor` is deprecated and will be removed in a future release.
        Please use instead ``v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])``.

    This transform does not support torchscript.

    Converts a PIL Image or numpy.ndarray (H x W x C) in the range
    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]
    if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
    or if the numpy.ndarray has dtype = np.uint8.

    In the other cases, tensors are returned without scaling.

    .. note::
        Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
        transforming target image masks. See the `references`_ for implementing the transforms for image masks.

    .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation
    """

    _transformed_types = (PIL.Image.Image, np.ndarray)

    def __init__(self) -> None:
        warnings.warn(
            "The transform `ToTensor()` is deprecated and will be removed in a future release. "
            "Instead, please use `v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])`."
        )
        super().__init__()

    def _transform(self, inpt: Union[PIL.Image.Image, np.ndarray], params: Dict[str, Any]) -> torch.Tensor:
        return _F.to_tensor(inpt)
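

# ---------------------------------------------------------------------------
# Editor's note: a minimal sketch of the replacement recommended by the
# deprecation warning above (illustrative only, not part of the committed file).
import PIL.Image
import torch
from torchvision.transforms import v2

_to_tensor_replacement = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])
_float_image = _to_tensor_replacement(PIL.Image.new("RGB", (32, 32)))  # float32 image scaled to [0.0, 1.0]
# ---------------------------------------------------------------------------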

torchvision/transforms/v2/_geometry.py (new file, mode 100644)

import math
import numbers
import warnings
from typing import Any, Callable, cast, Dict, List, Literal, Optional, Sequence, Tuple, Type, Union

import PIL.Image
import torch

from torchvision import transforms as _transforms, tv_tensors
from torchvision.ops.boxes import box_iou
from torchvision.transforms.functional import _get_perspective_coeffs
from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform
from torchvision.transforms.v2.functional._geometry import _check_interpolation
from torchvision.transforms.v2.functional._utils import _FillType

from ._transform import _RandomApplyTransform
from ._utils import (
    _check_padding_arg,
    _check_padding_mode_arg,
    _check_sequence_input,
    _get_fill,
    _setup_angle,
    _setup_fill_arg,
    _setup_number_or_seq,
    _setup_size,
    get_bounding_boxes,
    has_all,
    has_any,
    is_pure_tensor,
    query_size,
)


class RandomHorizontalFlip(_RandomApplyTransform):
    """[BETA] Horizontally flip the input with a given probability.

    .. v2betastatus:: RandomHorizontalFlip transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        p (float, optional): probability of the input being flipped. Default value is 0.5
    """

    _v1_transform_cls = _transforms.RandomHorizontalFlip

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.horizontal_flip, inpt)


class RandomVerticalFlip(_RandomApplyTransform):
    """[BETA] Vertically flip the input with a given probability.

    .. v2betastatus:: RandomVerticalFlip transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        p (float, optional): probability of the input being flipped. Default value is 0.5
    """

    _v1_transform_cls = _transforms.RandomVerticalFlip

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.vertical_flip, inpt)
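

# ---------------------------------------------------------------------------
# Editor's note: an illustrative sketch (not part of the committed file) of how
# the flip transforms above handle an image and its bounding boxes together,
# assuming the tv_tensors API shipped with this version.
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

_img = tv_tensors.Image(torch.rand(3, 64, 64))
_boxes = tv_tensors.BoundingBoxes([[10, 10, 20, 20]], format="XYXY", canvas_size=(64, 64))
_flipped_img, _flipped_boxes = v2.RandomHorizontalFlip(p=1.0)(_img, _boxes)  # box coordinates are flipped too
# ---------------------------------------------------------------------------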


class Resize(Transform):
    """[BETA] Resize the input to the given size.

    .. v2betastatus:: Resize transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    .. warning::
        The output image might be different depending on its type: when downsampling, the interpolation of PIL images
        and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences
        in the performance of a network. Therefore, it is preferable to train and serve a model with the same input
        types. See also below the ``antialias`` parameter, which can help make the output of PIL images and tensors
        closer.

    Args:
        size (sequence or int): Desired output size. If size is a sequence like
            (h, w), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e., if height > width, then image will be rescaled to
            (size * height / width, size).

            .. note::
                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        max_size (int, optional): The maximum allowed for the longer edge of
            the resized image. If the longer edge of the image is greater
            than ``max_size`` after being resized according to ``size``,
            ``size`` will be overruled so that the longer edge is equal to
            ``max_size``.
            As a result, the smaller edge may be shorter than ``size``. This
            is only supported if ``size`` is an int (or a sequence of length
            1 in torchscript mode).
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True``: will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL doesn't support no antialias.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The current default is ``None`` **but will change to** ``True`` **in
            v0.17** for the PIL and Tensor backends to be consistent.
    """

    _v1_transform_cls = _transforms.Resize

    def __init__(
        self,
        size: Union[int, Sequence[int]],
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        max_size: Optional[int] = None,
        antialias: Optional[Union[str, bool]] = "warn",
    ) -> None:
        super().__init__()

        if isinstance(size, int):
            size = [size]
        elif isinstance(size, (list, tuple)) and len(size) in {1, 2}:
            size = list(size)
        else:
            raise ValueError(
                f"size can either be an integer or a list or tuple of one or two integers, "
                f"but got {size} instead."
            )
        self.size = size

        self.interpolation = _check_interpolation(interpolation)
        self.max_size = max_size
        self.antialias = antialias

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize,
            inpt,
            self.size,
            interpolation=self.interpolation,
            max_size=self.max_size,
            antialias=self.antialias,
        )
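

# ---------------------------------------------------------------------------
# Editor's note: illustrative Resize usage (not part of the committed file).
# Passing antialias=True explicitly avoids the "warn" default described above.
import torch
from torchvision.transforms import v2

_fixed = v2.Resize(size=(224, 224), antialias=True)(torch.rand(3, 512, 384))          # exact output size
_scaled = v2.Resize(size=256, max_size=512, antialias=True)(torch.rand(3, 600, 400))  # smaller edge -> 256, longer edge capped at 512
# ---------------------------------------------------------------------------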


class CenterCrop(Transform):
    """[BETA] Crop the input at the center.

    .. v2betastatus:: CenterCrop transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
    """

    _v1_transform_cls = _transforms.CenterCrop

    def __init__(self, size: Union[int, Sequence[int]]):
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.center_crop, inpt, output_size=self.size)


class RandomResizedCrop(Transform):
    """[BETA] Crop a random portion of the input and resize it to a given size.

    .. v2betastatus:: RandomResizedCrop transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    A crop of the original input is made: the crop has a random area (H * W)
    and a random aspect ratio. This crop is finally resized to the given
    size. This is popularly used to train the Inception networks.

    Args:
        size (int or sequence): expected output size of the crop, for each edge. If size is an
            int instead of sequence like (h, w), a square output size ``(size, size)`` is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).

            .. note::
                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
        scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop,
            before resizing. The scale is defined with respect to the area of the original image.
        ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before
            resizing.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True``: will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL doesn't support no antialias.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The current default is ``None`` **but will change to** ``True`` **in
            v0.17** for the PIL and Tensor backends to be consistent.
    """

    _v1_transform_cls = _transforms.RandomResizedCrop

    def __init__(
        self,
        size: Union[int, Sequence[int]],
        scale: Tuple[float, float] = (0.08, 1.0),
        ratio: Tuple[float, float] = (3.0 / 4.0, 4.0 / 3.0),
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[Union[str, bool]] = "warn",
    ) -> None:
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

        if not isinstance(scale, Sequence):
            raise TypeError("Scale should be a sequence")
        scale = cast(Tuple[float, float], scale)
        if not isinstance(ratio, Sequence):
            raise TypeError("Ratio should be a sequence")
        ratio = cast(Tuple[float, float], ratio)
        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            warnings.warn("Scale and ratio should be of kind (min, max)")

        self.scale = scale
        self.ratio = ratio
        self.interpolation = _check_interpolation(interpolation)
        self.antialias = antialias

        self._log_ratio = torch.log(torch.tensor(self.ratio))

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        height, width = query_size(flat_inputs)
        area = height * width

        log_ratio = self._log_ratio
        for _ in range(10):
            target_area = area * torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
            aspect_ratio = torch.exp(
                torch.empty(1).uniform_(
                    log_ratio[0],  # type: ignore[arg-type]
                    log_ratio[1],  # type: ignore[arg-type]
                )
            ).item()

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if 0 < w <= width and 0 < h <= height:
                i = torch.randint(0, height - h + 1, size=(1,)).item()
                j = torch.randint(0, width - w + 1, size=(1,)).item()
                break
        else:
            # Fallback to central crop
            in_ratio = float(width) / float(height)
            if in_ratio < min(self.ratio):
                w = width
                h = int(round(w / min(self.ratio)))
            elif in_ratio > max(self.ratio):
                h = height
                w = int(round(h * max(self.ratio)))
            else:  # whole image
                w = width
                h = height
            i = (height - h) // 2
            j = (width - w) // 2

        return dict(top=i, left=j, height=h, width=w)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resized_crop,
            inpt,
            **params,
            size=self.size,
            interpolation=self.interpolation,
            antialias=self.antialias,
        )
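

# ---------------------------------------------------------------------------
# Editor's note: illustrative RandomResizedCrop usage (not part of the committed
# file), using the default Inception-style scale and ratio ranges.
import torch
from torchvision.transforms import v2

_rrc = v2.RandomResizedCrop(size=224, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), antialias=True)
_crop = _rrc(torch.rand(3, 500, 333))  # random area/aspect-ratio crop resized to 224x224
# ---------------------------------------------------------------------------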


class FiveCrop(Transform):
    """[BETA] Crop the image or video into four corners and the central crop.

    .. v2betastatus:: FiveCrop transform

    If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
    :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
    For example, the image can have ``[..., C, H, W]`` shape.

    .. Note::
        This transform returns a tuple of images and there may be a mismatch in the number of
        inputs and targets your Dataset returns. See below for an example of how to deal with
        this.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an ``int``
            instead of sequence like (h, w), a square crop of size (size, size) is made.
            If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).

    Example:
        >>> class BatchMultiCrop(transforms.Transform):
        ...     def forward(self, sample: Tuple[Tuple[Union[tv_tensors.Image, tv_tensors.Video], ...], int]):
        ...         images_or_videos, labels = sample
        ...         batch_size = len(images_or_videos)
        ...         image_or_video = images_or_videos[0]
        ...         images_or_videos = tv_tensors.wrap(torch.stack(images_or_videos), like=image_or_video)
        ...         labels = torch.full((batch_size,), labels, device=images_or_videos.device)
        ...         return images_or_videos, labels
        ...
        >>> image = tv_tensors.Image(torch.rand(3, 256, 256))
        >>> label = 3
        >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()])
        >>> images, labels = transform(image, label)
        >>> images.shape
        torch.Size([5, 3, 224, 224])
        >>> labels
        tensor([3, 3, 3, 3, 3])
    """

    _v1_transform_cls = _transforms.FiveCrop

    def __init__(self, size: Union[int, Sequence[int]]) -> None:
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
        if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
            warnings.warn(
                f"{type(self).__name__}() is currently passing through inputs of type "
                f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
            )
        return super()._call_kernel(functional, inpt, *args, **kwargs)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.five_crop, inpt, self.size)

    def _check_inputs(self, flat_inputs: List[Any]) -> None:
        if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
            raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")


class TenCrop(Transform):
    """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of
    these (horizontal flipping is used by default).

    .. v2betastatus:: TenCrop transform

    If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
    :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
    For example, the image can have ``[..., C, H, W]`` shape.

    See :class:`~torchvision.transforms.v2.FiveCrop` for an example.

    .. Note::
        This transform returns a tuple of images and there may be a mismatch in the number of
        inputs and targets your Dataset returns. See below for an example of how to deal with
        this.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
        vertical_flip (bool, optional): Use vertical flipping instead of horizontal
    """

    _v1_transform_cls = _transforms.TenCrop

    def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None:
        super().__init__()
        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
        self.vertical_flip = vertical_flip

    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
        if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
            warnings.warn(
                f"{type(self).__name__}() is currently passing through inputs of type "
                f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
            )
        return super()._call_kernel(functional, inpt, *args, **kwargs)

    def _check_inputs(self, flat_inputs: List[Any]) -> None:
        if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
            raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.ten_crop, inpt, self.size, vertical_flip=self.vertical_flip)
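

# ---------------------------------------------------------------------------
# Editor's note: illustrative FiveCrop/TenCrop usage (not part of the committed
# file). Both return a tuple of crops rather than a single image.
import torch
from torchvision.transforms import v2

_crops = v2.FiveCrop(size=224)(torch.rand(3, 256, 256))                     # tuple of 5 crops
_batch = torch.stack(list(_crops))                                          # shape [5, 3, 224, 224]
_ten = v2.TenCrop(size=224, vertical_flip=False)(torch.rand(3, 256, 256))   # tuple of 10 crops
# ---------------------------------------------------------------------------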


class Pad(Transform):
    """[BETA] Pad the input on all sides with the given "pad" value.

    .. v2betastatus:: Pad transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        padding (int or sequence): Padding on each border. If a single int is provided this
            is used to pad all borders. If sequence of length 2 is provided this is the padding
            on left/right and top/bottom respectively. If a sequence of length 4 is provided
            this is the padding for the left, top, right and bottom borders respectively.

            .. note::
                In torchscript mode padding as single int is not supported, use a sequence of
                length 1: ``[padding, ]``.
        fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can be also a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
            Default is "constant".

            - constant: pads with a constant value, this value is specified with fill
            - edge: pads with the last value at the edge of the image.
            - reflect: pads with reflection of image without repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
              will result in [3, 2, 1, 2, 3, 4, 3, 2]
            - symmetric: pads with reflection of image repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
              will result in [2, 1, 1, 2, 3, 4, 4, 3]
    """

    _v1_transform_cls = _transforms.Pad

    def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
        params = super()._extract_params_for_v1_transform()

        if not (params["fill"] is None or isinstance(params["fill"], (int, float))):
            raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")

        return params

    def __init__(
        self,
        padding: Union[int, Sequence[int]],
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
    ) -> None:
        super().__init__()

        _check_padding_arg(padding)
        _check_padding_mode_arg(padding_mode)

        # This cast does Sequence[int] -> List[int] and is required to make mypy happy
        if not isinstance(padding, int):
            padding = list(padding)
        self.padding = padding
        self.fill = fill
        self._fill = _setup_fill_arg(fill)
        self.padding_mode = padding_mode

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(F.pad, inpt, padding=self.padding, fill=fill, padding_mode=self.padding_mode)  # type: ignore[arg-type]
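

# ---------------------------------------------------------------------------
# Editor's note: illustrative Pad usage with a per-type fill dictionary (not
# part of the committed file).
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

_pad = v2.Pad(padding=[4, 8], fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}, padding_mode="constant")
_padded_img, _padded_mask = _pad(
    tv_tensors.Image(torch.zeros(3, 32, 32, dtype=torch.uint8)),  # padded with 127
    tv_tensors.Mask(torch.zeros(32, 32, dtype=torch.int64)),      # padded with 0
)
# ---------------------------------------------------------------------------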


class RandomZoomOut(_RandomApplyTransform):
    """[BETA] "Zoom out" transformation from
    `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.

    .. v2betastatus:: RandomZoomOut transform

    This transformation randomly pads images, videos, bounding boxes and masks creating a zoom out effect.
    Output spatial size is randomly sampled from original size up to a maximum size configured
    with ``side_range`` parameter:

    .. code-block:: python

        r = uniform_sample(side_range[0], side_range[1])
        output_width = input_width * r
        output_height = input_height * r

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can be also a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        side_range (sequence of floats, optional): tuple of two floats defines minimum and maximum factors to
            scale the input size.
        p (float, optional): probability that the zoom operation will be performed.
    """

    def __init__(
        self,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        side_range: Sequence[float] = (1.0, 4.0),
        p: float = 0.5,
    ) -> None:
        super().__init__(p=p)

        self.fill = fill
        self._fill = _setup_fill_arg(fill)

        _check_sequence_input(side_range, "side_range", req_sizes=(2,))

        self.side_range = side_range
        if side_range[0] < 1.0 or side_range[0] > side_range[1]:
            raise ValueError(f"Invalid canvas side range provided {side_range}.")

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_h, orig_w = query_size(flat_inputs)

        r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0])
        canvas_width = int(orig_w * r)
        canvas_height = int(orig_h * r)

        r = torch.rand(2)
        left = int((canvas_width - orig_w) * r[0])
        top = int((canvas_height - orig_h) * r[1])
        right = canvas_width - (left + orig_w)
        bottom = canvas_height - (top + orig_h)
        padding = [left, top, right, bottom]

        return dict(padding=padding)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(F.pad, inpt, **params, fill=fill)


class RandomRotation(Transform):
    """[BETA] Rotate the input by angle.

    .. v2betastatus:: RandomRotation transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        degrees (sequence or number): Range of degrees to select from.
            If degrees is a number instead of sequence like (min, max), the range of degrees
            will be (-degrees, +degrees).
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        expand (bool, optional): Optional expansion flag.
            If true, expands the output to make it large enough to hold the entire rotated image.
            If false or omitted, make the output image the same size as the input image.
            Note that the expand flag assumes rotation around the center (see note below) and no translation.
        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
            Default is the center of the image.

            .. note::
                In theory, setting ``center`` has no effect if ``expand=True``, since the image center will become the
                center of rotation. In practice however, due to numerical precision, this can lead to off-by-one
                differences of the resulting image size compared to using the image center in the first place. Thus,
                when setting ``expand=True``, it's best to leave ``center=None`` (default).
        fill (number or tuple or dict, optional): Pixel fill value for the area outside the rotated image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can be also a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.

    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
    """

    _v1_transform_cls = _transforms.RandomRotation

    def __init__(
        self,
        degrees: Union[numbers.Number, Sequence],
        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
        expand: bool = False,
        center: Optional[List[float]] = None,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
    ) -> None:
        super().__init__()
        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
        self.interpolation = _check_interpolation(interpolation)
        self.expand = expand

        self.fill = fill
        self._fill = _setup_fill_arg(fill)

        if center is not None:
            _check_sequence_input(center, "center", req_sizes=(2,))

        self.center = center

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
        return dict(angle=angle)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.rotate,
            inpt,
            **params,
            interpolation=self.interpolation,
            expand=self.expand,
            center=self.center,
            fill=fill,
        )


class RandomAffine(Transform):
    """[BETA] Random affine transformation of the input, keeping center invariant.

    .. v2betastatus:: RandomAffine transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        degrees (sequence or number): Range of degrees to select from.
            If degrees is a number instead of sequence like (min, max), the range of degrees
            will be (-degrees, +degrees). Set to 0 to deactivate rotations.
        translate (tuple, optional): tuple of maximum absolute fraction for horizontal
            and vertical translations. For example translate=(a, b), then horizontal shift
            is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
            randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
        scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is
            randomly sampled from the range a <= scale <= b. Will keep original scale by default.
        shear (sequence or number, optional): Range of degrees to select from.
            If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear)
            will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the
            range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
            an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
            Will not apply shear by default.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can be also a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
            Default is the center of the image.

    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
    """

    _v1_transform_cls = _transforms.RandomAffine

    def __init__(
        self,
        degrees: Union[numbers.Number, Sequence],
        translate: Optional[Sequence[float]] = None,
        scale: Optional[Sequence[float]] = None,
        shear: Optional[Union[int, float, Sequence[float]]] = None,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        center: Optional[List[float]] = None,
    ) -> None:
        super().__init__()
        self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
        if translate is not None:
            _check_sequence_input(translate, "translate", req_sizes=(2,))
            for t in translate:
                if not (0.0 <= t <= 1.0):
                    raise ValueError("translation values should be between 0 and 1")
        self.translate = translate
        if scale is not None:
            _check_sequence_input(scale, "scale", req_sizes=(2,))
            for s in scale:
                if s <= 0:
                    raise ValueError("scale values should be positive")
        self.scale = scale

        if shear is not None:
            self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4))
        else:
            self.shear = shear

        self.interpolation = _check_interpolation(interpolation)
        self.fill = fill
        self._fill = _setup_fill_arg(fill)

        if center is not None:
            _check_sequence_input(center, "center", req_sizes=(2,))

        self.center = center

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        height, width = query_size(flat_inputs)

        angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
        if self.translate is not None:
            max_dx = float(self.translate[0] * width)
            max_dy = float(self.translate[1] * height)
            tx = int(round(torch.empty(1).uniform_(-max_dx, max_dx).item()))
            ty = int(round(torch.empty(1).uniform_(-max_dy, max_dy).item()))
            translate = (tx, ty)
        else:
            translate = (0, 0)

        if self.scale is not None:
            scale = torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
        else:
            scale = 1.0

        shear_x = shear_y = 0.0
        if self.shear is not None:
            shear_x = torch.empty(1).uniform_(self.shear[0], self.shear[1]).item()
            if len(self.shear) == 4:
                shear_y = torch.empty(1).uniform_(self.shear[2], self.shear[3]).item()

        shear = (shear_x, shear_y)
        return dict(angle=angle, translate=translate, scale=scale, shear=shear)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.affine,
            inpt,
            **params,
            interpolation=self.interpolation,
            fill=fill,
            center=self.center,
        )
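

# ---------------------------------------------------------------------------
# Editor's note: illustrative RandomAffine usage (not part of the committed
# file), combining rotation, translation, scaling and shear.
import torch
from torchvision.transforms import v2

_affine = v2.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=5)
_warped = _affine(torch.rand(3, 224, 224))
# ---------------------------------------------------------------------------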


class RandomCrop(Transform):
    """[BETA] Crop the input at a random location.

    .. v2betastatus:: RandomCrop transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
        padding (int or sequence, optional): Optional padding on each border
            of the image. Default is None. If a single int is provided this
            is used to pad all borders. If sequence of length 2 is provided this is the padding
            on left/right and top/bottom respectively. If a sequence of length 4 is provided
            this is the padding for the left, top, right and bottom borders respectively.

            .. note::
                In torchscript mode padding as single int is not supported, use a sequence of
                length 1: ``[padding, ]``.
        pad_if_needed (boolean, optional): It will pad the image if smaller than the
            desired size to avoid raising an exception. Since cropping is done
            after padding, the padding seems to be done at a random offset.
        fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can be also a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
        padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
            Default is constant.

            - constant: pads with a constant value, this value is specified with fill
            - edge: pads with the last value at the edge of the image.
            - reflect: pads with reflection of image without repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
              will result in [3, 2, 1, 2, 3, 4, 3, 2]
            - symmetric: pads with reflection of image repeating the last value on the edge.
              For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
              will result in [2, 1, 1, 2, 3, 4, 4, 3]
    """

    _v1_transform_cls = _transforms.RandomCrop

    def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
        params = super()._extract_params_for_v1_transform()

        if not (params["fill"] is None or isinstance(params["fill"], (int, float))):
            raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")

        padding = self.padding
        if padding is not None:
            pad_left, pad_right, pad_top, pad_bottom = padding
            padding = [pad_left, pad_top, pad_right, pad_bottom]
        params["padding"] = padding

        return params

    def __init__(
        self,
        size: Union[int, Sequence[int]],
        padding: Optional[Union[int, Sequence[int]]] = None,
        pad_if_needed: bool = False,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
        padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
    ) -> None:
        super().__init__()

        self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")

        if pad_if_needed or padding is not None:
            if padding is not None:
                _check_padding_arg(padding)
            _check_padding_mode_arg(padding_mode)

        self.padding = F._geometry._parse_pad_padding(padding) if padding else None  # type: ignore[arg-type]
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self._fill = _setup_fill_arg(fill)
        self.padding_mode = padding_mode

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        padded_height, padded_width = query_size(flat_inputs)

        if self.padding is not None:
            pad_left, pad_right, pad_top, pad_bottom = self.padding
            padded_height += pad_top + pad_bottom
            padded_width += pad_left + pad_right
        else:
            pad_left = pad_right = pad_top = pad_bottom = 0

        cropped_height, cropped_width = self.size

        if self.pad_if_needed:
            if padded_height < cropped_height:
                diff = cropped_height - padded_height

                pad_top += diff
                pad_bottom += diff
                padded_height += 2 * diff

            if padded_width < cropped_width:
                diff = cropped_width - padded_width

                pad_left += diff
                pad_right += diff
                padded_width += 2 * diff

        if padded_height < cropped_height or padded_width < cropped_width:
            raise ValueError(
                f"Required crop size {(cropped_height, cropped_width)} is larger than "
                f"{'padded ' if self.padding is not None else ''}input image size {(padded_height, padded_width)}."
            )

        # We need a different order here than we have in self.padding since this padding will be parsed again in `F.pad`
        padding = [pad_left, pad_top, pad_right, pad_bottom]
        needs_pad = any(padding)

        needs_vert_crop, top = (
            (True, int(torch.randint(0, padded_height - cropped_height + 1, size=())))
            if padded_height > cropped_height
            else (False, 0)
        )
        needs_horz_crop, left = (
            (True, int(torch.randint(0, padded_width - cropped_width + 1, size=())))
            if padded_width > cropped_width
            else (False, 0)
        )

        return dict(
            needs_crop=needs_vert_crop or needs_horz_crop,
            top=top,
            left=left,
            height=cropped_height,
            width=cropped_width,
            needs_pad=needs_pad,
            padding=padding,
        )

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        if params["needs_pad"]:
            fill = _get_fill(self._fill, type(inpt))
            inpt = self._call_kernel(F.pad, inpt, padding=params["padding"], fill=fill, padding_mode=self.padding_mode)

        if params["needs_crop"]:
            inpt = self._call_kernel(
                F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
            )

        return inpt
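

# ---------------------------------------------------------------------------
# Editor's note: illustrative RandomCrop usage (not part of the committed file).
# With pad_if_needed=True, inputs smaller than the crop size are padded first.
import torch
from torchvision.transforms import v2

_crop = v2.RandomCrop(size=(64, 64), pad_if_needed=True, fill=0, padding_mode="constant")
_out = _crop(torch.rand(3, 48, 80))  # height 48 is padded up before the 64x64 crop
# ---------------------------------------------------------------------------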


class RandomPerspective(_RandomApplyTransform):
    """[BETA] Perform a random perspective transformation of the input with a given probability.

    .. v2betastatus:: RandomPerspective transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Args:
        distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1.
            Default is 0.5.
        p (float, optional): probability of the input being transformed. Default is 0.5.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can be also a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
    """

    _v1_transform_cls = _transforms.RandomPerspective

    def __init__(
        self,
        distortion_scale: float = 0.5,
        p: float = 0.5,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
    ) -> None:
        super().__init__(p=p)

        if not (0 <= distortion_scale <= 1):
            raise ValueError("Argument distortion_scale value should be between 0 and 1")

        self.distortion_scale = distortion_scale
        self.interpolation = _check_interpolation(interpolation)
        self.fill = fill
        self._fill = _setup_fill_arg(fill)

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        height, width = query_size(flat_inputs)

        distortion_scale = self.distortion_scale

        half_height = height // 2
        half_width = width // 2
        bound_height = int(distortion_scale * half_height) + 1
        bound_width = int(distortion_scale * half_width) + 1
        topleft = [
            int(torch.randint(0, bound_width, size=(1,))),
            int(torch.randint(0, bound_height, size=(1,))),
        ]
        topright = [
            int(torch.randint(width - bound_width, width, size=(1,))),
            int(torch.randint(0, bound_height, size=(1,))),
        ]
        botright = [
            int(torch.randint(width - bound_width, width, size=(1,))),
            int(torch.randint(height - bound_height, height, size=(1,))),
        ]
        botleft = [
            int(torch.randint(0, bound_width, size=(1,))),
            int(torch.randint(height - bound_height, height, size=(1,))),
        ]
        startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]
        endpoints = [topleft, topright, botright, botleft]
        perspective_coeffs = _get_perspective_coeffs(startpoints, endpoints)
        return dict(coefficients=perspective_coeffs)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.perspective,
            inpt,
            None,
            None,
            fill=fill,
            interpolation=self.interpolation,
            **params,
        )


class ElasticTransform(Transform):
    """[BETA] Transform the input with elastic transformations.

    .. v2betastatus:: ElasticTransform transform

    If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
    :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
    it can have arbitrary number of leading batch dimensions. For example,
    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

    Given alpha and sigma, it will generate displacement
    vectors for all pixels based on random offsets. Alpha controls the strength
    and sigma controls the smoothness of the displacements.
    The displacements are added to an identity grid and the resulting grid is
    used to transform the input.

    .. note::
        Implementation to transform bounding boxes is approximative (not exact).
        We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``.
        This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``.
        Our assumption is that ``displacement * displacement`` is small and can be ignored.
        Large displacements would lead to large errors in the approximation.

    Applications:
        Randomly transforms the morphology of objects in images and produces a
        see-through-water-like effect.

    Args:
        alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0.
        sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0.
        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
            Fill value can be also a dictionary mapping data type to the fill value, e.g.
            ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
            ``Mask`` will be filled with 0.
    """

    _v1_transform_cls = _transforms.ElasticTransform

    def __init__(
        self,
        alpha: Union[float, Sequence[float]] = 50.0,
        sigma: Union[float, Sequence[float]] = 5.0,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
    ) -> None:
        super().__init__()
        self.alpha = _setup_number_or_seq(alpha, "alpha")
        self.sigma = _setup_number_or_seq(sigma, "sigma")

        self.interpolation = _check_interpolation(interpolation)
        self.fill = fill
        self._fill = _setup_fill_arg(fill)

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        size = list(query_size(flat_inputs))

        dx = torch.rand([1, 1] + size) * 2 - 1
        if self.sigma[0] > 0.0:
            kx = int(8 * self.sigma[0] + 1)
            # if kernel size is even we have to make it odd
            if kx % 2 == 0:
                kx += 1
            dx = self._call_kernel(F.gaussian_blur, dx, [kx, kx], list(self.sigma))
        dx = dx * self.alpha[0] / size[0]

        dy = torch.rand([1, 1] + size) * 2 - 1
        if self.sigma[1] > 0.0:
            ky = int(8 * self.sigma[1] + 1)
            # if kernel size is even we have to make it odd
            if ky % 2 == 0:
                ky += 1
            dy = self._call_kernel(F.gaussian_blur, dy, [ky, ky], list(self.sigma))
        dy = dy * self.alpha[1] / size[1]
        displacement = torch.concat([dx, dy], 1).permute([0, 2, 3, 1])  # 1 x H x W x 2
        return dict(displacement=displacement)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        fill = _get_fill(self._fill, type(inpt))
        return self._call_kernel(
            F.elastic,
            inpt,
            **params,
            fill=fill,
            interpolation=self.interpolation,
        )
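

# ---------------------------------------------------------------------------
# Editor's note: illustrative ElasticTransform usage (not part of the committed
# file). Larger alpha gives stronger displacements, larger sigma smoother ones.
import torch
from torchvision.transforms import v2

_elastic = v2.ElasticTransform(alpha=50.0, sigma=5.0)
_distorted = _elastic(torch.rand(3, 128, 128))
# ---------------------------------------------------------------------------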
class
RandomIoUCrop
(
Transform
):
"""[BETA] Random IoU crop transformation from
`"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
.. v2betastatus:: RandomIoUCrop transform
This transformation requires an image or video data and ``tv_tensors.BoundingBoxes`` in the input.
.. warning::
In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop`
must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately
after or later in the transforms pipeline.
If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
:class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
Args:
min_scale (float, optional): Minimum factors to scale the input size.
max_scale (float, optional): Maximum factors to scale the input size.
min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video.
max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video.
sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and
a cropped image or video. Default, ``None`` which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]``
trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap.
Default, 40.
"""
    def __init__(
        self,
        min_scale: float = 0.3,
        max_scale: float = 1.0,
        min_aspect_ratio: float = 0.5,
        max_aspect_ratio: float = 2.0,
        sampler_options: Optional[List[float]] = None,
        trials: int = 40,
    ):
        super().__init__()
        # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.min_aspect_ratio = min_aspect_ratio
        self.max_aspect_ratio = max_aspect_ratio
        if sampler_options is None:
            sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
        self.options = sampler_options
        self.trials = trials

    def _check_inputs(self, flat_inputs: List[Any]) -> None:
        if not (
            has_all(flat_inputs, tv_tensors.BoundingBoxes)
            and has_any(flat_inputs, PIL.Image.Image, tv_tensors.Image, is_pure_tensor)
        ):
            raise TypeError(
                f"{type(self).__name__}() requires input sample to contain tensor or PIL images "
                "and bounding boxes. Sample can also contain masks."
            )

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_h, orig_w = query_size(flat_inputs)
        bboxes = get_bounding_boxes(flat_inputs)

        while True:
            # sample an option
            idx = int(torch.randint(low=0, high=len(self.options), size=(1,)))
            min_jaccard_overlap = self.options[idx]
            if min_jaccard_overlap >= 1.0:  # a value larger than 1 encodes the leave as-is option
                return dict()

            for _ in range(self.trials):
                # check the aspect ratio limitations
                r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2)
                new_w = int(orig_w * r[0])
                new_h = int(orig_h * r[1])
                aspect_ratio = new_w / new_h
                if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio):
                    continue

                # check for 0 area crops
                r = torch.rand(2)
                left = int((orig_w - new_w) * r[0])
                top = int((orig_h - new_h) * r[1])
                right = left + new_w
                bottom = top + new_h
                if left == right or top == bottom:
                    continue

                # check for any valid boxes with centers within the crop area
                xyxy_bboxes = F.convert_bounding_box_format(
                    bboxes.as_subclass(torch.Tensor),
                    bboxes.format,
                    tv_tensors.BoundingBoxFormat.XYXY,
                )
                cx = 0.5 * (xyxy_bboxes[..., 0] + xyxy_bboxes[..., 2])
                cy = 0.5 * (xyxy_bboxes[..., 1] + xyxy_bboxes[..., 3])
                is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom)
                if not is_within_crop_area.any():
                    continue

                # check at least 1 box with jaccard limitations
                xyxy_bboxes = xyxy_bboxes[is_within_crop_area]
                ious = box_iou(
                    xyxy_bboxes,
                    torch.tensor([[left, top, right, bottom]], dtype=xyxy_bboxes.dtype, device=xyxy_bboxes.device),
                )
                if ious.max() < min_jaccard_overlap:
                    continue

                return dict(top=top, left=left, height=new_h, width=new_w, is_within_crop_area=is_within_crop_area)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        if len(params) < 1:
            return inpt

        output = self._call_kernel(
            F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
        )

        if isinstance(output, tv_tensors.BoundingBoxes):
            # We "mark" the invalid boxes as degenerate, and they can be
            # removed by a later call to SanitizeBoundingBoxes()
            output[~params["is_within_crop_area"]] = 0

        return output
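
A minimal pipeline sketch (not part of the commit) illustrating the warning above: RandomIoUCrop followed by SanitizeBoundingBoxes so that the boxes zeroed out in _transform() are actually removed. The sample data is synthetic.

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

img = tv_tensors.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
boxes = tv_tensors.BoundingBoxes(
    [[10, 10, 100, 100], [200, 150, 350, 300]], format="XYXY", canvas_size=(480, 640)
)
sample = {"image": img, "boxes": boxes, "labels": torch.tensor([1, 2])}

pipeline = v2.Compose([v2.RandomIoUCrop(), v2.SanitizeBoundingBoxes()])
out = pipeline(sample)  # boxes whose centers fall outside the crop are dropped together with their labels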
class ScaleJitter(Transform):
"""[BETA] Perform Large Scale Jitter on the input according to
`"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" <https://arxiv.org/abs/2012.07177>`_.
.. v2betastatus:: ScaleJitter transform
If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
:class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
Args:
target_size (tuple of int): Target size. This parameter defines base scale for jittering,
e.g. ``min(target_size[0] / width, target_size[1] / height)``.
scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``.
interpolation (InterpolationMode, optional): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
antialias (bool, optional): Whether to apply antialiasing.
It only affects **tensors** with bilinear or bicubic modes and it is
ignored otherwise: on PIL images, antialiasing is always applied on
bilinear or bicubic modes; on other modes (for PIL images and
tensors), antialiasing makes no sense and this parameter is ignored.
Possible values are:
- ``True``: will apply antialiasing for bilinear or bicubic modes.
Other modes aren't affected. This is probably what you want to use.
- ``False``: will not apply antialiasing for tensors on any mode. PIL
images are still antialiased on bilinear or bicubic modes, because
PIL doesn't support no antialias.
- ``None``: equivalent to ``False`` for tensors and ``True`` for
PIL images. This value exists for legacy reasons and you probably
don't want to use it unless you really know what you are doing.
The current default is ``None`` **but will change to** ``True`` **in
v0.17** for the PIL and Tensor backends to be consistent.
"""
    def __init__(
        self,
        target_size: Tuple[int, int],
        scale_range: Tuple[float, float] = (0.1, 2.0),
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[Union[str, bool]] = "warn",
    ):
        super().__init__()
        self.target_size = target_size
        self.scale_range = scale_range
        self.interpolation = _check_interpolation(interpolation)
        self.antialias = antialias

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_height, orig_width = query_size(flat_inputs)

        scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0])
        r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale
        new_width = int(orig_width * r)
        new_height = int(orig_height * r)

        return dict(size=(new_height, new_width))

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
        )
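
A minimal usage sketch (not part of the commit), assuming torchvision v0.16; the image is a synthetic placeholder.

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

img = tv_tensors.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
jitter = v2.ScaleJitter(target_size=(1024, 1024), scale_range=(0.1, 2.0), antialias=True)
out = jitter(img)  # resized by a random factor in scale_range times the base scale defined by target_size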
class RandomShortestSize(Transform):
"""[BETA] Randomly resize the input.
.. v2betastatus:: RandomShortestSize transform
If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
:class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
Args:
min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values.
max_size (int, optional): Maximum spatial size. Default, None.
interpolation (InterpolationMode, optional): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
antialias (bool, optional): Whether to apply antialiasing.
It only affects **tensors** with bilinear or bicubic modes and it is
ignored otherwise: on PIL images, antialiasing is always applied on
bilinear or bicubic modes; on other modes (for PIL images and
tensors), antialiasing makes no sense and this parameter is ignored.
Possible values are:
- ``True``: will apply antialiasing for bilinear or bicubic modes.
Other modes aren't affected. This is probably what you want to use.
- ``False``: will not apply antialiasing for tensors on any mode. PIL
images are still antialiased on bilinear or bicubic modes, because
PIL doesn't support no antialias.
- ``None``: equivalent to ``False`` for tensors and ``True`` for
PIL images. This value exists for legacy reasons and you probably
don't want to use it unless you really know what you are doing.
The current default is ``None`` **but will change to** ``True`` **in
v0.17** for the PIL and Tensor backends to be consistent.
"""
    def __init__(
        self,
        min_size: Union[List[int], Tuple[int], int],
        max_size: Optional[int] = None,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[Union[str, bool]] = "warn",
    ):
        super().__init__()
        self.min_size = [min_size] if isinstance(min_size, int) else list(min_size)
        self.max_size = max_size
        self.interpolation = _check_interpolation(interpolation)
        self.antialias = antialias

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_height, orig_width = query_size(flat_inputs)

        min_size = self.min_size[int(torch.randint(len(self.min_size), ()))]
        r = min_size / min(orig_height, orig_width)
        if self.max_size is not None:
            r = min(r, self.max_size / max(orig_height, orig_width))

        new_width = int(orig_width * r)
        new_height = int(orig_height * r)

        return dict(size=(new_height, new_width))

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
        )
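
A minimal usage sketch (not part of the commit); the min_size values are arbitrary placeholders.

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

img = tv_tensors.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
resize = v2.RandomShortestSize(min_size=[480, 512, 544], max_size=1000, antialias=True)
out = resize(img)  # shorter side matches a randomly picked min_size, capped so the longer side stays <= max_size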
class RandomResize(Transform):
"""[BETA] Randomly resize the input.
.. v2betastatus:: RandomResize transform
This transformation can be used together with ``RandomCrop`` as data augmentations to train
models on image segmentation task.
Output spatial size is randomly sampled from the interval ``[min_size, max_size]``:
.. code-block:: python

    size = uniform_sample(min_size, max_size)
    output_width = size
    output_height = size
If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
:class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
it can have arbitrary number of leading batch dimensions. For example,
the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
Args:
min_size (int): Minimum output size for random sampling
max_size (int): Maximum output size for random sampling
interpolation (InterpolationMode, optional): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
antialias (bool, optional): Whether to apply antialiasing.
It only affects **tensors** with bilinear or bicubic modes and it is
ignored otherwise: on PIL images, antialiasing is always applied on
bilinear or bicubic modes; on other modes (for PIL images and
tensors), antialiasing makes no sense and this parameter is ignored.
Possible values are:
- ``True``: will apply antialiasing for bilinear or bicubic modes.
Other modes aren't affected. This is probably what you want to use.
- ``False``: will not apply antialiasing for tensors on any mode. PIL
images are still antialiased on bilinear or bicubic modes, because
PIL doesn't support no antialias.
- ``None``: equivalent to ``False`` for tensors and ``True`` for
PIL images. This value exists for legacy reasons and you probably
don't want to use it unless you really know what you are doing.
The current default is ``None`` **but will change to** ``True`` **in
v0.17** for the PIL and Tensor backends to be consistent.
"""
    def __init__(
        self,
        min_size: int,
        max_size: int,
        interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
        antialias: Optional[Union[str, bool]] = "warn",
    ) -> None:
        super().__init__()
        self.min_size = min_size
        self.max_size = max_size
        self.interpolation = _check_interpolation(interpolation)
        self.antialias = antialias

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        size = int(torch.randint(self.min_size, self.max_size, ()))
        return dict(size=[size])

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(
            F.resize, inpt, params["size"], interpolation=self.interpolation, antialias=self.antialias
        )
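
A minimal usage sketch (not part of the commit); the sizes are placeholders.

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

img = tv_tensors.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
resize = v2.RandomResize(min_size=256, max_size=512, antialias=True)
out = resize(img)  # the target size is drawn uniformly from [min_size, max_size) on every call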
torchvision/transforms/v2/_meta.py
0 → 100644
from typing import Any, Dict, Union

from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F, Transform


class ConvertBoundingBoxFormat(Transform):
"""[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY".
.. v2betastatus:: ConvertBoundingBoxFormat transform
Args:
format (str or tv_tensors.BoundingBoxFormat): output bounding box format.
Possible values are defined by :class:`~torchvision.tv_tensors.BoundingBoxFormat` and
string values match the enums, e.g. "XYXY" or "XYWH" etc.
"""
    _transformed_types = (tv_tensors.BoundingBoxes,)

    def __init__(self, format: Union[str, tv_tensors.BoundingBoxFormat]) -> None:
        super().__init__()
        if isinstance(format, str):
            format = tv_tensors.BoundingBoxFormat[format]
        self.format = format

    def _transform(self, inpt: tv_tensors.BoundingBoxes, params: Dict[str, Any]) -> tv_tensors.BoundingBoxes:
        return F.convert_bounding_box_format(inpt, new_format=self.format)  # type: ignore[return-value]
class ClampBoundingBoxes(Transform):
"""[BETA] Clamp bounding boxes to their corresponding image dimensions.
The clamping is done according to the bounding boxes' ``canvas_size`` meta-data.
.. v2betastatus:: ClampBoundingBoxes transform
"""
    _transformed_types = (tv_tensors.BoundingBoxes,)

    def _transform(self, inpt: tv_tensors.BoundingBoxes, params: Dict[str, Any]) -> tv_tensors.BoundingBoxes:
        return F.clamp_bounding_boxes(inpt)  # type: ignore[return-value]
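
A minimal sketch (not part of the commit) combining the two transforms in this file on a synthetic box.

from torchvision import tv_tensors
from torchvision.transforms import v2

boxes = tv_tensors.BoundingBoxes([[50, 50, 100, 80]], format="XYWH", canvas_size=(480, 640))
boxes = v2.ConvertBoundingBoxFormat("XYXY")(boxes)  # -> [[50, 50, 150, 130]]
boxes = v2.ClampBoundingBoxes()(boxes)              # coordinates clamped to the 480x640 canvas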
torchvision/transforms/v2/_misc.py
0 → 100644
import warnings
from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Type, Union

import PIL.Image
import torch
from torch.utils._pytree import tree_flatten, tree_unflatten

from torchvision import transforms as _transforms, tv_tensors
from torchvision.transforms.v2 import functional as F, Transform

from ._utils import _parse_labels_getter, _setup_number_or_seq, _setup_size, get_bounding_boxes, has_any, is_pure_tensor


# TODO: do we want/need to expose this?
class Identity(Transform):
    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return inpt


class Lambda(Transform):
"""[BETA] Apply a user-defined function as a transform.
.. v2betastatus:: Lambda transform
This transform does not support torchscript.
Args:
lambd (function): Lambda/function to be used for transform.
"""
    _transformed_types = (object,)

    def __init__(self, lambd: Callable[[Any], Any], *types: Type):
        super().__init__()
        self.lambd = lambd
        self.types = types or self._transformed_types

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        if isinstance(inpt, self.types):
            return self.lambd(inpt)
        else:
            return inpt

    def extra_repr(self) -> str:
        extras = []
        name = getattr(self.lambd, "__name__", None)
        if name:
            extras.append(name)
        extras.append(f"types={[type.__name__ for type in self.types]}")
        return ", ".join(extras)


class LinearTransformation(Transform):
"""[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.
.. v2betastatus:: LinearTransformation transform
This transform does not support PIL Image.
Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
subtract mean_vector from it which is then followed by computing the dot
product with the transformation matrix and then reshaping the tensor to its
original shape.
Applications:
whitening transformation: Suppose X is a column vector zero-centered data.
Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
perform SVD on this matrix and pass it as transformation_matrix.
Args:
transformation_matrix (Tensor): tensor [D x D], D = C x H x W
mean_vector (Tensor): tensor [D], D = C x H x W
"""
    _v1_transform_cls = _transforms.LinearTransformation

    _transformed_types = (is_pure_tensor, tv_tensors.Image, tv_tensors.Video)

    def __init__(self, transformation_matrix: torch.Tensor, mean_vector: torch.Tensor):
        super().__init__()
        if transformation_matrix.size(0) != transformation_matrix.size(1):
            raise ValueError(
                "transformation_matrix should be square. Got "
                f"{tuple(transformation_matrix.size())} rectangular matrix."
            )

        if mean_vector.size(0) != transformation_matrix.size(0):
            raise ValueError(
                f"mean_vector should have the same length {mean_vector.size(0)}"
                f" as any one of the dimensions of the transformation_matrix [{tuple(transformation_matrix.size())}]"
            )

        if transformation_matrix.device != mean_vector.device:
            raise ValueError(
                f"Input tensors should be on the same device. Got {transformation_matrix.device} and {mean_vector.device}"
            )

        if transformation_matrix.dtype != mean_vector.dtype:
            raise ValueError(
                f"Input tensors should have the same dtype. Got {transformation_matrix.dtype} and {mean_vector.dtype}"
            )

        self.transformation_matrix = transformation_matrix
        self.mean_vector = mean_vector

    def _check_inputs(self, sample: Any) -> Any:
        if has_any(sample, PIL.Image.Image):
            raise TypeError(f"{type(self).__name__}() does not support PIL images.")

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        shape = inpt.shape
        n = shape[-3] * shape[-2] * shape[-1]
        if n != self.transformation_matrix.shape[0]:
            raise ValueError(
                "Input tensor and transformation matrix have incompatible shape."
                + f"[{shape[-3]} x {shape[-2]} x {shape[-1]}] != "
                + f"{self.transformation_matrix.shape[0]}"
            )

        if inpt.device.type != self.mean_vector.device.type:
            raise ValueError(
                "Input tensor should be on the same device as transformation matrix and mean vector. "
                f"Got {inpt.device} vs {self.mean_vector.device}"
            )

        flat_inpt = inpt.reshape(-1, n) - self.mean_vector

        transformation_matrix = self.transformation_matrix.to(flat_inpt.dtype)
        output = torch.mm(flat_inpt, transformation_matrix)
        output = output.reshape(shape)

        if isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)):
            output = tv_tensors.wrap(output, like=inpt)
        return output


class Normalize(Transform):
"""[BETA] Normalize a tensor image or video with mean and standard deviation.
.. v2betastatus:: Normalize transform
This transform does not support PIL Image.
Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
channels, this transform will normalize each channel of the input
``torch.*Tensor`` i.e.,
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
.. note::
This transform acts out of place, i.e., it does not mutate the input tensor.
Args:
mean (sequence): Sequence of means for each channel.
std (sequence): Sequence of standard deviations for each channel.
inplace(bool,optional): Bool to make this operation in-place.
"""
    _v1_transform_cls = _transforms.Normalize

    def __init__(self, mean: Sequence[float], std: Sequence[float], inplace: bool = False):
        super().__init__()
        self.mean = list(mean)
        self.std = list(std)
        self.inplace = inplace

    def _check_inputs(self, sample: Any) -> Any:
        if has_any(sample, PIL.Image.Image):
            raise TypeError(f"{type(self).__name__}() does not support PIL images.")

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.normalize, inpt, mean=self.mean, std=self.std, inplace=self.inplace)


class GaussianBlur(Transform):
"""[BETA] Blurs image with randomly chosen Gaussian blur.
.. v2betastatus:: GaussianBlur transform
If the input is a Tensor, it is expected
to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
kernel_size (int or sequence): Size of the Gaussian kernel.
sigma (float or tuple of float (min, max)): Standard deviation to be used for
creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
of float (min, max), sigma is chosen uniformly at random to lie in the
given range.
"""
    _v1_transform_cls = _transforms.GaussianBlur

    def __init__(
        self, kernel_size: Union[int, Sequence[int]], sigma: Union[int, float, Sequence[float]] = (0.1, 2.0)
    ) -> None:
        super().__init__()
        self.kernel_size = _setup_size(kernel_size, "Kernel size should be a tuple/list of two integers")
        for ks in self.kernel_size:
            if ks <= 0 or ks % 2 == 0:
                raise ValueError("Kernel size value should be an odd and positive number.")

        self.sigma = _setup_number_or_seq(sigma, "sigma")
        if not 0.0 < self.sigma[0] <= self.sigma[1]:
            raise ValueError(f"sigma values should be positive and of the form (min, max). Got {self.sigma}")

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        sigma = torch.empty(1).uniform_(self.sigma[0], self.sigma[1]).item()
        return dict(sigma=[sigma, sigma])

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.gaussian_blur, inpt, self.kernel_size, **params)


class ToDtype(Transform):
"""[BETA] Converts the input to a specific dtype, optionally scaling the values for images or videos.
.. v2betastatus:: ToDtype transform
.. note::
``ToDtype(dtype, scale=True)`` is the recommended replacement for ``ConvertImageDtype(dtype)``.
Args:
dtype (``torch.dtype`` or dict of ``TVTensor`` -> ``torch.dtype``): The dtype to convert to.
If a ``torch.dtype`` is passed, e.g. ``torch.float32``, only images and videos will be converted
to that dtype: this is for compatibility with :class:`~torchvision.transforms.v2.ConvertImageDtype`.
A dict can be passed to specify per-tv_tensor conversions, e.g.
``dtype={tv_tensors.Image: torch.float32, tv_tensors.Mask: torch.int64, "others":None}``. The "others"
key can be used as a catch-all for any other tv_tensor type, and ``None`` means no conversion.
scale (bool, optional): Whether to scale the values for images or videos. See :ref:`range_and_dtype`.
Default: ``False``.
"""
    _transformed_types = (torch.Tensor,)

    def __init__(
        self, dtype: Union[torch.dtype, Dict[Union[Type, str], Optional[torch.dtype]]], scale: bool = False
    ) -> None:
        super().__init__()

        if not isinstance(dtype, (dict, torch.dtype)):
            raise ValueError(f"dtype must be a dict or a torch.dtype, got {type(dtype)} instead")

        if (
            isinstance(dtype, dict)
            and torch.Tensor in dtype
            and any(cls in dtype for cls in [tv_tensors.Image, tv_tensors.Video])
        ):
            warnings.warn(
                "Got `dtype` values for `torch.Tensor` and either `tv_tensors.Image` or `tv_tensors.Video`. "
                "Note that a plain `torch.Tensor` will *not* be transformed by this (or any other transformation) "
                "in case a `tv_tensors.Image` or `tv_tensors.Video` is present in the input."
            )
        self.dtype = dtype
        self.scale = scale

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        if isinstance(self.dtype, torch.dtype):
            # For consistency / BC with ConvertImageDtype, we only care about images or videos when dtype
            # is a simple torch.dtype
            if not is_pure_tensor(inpt) and not isinstance(inpt, (tv_tensors.Image, tv_tensors.Video)):
                return inpt

            dtype: Optional[torch.dtype] = self.dtype
        elif type(inpt) in self.dtype:
            dtype = self.dtype[type(inpt)]
        elif "others" in self.dtype:
            dtype = self.dtype["others"]
        else:
            raise ValueError(
                f"No dtype was specified for type {type(inpt)}. "
                "If you only need to convert the dtype of images or videos, you can just pass e.g. dtype=torch.float32. "
                "If you're passing a dict as dtype, "
                'you can use "others" as a catch-all key '
                'e.g. dtype={tv_tensors.Mask: torch.int64, "others": None} to pass-through the rest of the inputs.'
            )

        supports_scaling = is_pure_tensor(inpt) or isinstance(inpt, (tv_tensors.Image, tv_tensors.Video))
        if dtype is None:
            if self.scale and supports_scaling:
                warnings.warn(
                    "scale was set to True but no dtype was specified for images or videos: no scaling will be done."
                )
            return inpt

        return self._call_kernel(F.to_dtype, inpt, dtype=dtype, scale=self.scale)
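
A minimal sketch (not part of the commit) of the per-type dict form described in the docstring above; the inputs are synthetic.

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

img = tv_tensors.Image(torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8))
mask = tv_tensors.Mask(torch.zeros(32, 32, dtype=torch.uint8))

to_float = v2.ToDtype({tv_tensors.Image: torch.float32, "others": None}, scale=True)
img_f, mask_out = to_float(img, mask)  # image becomes float32 scaled to [0, 1]; the mask is passed through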
class ConvertImageDtype(Transform):
"""[BETA] [DEPRECATED] Use ``v2.ToDtype(dtype, scale=True)`` instead.
Convert input image to the given ``dtype`` and scale the values accordingly.
.. v2betastatus:: ConvertImageDtype transform
.. warning::
Consider using ``ToDtype(dtype, scale=True)`` instead. See :class:`~torchvision.transforms.v2.ToDtype`.
This function does not support PIL Image.
Args:
dtype (torch.dtype): Desired data type of the output
.. note::
When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly.
If converted back and forth, this mismatch has no effect.
Raises:
RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
of the integer ``dtype``.
"""
    _v1_transform_cls = _transforms.ConvertImageDtype

    def __init__(self, dtype: torch.dtype = torch.float32) -> None:
        super().__init__()
        self.dtype = dtype

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.to_dtype, inpt, dtype=self.dtype, scale=True)


class SanitizeBoundingBoxes(Transform):
"""[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
.. v2betastatus:: SanitizeBoundingBoxes transform
This transform removes bounding boxes and their associated labels/masks that:
- are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
- have any coordinate outside of their corresponding image. You may want to
call :class:`~torchvision.transforms.v2.ClampBoundingBoxes` first to avoid undesired removals.
It is recommended to call it at the end of a pipeline, before passing the
input to the models. It is critical to call this transform if
:class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
If you want to be extra careful, you may call it after all transforms that
may modify bounding boxes but once at the end should be enough in most
cases.
Args:
min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
labels_getter (callable or str or None, optional): indicates how to identify the labels in the input.
By default, this will try to find a "labels" key in the input (case-insensitive), if
the input is a dict or it is a tuple whose second element is a dict.
This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
It can also be a callable that takes the same input
as the transform, and returns the labels.
"""
    def __init__(
        self,
        min_size: float = 1.0,
        labels_getter: Union[Callable[[Any], Optional[torch.Tensor]], str, None] = "default",
    ) -> None:
        super().__init__()

        if min_size < 1:
            raise ValueError(f"min_size must be >= 1, got {min_size}.")
        self.min_size = min_size

        self.labels_getter = labels_getter
        self._labels_getter = _parse_labels_getter(labels_getter)

    def forward(self, *inputs: Any) -> Any:
        inputs = inputs if len(inputs) > 1 else inputs[0]

        labels = self._labels_getter(inputs)
        if labels is not None and not isinstance(labels, torch.Tensor):
            raise ValueError(f"The labels in the input to forward() must be a tensor or None, got {type(labels)} instead.")

        flat_inputs, spec = tree_flatten(inputs)
        boxes = get_bounding_boxes(flat_inputs)

        if labels is not None and boxes.shape[0] != labels.shape[0]:
            raise ValueError(
                f"Number of boxes (shape={boxes.shape}) and number of labels (shape={labels.shape}) do not match."
            )

        boxes = cast(
            tv_tensors.BoundingBoxes,
            F.convert_bounding_box_format(
                boxes,
                new_format=tv_tensors.BoundingBoxFormat.XYXY,
            ),
        )
        ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]
        valid = (ws >= self.min_size) & (hs >= self.min_size) & (boxes >= 0).all(dim=-1)
        # TODO: Do we really need to check for out of bounds here? All
        # transforms should be clamping anyway, so this should never happen?
        image_h, image_w = boxes.canvas_size
        valid &= (boxes[:, 0] <= image_w) & (boxes[:, 2] <= image_w)
        valid &= (boxes[:, 1] <= image_h) & (boxes[:, 3] <= image_h)

        params = dict(valid=valid.as_subclass(torch.Tensor), labels=labels)
        flat_outputs = [
            # Even though it may look like we're transforming all inputs, we don't:
            # _transform() will only care about BoundingBoxes and the labels
            self._transform(inpt, params)
            for inpt in flat_inputs
        ]

        return tree_unflatten(flat_outputs, spec)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        is_label = inpt is not None and inpt is params["labels"]
        is_bounding_boxes_or_mask = isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.Mask))

        if not (is_label or is_bounding_boxes_or_mask):
            return inpt

        output = inpt[params["valid"]]

        if is_label:
            return output

        return tv_tensors.wrap(output, like=inpt)
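
A minimal sketch (not part of the commit) of the default labels_getter heuristic described above: the labels are found under the "labels" key of the target dict and filtered together with the boxes.

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

img = tv_tensors.Image(torch.zeros(3, 100, 100, dtype=torch.uint8))
target = {
    "boxes": tv_tensors.BoundingBoxes([[0, 0, 0, 0], [10, 10, 50, 60]], format="XYXY", canvas_size=(100, 100)),
    "labels": torch.tensor([7, 3]),
}
img, target = v2.SanitizeBoundingBoxes()(img, target)  # the degenerate first box and its label 7 are removed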
torchvision/transforms/v2/_temporal.py
0 → 100644
from typing import Any, Dict

import torch

from torchvision.transforms.v2 import functional as F, Transform


class UniformTemporalSubsample(Transform):
"""[BETA] Uniformly subsample ``num_samples`` indices from the temporal dimension of the video.
.. v2betastatus:: UniformTemporalSubsample transform
Videos are expected to be of shape ``[..., T, C, H, W]`` where ``T`` denotes the temporal dimension.
When ``num_samples`` is larger than the size of temporal dimension of the video, it
will sample frames based on nearest neighbor interpolation.
Args:
num_samples (int): The number of equispaced samples to be selected
"""
    _transformed_types = (torch.Tensor,)

    def __init__(self, num_samples: int):
        super().__init__()
        self.num_samples = num_samples

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return self._call_kernel(F.uniform_temporal_subsample, inpt, self.num_samples)
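
A minimal sketch (not part of the commit) on a synthetic video tensor.

import torch
from torchvision.transforms import v2

video = torch.randint(0, 256, (32, 3, 112, 112), dtype=torch.uint8)  # [T, C, H, W]
out = v2.UniformTemporalSubsample(num_samples=8)(video)
print(out.shape)  # torch.Size([8, 3, 112, 112])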
torchvision/transforms/v2/_transform.py
0 → 100644
from __future__ import annotations

import enum
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

import PIL.Image
import torch
from torch import nn
from torch.utils._pytree import tree_flatten, tree_unflatten
from torchvision import tv_tensors
from torchvision.transforms.v2._utils import check_type, has_any, is_pure_tensor
from torchvision.utils import _log_api_usage_once

from .functional._utils import _get_kernel


class Transform(nn.Module):

    # Class attribute defining transformed types. Other types are passed-through without any transformation
    # We support both Types and callables that are able to do further checks on the type of the input.
    _transformed_types: Tuple[Union[Type, Callable[[Any], bool]], ...] = (torch.Tensor, PIL.Image.Image)

    def __init__(self) -> None:
        super().__init__()
        _log_api_usage_once(self)

    def _check_inputs(self, flat_inputs: List[Any]) -> None:
        pass

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        return dict()

    def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
        kernel = _get_kernel(functional, type(inpt), allow_passthrough=True)
        return kernel(inpt, *args, **kwargs)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        raise NotImplementedError

    def forward(self, *inputs: Any) -> Any:
        flat_inputs, spec = tree_flatten(inputs if len(inputs) > 1 else inputs[0])

        self._check_inputs(flat_inputs)

        needs_transform_list = self._needs_transform_list(flat_inputs)
        params = self._get_params(
            [inpt for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list) if needs_transform]
        )

        flat_outputs = [
            self._transform(inpt, params) if needs_transform else inpt
            for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list)
        ]

        return tree_unflatten(flat_outputs, spec)

    def _needs_transform_list(self, flat_inputs: List[Any]) -> List[bool]:
        # Below is a heuristic on how to deal with pure tensor inputs:
        # 1. Pure tensors, i.e. tensors that are not a tv_tensor, are passed through if there is an explicit image
        #    (`tv_tensors.Image` or `PIL.Image.Image`) or video (`tv_tensors.Video`) in the sample.
        # 2. If there is no explicit image or video in the sample, only the first encountered pure tensor is
        #    transformed as image, while the rest is passed through. The order is defined by the returned `flat_inputs`
        #    of `tree_flatten`, which recurses depth-first through the input.
        #
        # This heuristic stems from two requirements:
        # 1. We need to keep BC for single input pure tensors and treat them as images.
        # 2. We don't want to treat all pure tensors as images, because some datasets like `CelebA` or `Widerface`
        #    return supplemental numerical data as tensors that cannot be transformed as images.
        #
        # The heuristic should work well for most people in practice. The only case where it doesn't is if someone
        # tries to transform multiple pure tensors at the same time, expecting them all to be treated as images.
        # However, this case wasn't supported by transforms v1 either, so there is no BC concern.

        needs_transform_list = []
        transform_pure_tensor = not has_any(flat_inputs, tv_tensors.Image, tv_tensors.Video, PIL.Image.Image)
        for inpt in flat_inputs:
            needs_transform = True

            if not check_type(inpt, self._transformed_types):
                needs_transform = False
            elif is_pure_tensor(inpt):
                if transform_pure_tensor:
                    transform_pure_tensor = False
                else:
                    needs_transform = False
            needs_transform_list.append(needs_transform)
        return needs_transform_list

    def extra_repr(self) -> str:
        extra = []
        for name, value in self.__dict__.items():
            if name.startswith("_") or name == "training":
                continue

            if not isinstance(value, (bool, int, float, str, tuple, list, enum.Enum)):
                continue

            extra.append(f"{name}={value}")

        return ", ".join(extra)

    # This attribute should be set on all transforms that have a v1 equivalent. Doing so enables two things:
    # 1. In case the v1 transform has a static `get_params` method, it will also be available under the same name on
    #    the v2 transform. See `__init_subclass__` for details.
    # 2. The v2 transform will be JIT scriptable. See `_extract_params_for_v1_transform` and `__prepare_scriptable__`
    #    for details.
    _v1_transform_cls: Optional[Type[nn.Module]] = None

    def __init_subclass__(cls) -> None:
        # Since `get_params` is a `@staticmethod`, we have to bind it to the class itself rather than to an instance.
        # This method is called after subclassing has happened, i.e. `cls` is the subclass, e.g. `Resize`.
        if cls._v1_transform_cls is not None and hasattr(cls._v1_transform_cls, "get_params"):
            cls.get_params = staticmethod(cls._v1_transform_cls.get_params)  # type: ignore[attr-defined]

    def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
        # This method is called by `__prepare_scriptable__` to instantiate the equivalent v1 transform from the current
        # v2 transform instance. It extracts all available public attributes that are specific to that transform and
        # not `nn.Module` in general.
        # Overwrite this method on the v2 transform class if the above is not sufficient. For example, this might happen
        # if the v2 transform introduced new parameters that are not supported by the v1 transform.
        common_attrs = nn.Module().__dict__.keys()
        return {
            attr: value
            for attr, value in self.__dict__.items()
            if not attr.startswith("_") and attr not in common_attrs
        }

    def __prepare_scriptable__(self) -> nn.Module:
        # This method is called early on when `torch.jit.script`'ing an `nn.Module` instance. If it succeeds, the return
        # value is used for scripting over the original object that should have been scripted. Since the v1 transforms
        # are JIT scriptable, and we made sure that for single image inputs v1 and v2 are equivalent, we just return the
        # equivalent v1 transform here. This of course only makes transforms v2 JIT scriptable as long as transforms v1
        # is around.
        if self._v1_transform_cls is None:
            raise RuntimeError(
                f"Transform {type(self).__name__} cannot be JIT scripted. "
                "torchscript is only supported for backward compatibility with transforms "
                "which are already in torchvision.transforms. "
                "For torchscript support (on tensors only), you can use the functional API instead."
            )

        return self._v1_transform_cls(**self._extract_params_for_v1_transform())
class _RandomApplyTransform(Transform):
    def __init__(self, p: float = 0.5) -> None:
        if not (0.0 <= p <= 1.0):
            raise ValueError("`p` should be a floating point value in the interval [0.0, 1.0].")

        super().__init__()
        self.p = p

    def forward(self, *inputs: Any) -> Any:
        # We need to almost duplicate `Transform.forward()` here since we always want to check the inputs, but return
        # early afterwards in case the random check triggers. The same result could be achieved by calling
        # `super().forward()` after the random check, but that would call `self._check_inputs` twice.

        inputs = inputs if len(inputs) > 1 else inputs[0]
        flat_inputs, spec = tree_flatten(inputs)

        self._check_inputs(flat_inputs)

        if torch.rand(1) >= self.p:
            return inputs

        needs_transform_list = self._needs_transform_list(flat_inputs)
        params = self._get_params(
            [inpt for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list) if needs_transform]
        )

        flat_outputs = [
            self._transform(inpt, params) if needs_transform else inpt
            for (inpt, needs_transform) in zip(flat_inputs, needs_transform_list)
        ]

        return tree_unflatten(flat_outputs, spec)
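
A minimal sketch (not part of the commit) of how the Transform base class above is meant to be extended: override _get_params() to sample once per call, and _transform() to apply the sampled parameters to each eligible input. RandomMultiply is a hypothetical example, not a torchvision transform.

from typing import Any, Dict, List

import torch
from torchvision.transforms.v2 import Transform


class RandomMultiply(Transform):
    # Hypothetical transform: multiplies every transformed input by the same random factor.
    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        return dict(factor=float(torch.empty(1).uniform_(0.8, 1.2)))

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return inpt * params["factor"]


out = RandomMultiply()(torch.rand(3, 8, 8))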
torchvision/transforms/v2/_type_conversion.py
0 → 100644
from typing import Any, Dict, Optional, Union

import numpy as np
import PIL.Image
import torch

from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F, Transform

from torchvision.transforms.v2._utils import is_pure_tensor


class PILToTensor(Transform):
"""[BETA] Convert a PIL Image to a tensor of the same type - this does not scale values.
.. v2betastatus:: PILToTensor transform
This transform does not support torchscript.
Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W).
"""
    _transformed_types = (PIL.Image.Image,)

    def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor:
        return F.pil_to_tensor(inpt)


class ToImage(Transform):
"""[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.tv_tensors.Image`
; this does not scale values.
.. v2betastatus:: ToImage transform
This transform does not support torchscript.
"""
    _transformed_types = (is_pure_tensor, PIL.Image.Image, np.ndarray)

    def _transform(
        self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: Dict[str, Any]
    ) -> tv_tensors.Image:
        return F.to_image(inpt)


class ToPILImage(Transform):
"""[BETA] Convert a tensor or an ndarray to PIL Image
.. v2betastatus:: ToPILImage transform
This transform does not support torchscript.
Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape
H x W x C to a PIL Image while adjusting the value range depending on the ``mode``.
Args:
mode (`PIL.Image mode`_): color space and pixel depth of input data (optional).
If ``mode`` is ``None`` (default) there are some assumptions made about the input data:
- If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``.
- If the input has 3 channels, the ``mode`` is assumed to be ``RGB``.
- If the input has 2 channels, the ``mode`` is assumed to be ``LA``.
- If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``,
``short``).
.. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
"""
    _transformed_types = (is_pure_tensor, tv_tensors.Image, np.ndarray)

    def __init__(self, mode: Optional[str] = None) -> None:
        super().__init__()
        self.mode = mode

    def _transform(
        self, inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray], params: Dict[str, Any]
    ) -> PIL.Image.Image:
        return F.to_pil_image(inpt, mode=self.mode)


class ToPureTensor(Transform):
"""[BETA] Convert all tv_tensors to pure tensors, removing associated metadata (if any).
.. v2betastatus:: ToPureTensor transform
This doesn't scale or change the values, only the type.
"""
    _transformed_types = (tv_tensors.TVTensor,)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> torch.Tensor:
        return inpt.as_subclass(torch.Tensor)
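
A minimal sketch (not part of the commit) chaining the conversions in this file; the ndarray is synthetic.

import numpy as np
from torchvision.transforms import v2

arr = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)  # HWC ndarray
img = v2.ToImage()(arr)          # tv_tensors.Image, CHW, uint8, values unchanged
pil = v2.ToPILImage()(img)       # back to a PIL.Image
plain = v2.ToPureTensor()(img)   # plain torch.Tensor with the tv_tensor metadata dropped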
torchvision/transforms/v2/_utils.py
0 → 100644
from __future__ import annotations

import collections.abc
import numbers
from contextlib import suppress
from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Type, Union

import PIL.Image
import torch

from torchvision import tv_tensors
from torchvision._utils import sequence_to_str

from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size  # noqa: F401
from torchvision.transforms.v2.functional import get_dimensions, get_size, is_pure_tensor
from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT


def _setup_number_or_seq(arg: Union[int, float, Sequence[Union[int, float]]], name: str) -> Sequence[float]:
    if not isinstance(arg, (int, float, Sequence)):
        raise TypeError(f"{name} should be a number or a sequence of numbers. Got {type(arg)}")
    if isinstance(arg, Sequence) and len(arg) not in (1, 2):
        raise ValueError(f"If {name} is a sequence its length should be 1 or 2. Got {len(arg)}")
    if isinstance(arg, Sequence):
        for element in arg:
            if not isinstance(element, (int, float)):
                raise ValueError(f"{name} should be a sequence of numbers. Got {type(element)}")

    if isinstance(arg, (int, float)):
        arg = [float(arg), float(arg)]
    elif isinstance(arg, Sequence):
        if len(arg) == 1:
            arg = [float(arg[0]), float(arg[0])]
        else:
            arg = [float(arg[0]), float(arg[1])]
    return arg


def _check_fill_arg(fill: Union[_FillType, Dict[Union[Type, str], _FillType]]) -> None:
    if isinstance(fill, dict):
        for value in fill.values():
            _check_fill_arg(value)
    else:
        if fill is not None and not isinstance(fill, (numbers.Number, tuple, list)):
            raise TypeError("Got inappropriate fill arg, only Numbers, tuples, lists and dicts are allowed.")


def _convert_fill_arg(fill: _FillType) -> _FillTypeJIT:
    # Fill = 0 is not equivalent to None, https://github.com/pytorch/vision/issues/6517
    # So, we can't reassign fill to 0
    # if fill is None:
    #     fill = 0
    if fill is None:
        return fill

    if not isinstance(fill, (int, float)):
        fill = [float(v) for v in list(fill)]
    return fill  # type: ignore[return-value]


def _setup_fill_arg(fill: Union[_FillType, Dict[Union[Type, str], _FillType]]) -> Dict[Union[Type, str], _FillTypeJIT]:
    _check_fill_arg(fill)

    if isinstance(fill, dict):
        for k, v in fill.items():
            fill[k] = _convert_fill_arg(v)
        return fill  # type: ignore[return-value]
    else:
        return {"others": _convert_fill_arg(fill)}


def _get_fill(fill_dict, inpt_type):
    if inpt_type in fill_dict:
        return fill_dict[inpt_type]
    elif "others" in fill_dict:
        return fill_dict["others"]
    else:
        RuntimeError("This should never happen, please open an issue on the torchvision repo if you hit this.")


def _check_padding_arg(padding: Union[int, Sequence[int]]) -> None:
    if not isinstance(padding, (numbers.Number, tuple, list)):
        raise TypeError("Got inappropriate padding arg")

    if isinstance(padding, (tuple, list)) and len(padding) not in [1, 2, 4]:
        raise ValueError(f"Padding must be an int or a 1, 2, or 4 element tuple, not a {len(padding)} element tuple")


# TODO: let's use torchvision._utils.StrEnum to have the best of both worlds (strings and enums)
# https://github.com/pytorch/vision/issues/6250
def _check_padding_mode_arg(padding_mode: Literal["constant", "edge", "reflect", "symmetric"]) -> None:
    if padding_mode not in ["constant", "edge", "reflect", "symmetric"]:
        raise ValueError("Padding mode should be either constant, edge, reflect or symmetric")


def _find_labels_default_heuristic(inputs: Any) -> torch.Tensor:
"""
This heuristic covers three cases:
1. The input is tuple or list whose second item is a labels tensor. This happens for already batched
classification inputs for MixUp and CutMix (typically after the Dataloder).
2. The input is a tuple or list whose second item is a dictionary that contains the labels tensor
under a label-like (see below) key. This happens for the inputs of detection models.
3. The input is a dictionary that is structured as the one from 2.
What is a "label-like" key? We first search for a case-insensitive match of 'labels' inside the keys of the
dictionary. This is the name our detection models expect. If we can't find that, we look for a case-insensitive
match of the term 'label' anywhere inside the key, i.e. 'FooLaBeLBar'. If we can't find that either, the dictionary
contains no "label-like" key.
"""
    if isinstance(inputs, (tuple, list)):
        inputs = inputs[1]

    # MixUp, CutMix
    if is_pure_tensor(inputs):
        return inputs

    if not isinstance(inputs, collections.abc.Mapping):
        raise ValueError(
            f"When using the default labels_getter, the input passed to forward must be a dictionary or a two-tuple "
            f"whose second item is a dictionary or a tensor, but got {inputs} instead."
        )

    candidate_key = None
    with suppress(StopIteration):
        candidate_key = next(key for key in inputs.keys() if key.lower() == "labels")
    if candidate_key is None:
        with suppress(StopIteration):
            candidate_key = next(key for key in inputs.keys() if "label" in key.lower())
    if candidate_key is None:
        raise ValueError(
            "Could not infer where the labels are in the sample. Try passing a callable as the labels_getter parameter? "
            "If there are no labels in the sample by design, pass labels_getter=None."
        )

    return inputs[candidate_key]


def _parse_labels_getter(
    labels_getter: Union[str, Callable[[Any], Optional[torch.Tensor]], None]
) -> Callable[[Any], Optional[torch.Tensor]]:
    if labels_getter == "default":
        return _find_labels_default_heuristic
    elif callable(labels_getter):
        return labels_getter
    elif labels_getter is None:
        return lambda _: None
    else:
        raise ValueError(f"labels_getter should either be 'default', a callable, or None, but got {labels_getter}.")


def get_bounding_boxes(flat_inputs: List[Any]) -> tv_tensors.BoundingBoxes:
    # This assumes there is only one bbox per sample as per the general convention
    try:
        return next(inpt for inpt in flat_inputs if isinstance(inpt, tv_tensors.BoundingBoxes))
    except StopIteration:
        raise ValueError("No bounding boxes were found in the sample")


def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]:
    chws = {
        tuple(get_dimensions(inpt))
        for inpt in flat_inputs
        if check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video))
    }
    if not chws:
        raise TypeError("No image or video was found in the sample")
    elif len(chws) > 1:
        raise ValueError(f"Found multiple CxHxW dimensions in the sample: {sequence_to_str(sorted(chws))}")
    c, h, w = chws.pop()
    return c, h, w


def query_size(flat_inputs: List[Any]) -> Tuple[int, int]:
    sizes = {
        tuple(get_size(inpt))
        for inpt in flat_inputs
        if check_type(
            inpt,
            (
                is_pure_tensor,
                tv_tensors.Image,
                PIL.Image.Image,
                tv_tensors.Video,
                tv_tensors.Mask,
                tv_tensors.BoundingBoxes,
            ),
        )
    }
    if not sizes:
        raise TypeError("No image, video, mask or bounding box was found in the sample")
    elif len(sizes) > 1:
        raise ValueError(f"Found multiple HxW dimensions in the sample: {sequence_to_str(sorted(sizes))}")
    h, w = sizes.pop()
    return h, w


def check_type(obj: Any, types_or_checks: Tuple[Union[Type, Callable[[Any], bool]], ...]) -> bool:
    for type_or_check in types_or_checks:
        if isinstance(obj, type_or_check) if isinstance(type_or_check, type) else type_or_check(obj):
            return True
    return False


def has_any(flat_inputs: List[Any], *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool:
    for inpt in flat_inputs:
        if check_type(inpt, types_or_checks):
            return True
    return False


def has_all(flat_inputs: List[Any], *types_or_checks: Union[Type, Callable[[Any], bool]]) -> bool:
    for type_or_check in types_or_checks:
        for inpt in flat_inputs:
            if isinstance(inpt, type_or_check) if isinstance(type_or_check, type) else type_or_check(inpt):
                break
        else:
            return False
    return True
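
A minimal sketch (not part of the commit) of the sample-introspection helpers defined above. Note that torchvision.transforms.v2._utils is a private module, so these imports are shown for illustration only and may change between releases.

import PIL.Image
import torch
from torchvision import tv_tensors
from torchvision.transforms.v2._utils import has_all, has_any, query_size

sample = [tv_tensors.Image(torch.zeros(3, 10, 20)), tv_tensors.Mask(torch.zeros(10, 20))]
print(has_any(sample, PIL.Image.Image))                    # False: no PIL image in the sample
print(has_all(sample, tv_tensors.Image, tv_tensors.Mask))  # True: both types are present
print(query_size(sample))                                  # (10, 20)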
torchvision/transforms/v2/functional/__init__.py
0 → 100644
from torchvision.transforms import InterpolationMode  # usort: skip

from ._utils import is_pure_tensor, register_kernel  # usort: skip

from ._meta import (
    clamp_bounding_boxes, convert_bounding_box_format,
    get_dimensions_image, _get_dimensions_image_pil, get_dimensions_video, get_dimensions,
    get_num_frames_video, get_num_frames,
    get_image_num_channels, get_num_channels_image, _get_num_channels_image_pil, get_num_channels_video, get_num_channels,
    get_size_bounding_boxes, get_size_image, _get_size_image_pil, get_size_mask, get_size_video, get_size,
)  # usort: skip

from ._augment import _erase_image_pil, erase, erase_image, erase_video
from ._color import (
    _adjust_brightness_image_pil, _adjust_contrast_image_pil, _adjust_gamma_image_pil, _adjust_hue_image_pil,
    _adjust_saturation_image_pil, _adjust_sharpness_image_pil, _autocontrast_image_pil, _equalize_image_pil,
    _invert_image_pil, _permute_channels_image_pil, _posterize_image_pil, _rgb_to_grayscale_image_pil,
    _solarize_image_pil,
    adjust_brightness, adjust_brightness_image, adjust_brightness_video,
    adjust_contrast, adjust_contrast_image, adjust_contrast_video,
    adjust_gamma, adjust_gamma_image, adjust_gamma_video,
    adjust_hue, adjust_hue_image, adjust_hue_video,
    adjust_saturation, adjust_saturation_image, adjust_saturation_video,
    adjust_sharpness, adjust_sharpness_image, adjust_sharpness_video,
    autocontrast, autocontrast_image, autocontrast_video,
    equalize, equalize_image, equalize_video,
    invert, invert_image, invert_video,
    permute_channels, permute_channels_image, permute_channels_video,
    posterize, posterize_image, posterize_video,
    rgb_to_grayscale, rgb_to_grayscale_image,
    solarize, solarize_image, solarize_video,
    to_grayscale,
)
from ._geometry import (
    _affine_image_pil, _center_crop_image_pil, _crop_image_pil, _elastic_image_pil, _five_crop_image_pil,
    _horizontal_flip_image_pil, _pad_image_pil, _perspective_image_pil, _resize_image_pil, _resized_crop_image_pil,
    _rotate_image_pil, _ten_crop_image_pil, _vertical_flip_image_pil,
    affine, affine_bounding_boxes, affine_image, affine_mask, affine_video,
    center_crop, center_crop_bounding_boxes, center_crop_image, center_crop_mask, center_crop_video,
    crop, crop_bounding_boxes, crop_image, crop_mask, crop_video,
    elastic, elastic_bounding_boxes, elastic_image, elastic_mask, elastic_transform, elastic_video,
    five_crop, five_crop_image, five_crop_video,
    hflip,
    # TODO: Consider moving all pure alias definitions at the bottom of the file
    horizontal_flip, horizontal_flip_bounding_boxes, horizontal_flip_image, horizontal_flip_mask, horizontal_flip_video,
    pad, pad_bounding_boxes, pad_image, pad_mask, pad_video,
    perspective, perspective_bounding_boxes, perspective_image, perspective_mask, perspective_video,
    resize, resize_bounding_boxes, resize_image, resize_mask, resize_video,
    resized_crop, resized_crop_bounding_boxes, resized_crop_image, resized_crop_mask, resized_crop_video,
    rotate, rotate_bounding_boxes, rotate_image, rotate_mask, rotate_video,
    ten_crop, ten_crop_image, ten_crop_video,
    vertical_flip, vertical_flip_bounding_boxes, vertical_flip_image, vertical_flip_mask, vertical_flip_video,
    vflip,
)
from ._misc import (
    _gaussian_blur_image_pil, convert_image_dtype,
    gaussian_blur, gaussian_blur_image, gaussian_blur_video,
    normalize, normalize_image, normalize_video,
    to_dtype, to_dtype_image, to_dtype_video,
)
from ._temporal import uniform_temporal_subsample, uniform_temporal_subsample_video
from ._type_conversion import pil_to_tensor, to_image, to_pil_image

from ._deprecated import get_image_size, to_tensor  # usort: skip
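
A minimal sketch (not part of the commit) of the functional API re-exported by this module, applied to a synthetic image.

import torch
from torchvision.transforms.v2 import functional as F

img = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)
img = F.resize(img, size=[32], antialias=True)           # smaller edge -> 32
img = F.horizontal_flip(img)
img = F.to_dtype(img, dtype=torch.float32, scale=True)   # float32, scaled to [0, 1]
print(F.get_size(img))  # [32, 32]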