Unverified Commit 928b05ca authored by Nicolas Hug's avatar Nicolas Hug Committed by GitHub
Browse files

Added docs for v2 transforms (part 1) (#7297)


Co-authored-by: default avatarvfdev <vfdev.5@gmail.com>
Co-authored-by: default avatarPhilip Meier <github.pmeier@posteo.de>
parent d03b776a
......@@ -33,6 +33,8 @@ from tabulate import tabulate
sys.path.append(os.path.abspath("."))
torchvision.disable_beta_transforms_warning()
# -- General configuration ------------------------------------------------
# Required version of sphinx is set from docs/requirements.txt
......
......@@ -98,17 +98,29 @@ Geometry
:template: class.rst
Resize
v2.Resize
RandomCrop
v2.RandomCrop
RandomResizedCrop
v2.RandomResizedCrop
CenterCrop
v2.CenterCrop
FiveCrop
v2.FiveCrop
TenCrop
v2.TenCrop
Pad
v2.Pad
RandomAffine
v2.RandomAffine
RandomPerspective
v2.RandomPerspective
RandomRotation
v2.RandomRotation
RandomHorizontalFlip
v2.RandomHorizontalFlip
RandomVerticalFlip
v2.RandomVerticalFlip
Color
-----
......@@ -118,15 +130,25 @@ Color
:template: class.rst
ColorJitter
v2.ColorJitter
Grayscale
v2.Grayscale
RandomGrayscale
v2.RandomGrayscale
GaussianBlur
v2.GaussianBlur
RandomInvert
v2.RandomInvert
RandomPosterize
v2.RandomPosterize
RandomSolarize
v2.RandomSolarize
RandomAdjustSharpness
v2.RandomAdjustSharpness
RandomAutocontrast
v2.RandomAutocontrast
RandomEqualize
v2.RandomEqualize
Composition
-----------
......@@ -136,9 +158,13 @@ Composition
:template: class.rst
Compose
v2.Compose
RandomApply
v2.RandomApply
RandomChoice
v2.RandomChoice
RandomOrder
v2.RandomOrder
Miscellaneous
-------------
......@@ -148,9 +174,13 @@ Miscellaneous
:template: class.rst
LinearTransformation
v2.LinearTransformation
Normalize
v2.Normalize
RandomErasing
v2.RandomErasing
Lambda
v2.Lambda
.. _conversion_transforms:
......@@ -162,9 +192,15 @@ Conversion
:template: class.rst
ToPILImage
v2.ToPILImage
v2.ToImagePIL
ToTensor
v2.ToTensor
PILToTensor
v2.PILToTensor
ConvertImageDtype
v2.ConvertImageDtype
v2.ConvertDtype
Auto-Augmentation
-----------------
......@@ -181,9 +217,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran
AutoAugmentPolicy
AutoAugment
v2.AutoAugment
RandAugment
v2.RandAugment
TrivialAugmentWide
v2.TrivialAugmentWide
AugMix
v2.AugMix
.. _functional_transforms:
......
......@@ -13,6 +13,38 @@ from .utils import is_simple_tensor, query_chw
class RandomErasing(_RandomApplyTransform):
"""[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels.
.. betastatus:: RandomErasing transform
This transform does not support PIL Image.
'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896
Args:
p: probability that the random erasing operation will be performed.
scale: range of proportion of erased area against input image.
ratio: range of aspect ratio of erased area.
value: erasing value. Default is 0. If a single int, it is used to
erase all pixels. If a tuple of length 3, it is used to erase
R, G, B channels respectively.
If a str of 'random', erasing each pixel with random values.
inplace: boolean to make this transform inplace. Default set to False.
Returns:
Erased input.
Example:
>>> from torchvision.transforms import v2 as transforms
>>>
>>> transform = transforms.Compose([
>>> transforms.RandomHorizontalFlip(),
>>> transforms.PILToTensor(),
>>> transforms.ConvertImageDtype(torch.float),
>>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
>>> transforms.RandomErasing(),
>>> ])
"""
_v1_transform_cls = _transforms.RandomErasing
def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
......
......@@ -162,6 +162,24 @@ class _AutoAugmentBase(Transform):
class AutoAugment(_AutoAugmentBase):
r"""[BETA] AutoAugment data augmentation method based on
`"AutoAugment: Learning Augmentation Strategies from Data" <https://arxiv.org/pdf/1805.09501.pdf>`_.
.. betastatus:: AutoAugment transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
policy (AutoAugmentPolicy): Desired policy enum defined by
:class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.AutoAugment
_AUGMENTATION_SPACE = {
......@@ -318,6 +336,27 @@ class AutoAugment(_AutoAugmentBase):
class RandAugment(_AutoAugmentBase):
r"""[BETA] RandAugment data augmentation method based on
`"RandAugment: Practical automated data augmentation with a reduced search space"
<https://arxiv.org/abs/1909.13719>`_.
.. betastatus:: RandAugment transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
num_ops (int): Number of augmentation transformations to apply sequentially.
magnitude (int): Magnitude for all the transformations.
num_magnitude_bins (int): The number of different magnitude values.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.RandAugment
_AUGMENTATION_SPACE = {
"Identity": (lambda num_bins, height, width: None, False),
......@@ -379,6 +418,24 @@ class RandAugment(_AutoAugmentBase):
class TrivialAugmentWide(_AutoAugmentBase):
r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in
`"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" <https://arxiv.org/abs/2103.10158>`_.
.. betastatus:: TrivialAugmentWide transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
num_magnitude_bins (int): The number of different magnitude values.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.TrivialAugmentWide
_AUGMENTATION_SPACE = {
"Identity": (lambda num_bins, height, width: None, False),
......@@ -430,6 +487,29 @@ class TrivialAugmentWide(_AutoAugmentBase):
class AugMix(_AutoAugmentBase):
r"""[BETA] AugMix data augmentation method based on
`"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" <https://arxiv.org/abs/1912.02781>`_.
.. betastatus:: AugMix transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
severity (int): The severity of base augmentation operators. Default is ``3``.
mixture_width (int): The number of augmentation chains. Default is ``3``.
chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3].
Default is ``-1``.
alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``.
all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.AugMix
_PARTIAL_AUGMENTATION_SPACE = {
......
......@@ -11,6 +11,23 @@ from .utils import is_simple_tensor, query_chw
class Grayscale(Transform):
"""[BETA] Convert images or videos to grayscale.
.. betastatus:: Grayscale transform
If the image is torch Tensor, it is expected
to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
Args:
num_output_channels (int): (1 or 3) number of channels desired for output image
Returns:
PIL Image: Grayscale version of the input.
- If ``num_output_channels == 1`` : returned image is single channel
- If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b
"""
_v1_transform_cls = _transforms.Grayscale
_transformed_types = (
......@@ -29,6 +46,24 @@ class Grayscale(Transform):
class RandomGrayscale(_RandomApplyTransform):
"""[BETA] Randomly convert image to grayscale with a probability of p (default 0.1).
.. betastatus:: RandomGrayscale transform
If the image is torch Tensor, it is expected
to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
Args:
p (float): probability that image should be converted to grayscale.
Returns:
PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged
with probability (1-p).
- If input image is 1 channel: grayscale version is 1 channel
- If input image is 3 channel: grayscale version is 3 channel with r == g == b
"""
_v1_transform_cls = _transforms.RandomGrayscale
_transformed_types = (
......@@ -50,6 +85,32 @@ class RandomGrayscale(_RandomApplyTransform):
class ColorJitter(Transform):
"""[BETA] Randomly change the brightness, contrast, saturation and hue of an image.
.. betastatus:: ColorJitter transform
If the image is torch Tensor, it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
Args:
brightness (float or tuple of float (min, max)): How much to jitter brightness.
brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
or the given [min, max]. Should be non negative numbers.
contrast (float or tuple of float (min, max)): How much to jitter contrast.
contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
or the given [min, max]. Should be non-negative numbers.
saturation (float or tuple of float (min, max)): How much to jitter saturation.
saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
or the given [min, max]. Should be non negative numbers.
hue (float or tuple of float (min, max)): How much to jitter hue.
hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space;
thus it does not work if you normalize your image to an interval with negative values,
or use an interpolation that generates negative values before using this function.
"""
_v1_transform_cls = _transforms.ColorJitter
def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
......@@ -205,6 +266,18 @@ class RandomPhotometricDistort(Transform):
class RandomEqualize(_RandomApplyTransform):
"""[BETA] Equalize the histogram of the given image randomly with a given probability.
.. betastatus:: RandomEqualize transform
If the image is torch Tensor, it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "P", "L" or "RGB".
Args:
p (float): probability of the image being equalized. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomEqualize
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
......@@ -212,6 +285,18 @@ class RandomEqualize(_RandomApplyTransform):
class RandomInvert(_RandomApplyTransform):
"""[BETA] Inverts the colors of the given image randomly with a given probability.
.. betastatus:: RandomInvert transform
If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format,
where ... means it can have an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
p (float): probability of the image being color inverted. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomInvert
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
......@@ -219,6 +304,20 @@ class RandomInvert(_RandomApplyTransform):
class RandomPosterize(_RandomApplyTransform):
"""[BETA] Posterize the image randomly with a given probability by reducing the
number of bits for each color channel.
.. betastatus:: RandomPosterize transform
If the image is torch Tensor, it should be of type torch.uint8,
and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
bits (int): number of bits to keep for each channel (0-8)
p (float): probability of the image being posterized. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomPosterize
def __init__(self, bits: int, p: float = 0.5) -> None:
......@@ -230,6 +329,20 @@ class RandomPosterize(_RandomApplyTransform):
class RandomSolarize(_RandomApplyTransform):
"""[BETA] Solarize the image randomly with a given probability by inverting all pixel
values above a threshold.
.. betastatus:: RandomSolarize transform
If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format,
where ... means it can have an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
threshold (float): all pixels equal or above this value are inverted.
p (float): probability of the image being solarized. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomSolarize
def __init__(self, threshold: float, p: float = 0.5) -> None:
......@@ -241,6 +354,18 @@ class RandomSolarize(_RandomApplyTransform):
class RandomAutocontrast(_RandomApplyTransform):
"""[BETA] Autocontrast the pixels of the given image randomly with a given probability.
.. betastatus:: RandomAutocontrast transform
If the image is torch Tensor, it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
p (float): probability of the image being autocontrasted. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomAutocontrast
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
......@@ -248,6 +373,20 @@ class RandomAutocontrast(_RandomApplyTransform):
class RandomAdjustSharpness(_RandomApplyTransform):
"""[BETA] Adjust the sharpness of the image randomly with a given probability.
.. betastatus:: RandomAdjustSharpness transform
If the image is torch Tensor,
it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
sharpness_factor (float): How much to adjust the sharpness. Can be
any non-negative number. 0 gives a blurred image, 1 gives the
original image while 2 increases the sharpness by a factor of 2.
p (float): probability of the image being sharpened. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomAdjustSharpness
def __init__(self, sharpness_factor: float, p: float = 0.5) -> None:
......
......@@ -9,6 +9,37 @@ from torchvision.transforms.v2 import Transform
class Compose(Transform):
"""[BETA] Composes several transforms together.
.. betastatus:: Compose transform
This transform does not support torchscript.
Please, see the note below.
Args:
transforms (list of ``Transform`` objects): list of transforms to compose.
Example:
>>> transforms.Compose([
>>> transforms.CenterCrop(10),
>>> transforms.PILToTensor(),
>>> transforms.ConvertImageDtype(torch.float),
>>> ])
.. note::
In order to script the transformations, please use ``torch.nn.Sequential`` as below.
>>> transforms = torch.nn.Sequential(
>>> transforms.CenterCrop(10),
>>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
>>> )
>>> scripted_transforms = torch.jit.script(transforms)
Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require
`lambda` functions or ``PIL.Image``.
"""
def __init__(self, transforms: Sequence[Callable]) -> None:
super().__init__()
if not isinstance(transforms, Sequence):
......@@ -29,6 +60,27 @@ class Compose(Transform):
class RandomApply(Transform):
"""[BETA] Apply randomly a list of transformations with a given probability.
.. betastatus:: RandomApply transform
.. note::
In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of
transforms as shown below:
>>> transforms = transforms.RandomApply(torch.nn.ModuleList([
>>> transforms.ColorJitter(),
>>> ]), p=0.3)
>>> scripted_transforms = torch.jit.script(transforms)
Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require
`lambda` functions or ``PIL.Image``.
Args:
transforms (sequence or torch.nn.Module): list of transformations
p (float): probability
"""
_v1_transform_cls = _transforms.RandomApply
def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None:
......@@ -63,6 +115,12 @@ class RandomApply(Transform):
class RandomChoice(Transform):
"""[BETA] Apply single transformation randomly picked from a list.
.. betastatus:: RandomChoice transform
This transform does not support torchscript."""
def __init__(
self,
transforms: Sequence[Callable],
......@@ -99,6 +157,13 @@ class RandomChoice(Transform):
class RandomOrder(Transform):
"""[BETA] Apply a list of transformations in a random order.
.. betastatus:: RandomOrder transform
This transform does not support torchscript.
"""
def __init__(self, transforms: Sequence[Callable]) -> None:
if not isinstance(transforms, Sequence):
raise TypeError("Argument transforms should be a sequence of callables")
......
......@@ -10,6 +10,31 @@ from torchvision.transforms.v2 import Transform
class ToTensor(Transform):
"""[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
.. betastatus:: ToTensor transform
.. warning::
:class:`v2.ToTensor` is deprecated and will be removed in a future release.
Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``.
This transform does not support torchscript.
Converts a PIL Image or numpy.ndarray (H x W x C) in the range
[0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]
if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
or if the numpy.ndarray has dtype = np.uint8
In the other cases, tensors are returned without scaling.
.. note::
Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
transforming target image masks. See the `references`_ for implementing the transforms for image masks.
.. _references: https://github.com/pytorch/vision/tree/main/references/segmentation
"""
_transformed_types = (PIL.Image.Image, np.ndarray)
def __init__(self) -> None:
......
......@@ -26,6 +26,18 @@ from .utils import has_all, has_any, is_simple_tensor, query_bounding_box, query
class RandomHorizontalFlip(_RandomApplyTransform):
"""[BETA] Horizontally flip the given image/box/mask randomly with a given probability.
.. betastatus:: RandomHorizontalFlip transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions
Args:
p (float): probability of the image being flipped. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomHorizontalFlip
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
......@@ -33,6 +45,18 @@ class RandomHorizontalFlip(_RandomApplyTransform):
class RandomVerticalFlip(_RandomApplyTransform):
"""[BETA] Vertically flip the given image/box/mask randomly with a given probability.
.. betastatus:: RandomVerticalFlip transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions
Args:
p (float): probability of the image being flipped. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomVerticalFlip
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
......@@ -40,6 +64,62 @@ class RandomVerticalFlip(_RandomApplyTransform):
class Resize(Transform):
"""[BETA] Resize the input image/box/mask to the given size.
.. betastatus:: Resize transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
.. warning::
The output image might be different depending on its type: when downsampling, the interpolation of PIL images
and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences
in the performance of a network. Therefore, it is preferable to train and serve a model with the same input
types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors
closer.
Args:
size (sequence or int): Desired output size. If size is a sequence like
(h, w), output size will be matched to this. If size is an int,
smaller edge of the image will be matched to this number.
i.e, if height > width, then image will be rescaled to
(size * height / width, size).
.. note::
In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
max_size (int, optional): The maximum allowed for the longer edge of
the resized image: if the longer edge of the image is greater
than ``max_size`` after being resized according to ``size``, then
the image is resized again so that the longer edge is equal to
``max_size``. As a result, ``size`` might be overruled, i.e. the
smaller edge may be shorter than ``size``. This is only supported
if ``size`` is an int (or a sequence of length 1 in torchscript
mode).
antialias (bool, optional): Whether to apply antialiasing.
It only affects **tensors** with bilinear or bicubic modes and it is
ignored otherwise: on PIL images, antialiasing is always applied on
bilinear or bicubic modes; on other modes (for PIL images and
tensors), antialiasing makes no sense and this parameter is ignored.
Possible values are:
- ``True``: will apply antialiasing for bilinear or bicubic modes.
Other mode aren't affected. This is probably what you want to use.
- ``False``: will not apply antialiasing for tensors on any mode. PIL
images are still antialiased on bilinear or bicubic modes, because
PIL doesn't support no antialias.
- ``None``: equivalent to ``False`` for tensors and ``True`` for
PIL images. This value exists for legacy reasons and you probably
don't want to use it unless you really know what you are doing.
The current default is ``None`` **but will change to** ``True`` **in
v0.17** for the PIL and Tensor backends to be consistent.
"""
_v1_transform_cls = _transforms.Resize
def __init__(
......@@ -76,6 +156,20 @@ class Resize(Transform):
class CenterCrop(Transform):
"""[BETA] Crops the given image/box/mask at the center.
.. betastatus:: CenterCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
Args:
size (sequence or int): Desired output size of the crop. If size is an
int instead of sequence like (h, w), a square crop (size, size) is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
"""
_v1_transform_cls = _transforms.CenterCrop
def __init__(self, size: Union[int, Sequence[int]]):
......@@ -87,6 +181,53 @@ class CenterCrop(Transform):
class RandomResizedCrop(Transform):
"""[BETA] Crop a random portion of image/box/mask and resize it to a given size.
.. betastatus:: RandomResizedCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions
A crop of the original image is made: the crop has a random area (H * W)
and a random aspect ratio. This crop is finally resized to the given
size. This is popularly used to train the Inception networks.
Args:
size (int or sequence): expected output size of the crop, for each edge. If size is an
int instead of sequence like (h, w), a square output size ``(size, size)`` is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
.. note::
In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop,
before resizing. The scale is defined with respect to the area of the original image.
ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before
resizing.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
antialias (bool, optional): Whether to apply antialiasing.
It only affects **tensors** with bilinear or bicubic modes and it is
ignored otherwise: on PIL images, antialiasing is always applied on
bilinear or bicubic modes; on other modes (for PIL images and
tensors), antialiasing makes no sense and this parameter is ignored.
Possible values are:
- ``True``: will apply antialiasing for bilinear or bicubic modes.
Other mode aren't affected. This is probably what you want to use.
- ``False``: will not apply antialiasing for tensors on any mode. PIL
images are still antialiased on bilinear or bicubic modes, because
PIL doesn't support no antialias.
- ``None``: equivalent to ``False`` for tensors and ``True`` for
PIL images. This value exists for legacy reasons and you probably
don't want to use it unless you really know what you are doing.
The current default is ``None`` **but will change to** ``True`` **in
v0.17** for the PIL and Tensor backends to be consistent.
"""
_v1_transform_cls = _transforms.RandomResizedCrop
def __init__(
......@@ -164,7 +305,24 @@ ImageOrVideoTypeJIT = Union[datapoints._ImageTypeJIT, datapoints._VideoTypeJIT]
class FiveCrop(Transform):
"""
"""[BETA] Crop the given image/box/mask into four corners and the central crop.
.. betastatus:: FiveCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions
.. Note::
This transform returns a tuple of images and there may be a mismatch in the number of
inputs and targets your Dataset returns. See below for an example of how to deal with
this.
Args:
size (sequence or int): Desired output size of the crop. If size is an ``int``
instead of sequence like (h, w), a square crop of size (size, size) is made.
If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
Example:
>>> class BatchMultiCrop(transforms.Transform):
... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]):
......@@ -209,8 +367,27 @@ class FiveCrop(Transform):
class TenCrop(Transform):
"""
"""[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of
these (horizontal flipping is used by default).
.. betastatus:: TenCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions.
See :class:`~torchvision.transforms.v2.FiveCrop` for an example.
.. Note::
This transform returns a tuple of images and there may be a mismatch in the number of
inputs and targets your Dataset returns. See below for an example of how to deal with
this.
Args:
size (sequence or int): Desired output size of the crop. If size is an
int instead of sequence like (h, w), a square crop (size, size) is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
vertical_flip (bool): Use vertical flipping instead of horizontal
"""
_v1_transform_cls = _transforms.TenCrop
......@@ -249,6 +426,46 @@ class TenCrop(Transform):
class Pad(Transform):
"""[BETA] Pad the given image/box/mask on all sides with the given "pad" value.
.. betastatus:: Pad transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric,
at most 3 leading dimensions for mode edge,
and an arbitrary number of leading dimensions for mode constant
Args:
padding (int or sequence): Padding on each border. If a single int is provided this
is used to pad all borders. If sequence of length 2 is provided this is the padding
on left/right and top/bottom respectively. If a sequence of length 4 is provided
this is the padding for the left, top, right and bottom borders respectively.
.. note::
In torchscript mode padding as single int is not supported, use a sequence of
length 1: ``[padding, ]``.
fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
length 3, it is used to fill R, G, B channels respectively.
This value is only used when the padding_mode is constant.
Only number is supported for torch Tensor.
Only int or tuple value is supported for PIL Image.
padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
Default is constant.
- constant: pads with a constant value, this value is specified with fill
- edge: pads with the last value at the edge of the image.
If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2
- reflect: pads with reflection of image without repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
will result in [3, 2, 1, 2, 3, 4, 3, 2]
- symmetric: pads with reflection of image repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
will result in [2, 1, 1, 2, 3, 4, 4, 3]
"""
_v1_transform_cls = _transforms.Pad
def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
......@@ -323,6 +540,34 @@ class RandomZoomOut(_RandomApplyTransform):
class RandomRotation(Transform):
"""[BETA] Rotate the image/box/mask by angle.
.. betastatus:: RandomRotation transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
degrees (sequence or number): Range of degrees to select from.
If degrees is a number instead of sequence like (min, max), the range of degrees
will be (-degrees, +degrees).
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
expand (bool, optional): Optional expansion flag.
If true, expands the output to make it large enough to hold the entire rotated image.
If false or omitted, make the output image the same size as the input image.
Note that the expand flag assumes rotation around the center and no translation.
center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
Default is the center of the image.
fill (sequence or number): Pixel fill value for the area outside the rotated
image. Default is ``0``. If given a number, the value is used for all bands respectively.
.. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
"""
_v1_transform_cls = _transforms.RandomRotation
def __init__(
......@@ -363,6 +608,42 @@ class RandomRotation(Transform):
class RandomAffine(Transform):
"""[BETA] Random affine transformation of the image/box/mask keeping center invariant.
.. betastatus:: RandomAffine transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
degrees (sequence or number): Range of degrees to select from.
If degrees is a number instead of sequence like (min, max), the range of degrees
will be (-degrees, +degrees). Set to 0 to deactivate rotations.
translate (tuple, optional): tuple of maximum absolute fraction for horizontal
and vertical translations. For example translate=(a, b), then horizontal shift
is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is
randomly sampled from the range a <= scale <= b. Will keep original scale by default.
shear (sequence or number, optional): Range of degrees to select from.
If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear)
will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the
range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
Will not apply shear by default.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
fill (sequence or number): Pixel fill value for the area outside the transformed
image. Default is ``0``. If given a number, the value is used for all bands respectively.
center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
Default is the center of the image.
.. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
"""
_v1_transform_cls = _transforms.RandomAffine
def __init__(
......@@ -443,6 +724,52 @@ class RandomAffine(Transform):
class RandomCrop(Transform):
"""[BETA] Crop the given image/box/mask at a random location.
.. betastatus:: RandomCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions,
but if non-constant padding is used, the input is expected to have at most 2 leading dimensions
Args:
size (sequence or int): Desired output size of the crop. If size is an
int instead of sequence like (h, w), a square crop (size, size) is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
padding (int or sequence, optional): Optional padding on each border
of the image. Default is None. If a single int is provided this
is used to pad all borders. If sequence of length 2 is provided this is the padding
on left/right and top/bottom respectively. If a sequence of length 4 is provided
this is the padding for the left, top, right and bottom borders respectively.
.. note::
In torchscript mode padding as single int is not supported, use a sequence of
length 1: ``[padding, ]``.
pad_if_needed (boolean): It will pad the image if smaller than the
desired size to avoid raising an exception. Since cropping is done
after padding, the padding seems to be done at a random offset.
fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
length 3, it is used to fill R, G, B channels respectively.
This value is only used when the padding_mode is constant.
Only number is supported for torch Tensor.
Only int or tuple value is supported for PIL Image.
padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
Default is constant.
- constant: pads with a constant value, this value is specified with fill
- edge: pads with the last value at the edge of the image.
If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2
- reflect: pads with reflection of image without repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
will result in [3, 2, 1, 2, 3, 4, 3, 2]
- symmetric: pads with reflection of image repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
will result in [2, 1, 1, 2, 3, 4, 4, 3]
"""
_v1_transform_cls = _transforms.RandomCrop
def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
......@@ -552,6 +879,25 @@ class RandomCrop(Transform):
class RandomPerspective(_RandomApplyTransform):
"""[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability.
.. betastatus:: RandomPerspective transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1.
Default is 0.5.
p (float): probability of the image being transformed. Default is 0.5.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
fill (sequence or number): Pixel fill value for the area outside the transformed
image. Default is ``0``. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.RandomPerspective
def __init__(
......
......@@ -22,6 +22,27 @@ class ConvertBoundingBoxFormat(Transform):
class ConvertDtype(Transform):
"""[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly
.. betastatus:: ConvertDtype transform
This function does not support PIL Image.
Args:
dtype (torch.dtype): Desired data type of the output
.. note::
When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly.
If converted back and forth, this mismatch has no effect.
Raises:
RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
of the integer ``dtype``.
"""
_v1_transform_cls = _transforms.ConvertImageDtype
_transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
......
......@@ -21,6 +21,16 @@ class Identity(Transform):
class Lambda(Transform):
"""[BETA] Apply a user-defined lambda as a transform.
.. betastatus:: Lambda transform
This transform does not support torchscript.
Args:
lambd (function): Lambda/function to be used for transform.
"""
def __init__(self, lambd: Callable[[Any], Any], *types: Type):
super().__init__()
self.lambd = lambd
......@@ -42,6 +52,26 @@ class Lambda(Transform):
class LinearTransformation(Transform):
"""[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline.
.. betastatus:: LinearTransformation transform
This transform does not support PIL Image.
Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
subtract mean_vector from it which is then followed by computing the dot
product with the transformation matrix and then reshaping the tensor to its
original shape.
Applications:
whitening transformation: Suppose X is a column vector zero-centered data.
Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
perform SVD on this matrix and pass it as transformation_matrix.
Args:
transformation_matrix (Tensor): tensor [D x D], D = C x H x W
mean_vector (Tensor): tensor [D], D = C x H x W
"""
_v1_transform_cls = _transforms.LinearTransformation
_transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
......@@ -105,6 +135,26 @@ class LinearTransformation(Transform):
class Normalize(Transform):
"""[BETA] Normalize a tensor image with mean and standard deviation.
.. betastatus:: Normalize transform
This transform does not support PIL Image.
Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
channels, this transform will normalize each channel of the input
``torch.*Tensor`` i.e.,
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
.. note::
This transform acts out of place, i.e., it does not mutate the input tensor.
Args:
mean (sequence): Sequence of means for each channel.
std (sequence): Sequence of standard deviations for each channel.
inplace(bool,optional): Bool to make this operation in-place.
"""
_v1_transform_cls = _transforms.Normalize
_transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video)
......@@ -125,6 +175,24 @@ class Normalize(Transform):
class GaussianBlur(Transform):
"""[BETA] Blurs image with randomly chosen Gaussian blur.
.. betastatus:: GausssianBlur transform
If the image is torch Tensor, it is expected
to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
kernel_size (int or sequence): Size of the Gaussian kernel.
sigma (float or tuple of float (min, max)): Standard deviation to be used for
creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
of float (min, max), sigma is chosen uniformly at random to lie in the
given range.
Returns:
PIL Image or Tensor: Gaussian blurred version of the input image.
"""
_v1_transform_cls = _transforms.GaussianBlur
def __init__(
......
......@@ -11,6 +11,15 @@ from torchvision.transforms.v2.utils import is_simple_tensor
class PILToTensor(Transform):
"""[BETA] Convert a ``PIL Image`` to a tensor of the same type.
.. betastatus:: PILToTensor transform
This transform does not support torchscript.
Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W).
"""
_transformed_types = (PIL.Image.Image,)
def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor:
......@@ -27,6 +36,27 @@ class ToImageTensor(Transform):
class ToImagePIL(Transform):
"""[BETA] Convert a tensor or an ndarray to PIL Image.
.. betastatus:: ToImagePIL transform
This transform does not support torchscript.
Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape
H x W x C to a PIL Image while preserving the value range.
Args:
mode (`PIL.Image mode`_): color space and pixel depth of input data (optional).
If ``mode`` is ``None`` (default) there are some assumptions made about the input data:
- If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``.
- If the input has 3 channels, the ``mode`` is assumed to be ``RGB``.
- If the input has 2 channels, the ``mode`` is assumed to be ``LA``.
- If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``,
``short``).
.. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
"""
_transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray)
def __init__(self, mode: Optional[str] = None) -> None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment