Unverified Commit 928b05ca authored by Nicolas Hug's avatar Nicolas Hug Committed by GitHub
Browse files

Added docs for v2 transforms (part 1) (#7297)


Co-authored-by: vfdev <vfdev.5@gmail.com>
Co-authored-by: Philip Meier <github.pmeier@posteo.de>
parent d03b776a
...@@ -33,6 +33,8 @@ from tabulate import tabulate ...@@ -33,6 +33,8 @@ from tabulate import tabulate
sys.path.append(os.path.abspath(".")) sys.path.append(os.path.abspath("."))
torchvision.disable_beta_transforms_warning()
# -- General configuration ------------------------------------------------ # -- General configuration ------------------------------------------------
# Required version of sphinx is set from docs/requirements.txt # Required version of sphinx is set from docs/requirements.txt
......
...@@ -98,17 +98,29 @@ Geometry ...@@ -98,17 +98,29 @@ Geometry
:template: class.rst :template: class.rst
Resize Resize
v2.Resize
RandomCrop RandomCrop
v2.RandomCrop
RandomResizedCrop RandomResizedCrop
v2.RandomResizedCrop
CenterCrop CenterCrop
v2.CenterCrop
FiveCrop FiveCrop
v2.FiveCrop
TenCrop TenCrop
v2.TenCrop
Pad Pad
v2.Pad
RandomAffine RandomAffine
v2.RandomAffine
RandomPerspective RandomPerspective
v2.RandomPerspective
RandomRotation RandomRotation
v2.RandomRotation
RandomHorizontalFlip RandomHorizontalFlip
v2.RandomHorizontalFlip
RandomVerticalFlip RandomVerticalFlip
v2.RandomVerticalFlip
Color Color
----- -----
...@@ -118,15 +130,25 @@ Color ...@@ -118,15 +130,25 @@ Color
:template: class.rst :template: class.rst
ColorJitter ColorJitter
v2.ColorJitter
Grayscale Grayscale
v2.Grayscale
RandomGrayscale RandomGrayscale
v2.RandomGrayscale
GaussianBlur GaussianBlur
v2.GaussianBlur
RandomInvert RandomInvert
v2.RandomInvert
RandomPosterize RandomPosterize
v2.RandomPosterize
RandomSolarize RandomSolarize
v2.RandomSolarize
RandomAdjustSharpness RandomAdjustSharpness
v2.RandomAdjustSharpness
RandomAutocontrast RandomAutocontrast
v2.RandomAutocontrast
RandomEqualize RandomEqualize
v2.RandomEqualize
Composition Composition
----------- -----------
...@@ -136,9 +158,13 @@ Composition ...@@ -136,9 +158,13 @@ Composition
:template: class.rst :template: class.rst
Compose Compose
v2.Compose
RandomApply RandomApply
v2.RandomApply
RandomChoice RandomChoice
v2.RandomChoice
RandomOrder RandomOrder
v2.RandomOrder
Miscellaneous Miscellaneous
------------- -------------
...@@ -148,9 +174,13 @@ Miscellaneous ...@@ -148,9 +174,13 @@ Miscellaneous
:template: class.rst :template: class.rst
LinearTransformation LinearTransformation
v2.LinearTransformation
Normalize Normalize
v2.Normalize
RandomErasing RandomErasing
v2.RandomErasing
Lambda Lambda
v2.Lambda
.. _conversion_transforms: .. _conversion_transforms:
...@@ -162,9 +192,15 @@ Conversion ...@@ -162,9 +192,15 @@ Conversion
:template: class.rst :template: class.rst
ToPILImage ToPILImage
v2.ToPILImage
v2.ToImagePIL
ToTensor ToTensor
v2.ToTensor
PILToTensor PILToTensor
v2.PILToTensor
ConvertImageDtype ConvertImageDtype
v2.ConvertImageDtype
v2.ConvertDtype
Auto-Augmentation Auto-Augmentation
----------------- -----------------
...@@ -181,9 +217,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran ...@@ -181,9 +217,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran
AutoAugmentPolicy AutoAugmentPolicy
AutoAugment AutoAugment
v2.AutoAugment
RandAugment RandAugment
v2.RandAugment
TrivialAugmentWide TrivialAugmentWide
v2.TrivialAugmentWide
AugMix AugMix
v2.AugMix
.. _functional_transforms: .. _functional_transforms:
......
...@@ -13,6 +13,38 @@ from .utils import is_simple_tensor, query_chw ...@@ -13,6 +13,38 @@ from .utils import is_simple_tensor, query_chw
class RandomErasing(_RandomApplyTransform): class RandomErasing(_RandomApplyTransform):
"""[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels.
.. betastatus:: RandomErasing transform
This transform does not support PIL Image.
'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896
Args:
p: probability that the random erasing operation will be performed.
scale: range of proportion of erased area against input image.
ratio: range of aspect ratio of erased area.
value: erasing value. Default is 0. If a single int, it is used to
erase all pixels. If a tuple of length 3, it is used to erase
R, G, B channels respectively.
If the str 'random', each pixel is erased with a random value.
inplace: boolean to make this transform inplace. Default set to False.
Returns:
Erased input.
Example:
>>> from torchvision.transforms import v2 as transforms
>>>
>>> transform = transforms.Compose([
>>> transforms.RandomHorizontalFlip(),
>>> transforms.PILToTensor(),
>>> transforms.ConvertImageDtype(torch.float),
>>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
>>> transforms.RandomErasing(),
>>> ])
"""
_v1_transform_cls = _transforms.RandomErasing _v1_transform_cls = _transforms.RandomErasing
def _extract_params_for_v1_transform(self) -> Dict[str, Any]: def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
......
...@@ -162,6 +162,24 @@ class _AutoAugmentBase(Transform): ...@@ -162,6 +162,24 @@ class _AutoAugmentBase(Transform):
class AutoAugment(_AutoAugmentBase): class AutoAugment(_AutoAugmentBase):
r"""[BETA] AutoAugment data augmentation method based on
`"AutoAugment: Learning Augmentation Strategies from Data" <https://arxiv.org/pdf/1805.09501.pdf>`_.
.. betastatus:: AutoAugment transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
policy (AutoAugmentPolicy): Desired policy enum defined by
:class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.AutoAugment _v1_transform_cls = _transforms.AutoAugment
_AUGMENTATION_SPACE = { _AUGMENTATION_SPACE = {
...@@ -318,6 +336,27 @@ class AutoAugment(_AutoAugmentBase): ...@@ -318,6 +336,27 @@ class AutoAugment(_AutoAugmentBase):
class RandAugment(_AutoAugmentBase): class RandAugment(_AutoAugmentBase):
r"""[BETA] RandAugment data augmentation method based on
`"RandAugment: Practical automated data augmentation with a reduced search space"
<https://arxiv.org/abs/1909.13719>`_.
.. betastatus:: RandAugment transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
num_ops (int): Number of augmentation transformations to apply sequentially.
magnitude (int): Magnitude for all the transformations.
num_magnitude_bins (int): The number of different magnitude values.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.RandAugment _v1_transform_cls = _transforms.RandAugment
_AUGMENTATION_SPACE = { _AUGMENTATION_SPACE = {
"Identity": (lambda num_bins, height, width: None, False), "Identity": (lambda num_bins, height, width: None, False),
...@@ -379,6 +418,24 @@ class RandAugment(_AutoAugmentBase): ...@@ -379,6 +418,24 @@ class RandAugment(_AutoAugmentBase):
class TrivialAugmentWide(_AutoAugmentBase): class TrivialAugmentWide(_AutoAugmentBase):
r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in
`"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" <https://arxiv.org/abs/2103.10158>`_.
.. betastatus:: TrivialAugmentWide transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
num_magnitude_bins (int): The number of different magnitude values.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.TrivialAugmentWide _v1_transform_cls = _transforms.TrivialAugmentWide
_AUGMENTATION_SPACE = { _AUGMENTATION_SPACE = {
"Identity": (lambda num_bins, height, width: None, False), "Identity": (lambda num_bins, height, width: None, False),
...@@ -430,6 +487,29 @@ class TrivialAugmentWide(_AutoAugmentBase): ...@@ -430,6 +487,29 @@ class TrivialAugmentWide(_AutoAugmentBase):
class AugMix(_AutoAugmentBase): class AugMix(_AutoAugmentBase):
r"""[BETA] AugMix data augmentation method based on
`"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" <https://arxiv.org/abs/1912.02781>`_.
.. betastatus:: AugMix transform
If the image is torch Tensor, it should be of type torch.uint8, and it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
severity (int): The severity of base augmentation operators. Default is ``3``.
mixture_width (int): The number of augmentation chains. Default is ``3``.
chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3].
Default is ``-1``.
alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``.
all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
fill (sequence or number, optional): Pixel fill value for the area outside the transformed
image. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.AugMix _v1_transform_cls = _transforms.AugMix
_PARTIAL_AUGMENTATION_SPACE = { _PARTIAL_AUGMENTATION_SPACE = {
......
...@@ -11,6 +11,23 @@ from .utils import is_simple_tensor, query_chw ...@@ -11,6 +11,23 @@ from .utils import is_simple_tensor, query_chw
class Grayscale(Transform): class Grayscale(Transform):
"""[BETA] Convert images or videos to grayscale.
.. betastatus:: Grayscale transform
If the image is torch Tensor, it is expected
to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
num_output_channels (int): (1 or 3) number of channels desired for output image
Returns:
PIL Image or Tensor: Grayscale version of the input.
- If ``num_output_channels == 1`` : returned image is single channel
- If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b
"""
_v1_transform_cls = _transforms.Grayscale _v1_transform_cls = _transforms.Grayscale
_transformed_types = ( _transformed_types = (
...@@ -29,6 +46,24 @@ class Grayscale(Transform): ...@@ -29,6 +46,24 @@ class Grayscale(Transform):
class RandomGrayscale(_RandomApplyTransform): class RandomGrayscale(_RandomApplyTransform):
"""[BETA] Randomly convert image to grayscale with a probability of p (default 0.1).
.. betastatus:: RandomGrayscale transform
If the image is torch Tensor, it is expected
to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
p (float): probability that image should be converted to grayscale.
Returns:
PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged
with probability (1-p).
- If input image is 1 channel: grayscale version is 1 channel
- If input image is 3 channel: grayscale version is 3 channel with r == g == b
"""
_v1_transform_cls = _transforms.RandomGrayscale _v1_transform_cls = _transforms.RandomGrayscale
_transformed_types = ( _transformed_types = (
...@@ -50,6 +85,32 @@ class RandomGrayscale(_RandomApplyTransform): ...@@ -50,6 +85,32 @@ class RandomGrayscale(_RandomApplyTransform):
class ColorJitter(Transform): class ColorJitter(Transform):
"""[BETA] Randomly change the brightness, contrast, saturation and hue of an image.
.. betastatus:: ColorJitter transform
If the image is torch Tensor, it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
Args:
brightness (float or tuple of float (min, max)): How much to jitter brightness.
brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
or the given [min, max]. Should be non-negative numbers.
contrast (float or tuple of float (min, max)): How much to jitter contrast.
contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
or the given [min, max]. Should be non-negative numbers.
saturation (float or tuple of float (min, max)): How much to jitter saturation.
saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
or the given [min, max]. Should be non-negative numbers.
hue (float or tuple of float (min, max)): How much to jitter hue.
hue_factor is chosen uniformly from [-hue, hue] or the given [min, max].
Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
To jitter hue, the pixel values of the input image have to be non-negative for conversion to HSV space;
thus it does not work if you normalize your image to an interval with negative values,
or use an interpolation that generates negative values before using this function.
"""
_v1_transform_cls = _transforms.ColorJitter _v1_transform_cls = _transforms.ColorJitter
def _extract_params_for_v1_transform(self) -> Dict[str, Any]: def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
...@@ -205,6 +266,18 @@ class RandomPhotometricDistort(Transform): ...@@ -205,6 +266,18 @@ class RandomPhotometricDistort(Transform):
class RandomEqualize(_RandomApplyTransform): class RandomEqualize(_RandomApplyTransform):
"""[BETA] Equalize the histogram of the given image randomly with a given probability.
.. betastatus:: RandomEqualize transform
If the image is torch Tensor, it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "P", "L" or "RGB".
Args:
p (float): probability of the image being equalized. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomEqualize _v1_transform_cls = _transforms.RandomEqualize
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
...@@ -212,6 +285,18 @@ class RandomEqualize(_RandomApplyTransform): ...@@ -212,6 +285,18 @@ class RandomEqualize(_RandomApplyTransform):
class RandomInvert(_RandomApplyTransform): class RandomInvert(_RandomApplyTransform):
"""[BETA] Inverts the colors of the given image randomly with a given probability.
.. betastatus:: RandomInvert transform
If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format,
where ... means it can have an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
p (float): probability of the image being color inverted. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomInvert _v1_transform_cls = _transforms.RandomInvert
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
...@@ -219,6 +304,20 @@ class RandomInvert(_RandomApplyTransform): ...@@ -219,6 +304,20 @@ class RandomInvert(_RandomApplyTransform):
class RandomPosterize(_RandomApplyTransform): class RandomPosterize(_RandomApplyTransform):
"""[BETA] Posterize the image randomly with a given probability by reducing the
number of bits for each color channel.
.. betastatus:: RandomPosterize transform
If the image is torch Tensor, it should be of type torch.uint8,
and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
bits (int): number of bits to keep for each channel (0-8)
p (float): probability of the image being posterized. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomPosterize _v1_transform_cls = _transforms.RandomPosterize
def __init__(self, bits: int, p: float = 0.5) -> None: def __init__(self, bits: int, p: float = 0.5) -> None:
...@@ -230,6 +329,20 @@ class RandomPosterize(_RandomApplyTransform): ...@@ -230,6 +329,20 @@ class RandomPosterize(_RandomApplyTransform):
class RandomSolarize(_RandomApplyTransform): class RandomSolarize(_RandomApplyTransform):
"""[BETA] Solarize the image randomly with a given probability by inverting all pixel
values above a threshold.
.. betastatus:: RandomSolarize transform
If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format,
where ... means it can have an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
threshold (float): all pixels equal to or above this value are inverted.
p (float): probability of the image being solarized. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomSolarize _v1_transform_cls = _transforms.RandomSolarize
def __init__(self, threshold: float, p: float = 0.5) -> None: def __init__(self, threshold: float, p: float = 0.5) -> None:
...@@ -241,6 +354,18 @@ class RandomSolarize(_RandomApplyTransform): ...@@ -241,6 +354,18 @@ class RandomSolarize(_RandomApplyTransform):
class RandomAutocontrast(_RandomApplyTransform): class RandomAutocontrast(_RandomApplyTransform):
"""[BETA] Autocontrast the pixels of the given image randomly with a given probability.
.. betastatus:: RandomAutocontrast transform
If the image is torch Tensor, it is expected
to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
If img is PIL Image, it is expected to be in mode "L" or "RGB".
Args:
p (float): probability of the image being autocontrasted. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomAutocontrast _v1_transform_cls = _transforms.RandomAutocontrast
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
...@@ -248,6 +373,20 @@ class RandomAutocontrast(_RandomApplyTransform): ...@@ -248,6 +373,20 @@ class RandomAutocontrast(_RandomApplyTransform):
class RandomAdjustSharpness(_RandomApplyTransform): class RandomAdjustSharpness(_RandomApplyTransform):
"""[BETA] Adjust the sharpness of the image randomly with a given probability.
.. betastatus:: RandomAdjustSharpness transform
If the image is torch Tensor,
it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
sharpness_factor (float): How much to adjust the sharpness. Can be
any non-negative number. 0 gives a blurred image, 1 gives the
original image while 2 increases the sharpness by a factor of 2.
p (float): probability of the image being sharpened. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomAdjustSharpness _v1_transform_cls = _transforms.RandomAdjustSharpness
def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: def __init__(self, sharpness_factor: float, p: float = 0.5) -> None:
......
...@@ -9,6 +9,37 @@ from torchvision.transforms.v2 import Transform ...@@ -9,6 +9,37 @@ from torchvision.transforms.v2 import Transform
class Compose(Transform): class Compose(Transform):
"""[BETA] Composes several transforms together.
.. betastatus:: Compose transform
This transform does not support torchscript.
Please see the note below.
Args:
transforms (list of ``Transform`` objects): list of transforms to compose.
Example:
>>> transforms.Compose([
>>> transforms.CenterCrop(10),
>>> transforms.PILToTensor(),
>>> transforms.ConvertImageDtype(torch.float),
>>> ])
.. note::
In order to script the transformations, please use ``torch.nn.Sequential`` as below.
>>> transforms = torch.nn.Sequential(
>>> transforms.CenterCrop(10),
>>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
>>> )
>>> scripted_transforms = torch.jit.script(transforms)
Make sure to use only scriptable transformations, i.e. ones that work with ``torch.Tensor`` and do not require
``lambda`` functions or ``PIL.Image``.
"""
def __init__(self, transforms: Sequence[Callable]) -> None: def __init__(self, transforms: Sequence[Callable]) -> None:
super().__init__() super().__init__()
if not isinstance(transforms, Sequence): if not isinstance(transforms, Sequence):
...@@ -29,6 +60,27 @@ class Compose(Transform): ...@@ -29,6 +60,27 @@ class Compose(Transform):
class RandomApply(Transform): class RandomApply(Transform):
"""[BETA] Randomly apply a list of transformations with a given probability.
.. betastatus:: RandomApply transform
.. note::
In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of
transforms as shown below:
>>> transforms = transforms.RandomApply(torch.nn.ModuleList([
>>> transforms.ColorJitter(),
>>> ]), p=0.3)
>>> scripted_transforms = torch.jit.script(transforms)
Make sure to use only scriptable transformations, i.e. ones that work with ``torch.Tensor`` and do not require
``lambda`` functions or ``PIL.Image``.
Args:
transforms (sequence or torch.nn.Module): list of transformations
p (float): probability
"""
_v1_transform_cls = _transforms.RandomApply _v1_transform_cls = _transforms.RandomApply
def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None: def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None:
...@@ -63,6 +115,12 @@ class RandomApply(Transform): ...@@ -63,6 +115,12 @@ class RandomApply(Transform):
class RandomChoice(Transform): class RandomChoice(Transform):
"""[BETA] Apply single transformation randomly picked from a list.
.. betastatus:: RandomChoice transform
This transform does not support torchscript."""
def __init__( def __init__(
self, self,
transforms: Sequence[Callable], transforms: Sequence[Callable],
...@@ -99,6 +157,13 @@ class RandomChoice(Transform): ...@@ -99,6 +157,13 @@ class RandomChoice(Transform):
class RandomOrder(Transform): class RandomOrder(Transform):
"""[BETA] Apply a list of transformations in a random order.
.. betastatus:: RandomOrder transform
This transform does not support torchscript.
"""
def __init__(self, transforms: Sequence[Callable]) -> None: def __init__(self, transforms: Sequence[Callable]) -> None:
if not isinstance(transforms, Sequence): if not isinstance(transforms, Sequence):
raise TypeError("Argument transforms should be a sequence of callables") raise TypeError("Argument transforms should be a sequence of callables")
......
...@@ -10,6 +10,31 @@ from torchvision.transforms.v2 import Transform ...@@ -10,6 +10,31 @@ from torchvision.transforms.v2 import Transform
class ToTensor(Transform): class ToTensor(Transform):
"""[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor.
.. betastatus:: ToTensor transform
.. warning::
:class:`v2.ToTensor` is deprecated and will be removed in a future release.
Please use ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])`` instead.
This transform does not support torchscript.
Converts a PIL Image or numpy.ndarray (H x W x C) in the range
[0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]
if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
or if the numpy.ndarray has dtype = np.uint8.
In the other cases, tensors are returned without scaling.
.. note::
Because the input image is scaled to [0.0, 1.0], this transformation should not be used when
transforming target image masks. See the `references`_ for implementing the transforms for image masks.
.. _references: https://github.com/pytorch/vision/tree/main/references/segmentation
"""
_transformed_types = (PIL.Image.Image, np.ndarray) _transformed_types = (PIL.Image.Image, np.ndarray)
def __init__(self) -> None: def __init__(self) -> None:
......
...@@ -26,6 +26,18 @@ from .utils import has_all, has_any, is_simple_tensor, query_bounding_box, query ...@@ -26,6 +26,18 @@ from .utils import has_all, has_any, is_simple_tensor, query_bounding_box, query
class RandomHorizontalFlip(_RandomApplyTransform): class RandomHorizontalFlip(_RandomApplyTransform):
"""[BETA] Horizontally flip the given image/box/mask randomly with a given probability.
.. betastatus:: RandomHorizontalFlip transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions.
Args:
p (float): probability of the image being flipped. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomHorizontalFlip _v1_transform_cls = _transforms.RandomHorizontalFlip
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
...@@ -33,6 +45,18 @@ class RandomHorizontalFlip(_RandomApplyTransform): ...@@ -33,6 +45,18 @@ class RandomHorizontalFlip(_RandomApplyTransform):
class RandomVerticalFlip(_RandomApplyTransform): class RandomVerticalFlip(_RandomApplyTransform):
"""[BETA] Vertically flip the given image/box/mask randomly with a given probability.
.. betastatus:: RandomVerticalFlip transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions.
Args:
p (float): probability of the image being flipped. Default value is 0.5
"""
_v1_transform_cls = _transforms.RandomVerticalFlip _v1_transform_cls = _transforms.RandomVerticalFlip
def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
...@@ -40,6 +64,62 @@ class RandomVerticalFlip(_RandomApplyTransform): ...@@ -40,6 +64,62 @@ class RandomVerticalFlip(_RandomApplyTransform):
class Resize(Transform): class Resize(Transform):
"""[BETA] Resize the input image/box/mask to the given size.
.. betastatus:: Resize transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
.. warning::
The output image might be different depending on its type: when downsampling, the interpolation of PIL images
and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences
in the performance of a network. Therefore, it is preferable to train and serve a model with the same input
types. See also below the ``antialias`` parameter, which can help make the output of PIL images and tensors
closer.
Args:
size (sequence or int): Desired output size. If size is a sequence like
(h, w), output size will be matched to this. If size is an int,
smaller edge of the image will be matched to this number.
i.e., if height > width, then image will be rescaled to
(size * height / width, size).
.. note::
In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
max_size (int, optional): The maximum allowed for the longer edge of
the resized image: if the longer edge of the image is greater
than ``max_size`` after being resized according to ``size``, then
the image is resized again so that the longer edge is equal to
``max_size``. As a result, ``size`` might be overruled, i.e. the
smaller edge may be shorter than ``size``. This is only supported
if ``size`` is an int (or a sequence of length 1 in torchscript
mode).
antialias (bool, optional): Whether to apply antialiasing.
It only affects **tensors** with bilinear or bicubic modes and it is
ignored otherwise: on PIL images, antialiasing is always applied on
bilinear or bicubic modes; on other modes (for PIL images and
tensors), antialiasing makes no sense and this parameter is ignored.
Possible values are:
- ``True``: will apply antialiasing for bilinear or bicubic modes.
Other modes aren't affected. This is probably what you want to use.
- ``False``: will not apply antialiasing for tensors on any mode. PIL
images are still antialiased on bilinear or bicubic modes, because
PIL doesn't support disabling antialiasing.
- ``None``: equivalent to ``False`` for tensors and ``True`` for
PIL images. This value exists for legacy reasons and you probably
don't want to use it unless you really know what you are doing.
The current default is ``None`` **but will change to** ``True`` **in
v0.17** for the PIL and Tensor backends to be consistent.
"""
_v1_transform_cls = _transforms.Resize _v1_transform_cls = _transforms.Resize
def __init__( def __init__(
...@@ -76,6 +156,20 @@ class Resize(Transform): ...@@ -76,6 +156,20 @@ class Resize(Transform):
class CenterCrop(Transform): class CenterCrop(Transform):
"""[BETA] Crops the given image/box/mask at the center.
.. betastatus:: CenterCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
Args:
size (sequence or int): Desired output size of the crop. If size is an
int instead of sequence like (h, w), a square crop (size, size) is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
"""
_v1_transform_cls = _transforms.CenterCrop _v1_transform_cls = _transforms.CenterCrop
def __init__(self, size: Union[int, Sequence[int]]): def __init__(self, size: Union[int, Sequence[int]]):
...@@ -87,6 +181,53 @@ class CenterCrop(Transform): ...@@ -87,6 +181,53 @@ class CenterCrop(Transform):
class RandomResizedCrop(Transform): class RandomResizedCrop(Transform):
"""[BETA] Crop a random portion of image/box/mask and resize it to a given size.
.. betastatus:: RandomResizedCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
A crop of the original image is made: the crop has a random area (H * W)
and a random aspect ratio. This crop is finally resized to the given
size. This is popularly used to train the Inception networks.
Args:
size (int or sequence): expected output size of the crop, for each edge. If size is an
int instead of sequence like (h, w), a square output size ``(size, size)`` is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
.. note::
In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop,
before resizing. The scale is defined with respect to the area of the original image.
ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before
resizing.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
antialias (bool, optional): Whether to apply antialiasing.
It only affects **tensors** with bilinear or bicubic modes and it is
ignored otherwise: on PIL images, antialiasing is always applied on
bilinear or bicubic modes; on other modes (for PIL images and
tensors), antialiasing makes no sense and this parameter is ignored.
Possible values are:
- ``True``: will apply antialiasing for bilinear or bicubic modes.
Other modes aren't affected. This is probably what you want to use.
- ``False``: will not apply antialiasing for tensors on any mode. PIL
images are still antialiased on bilinear or bicubic modes, because
PIL doesn't support disabling antialiasing.
- ``None``: equivalent to ``False`` for tensors and ``True`` for
PIL images. This value exists for legacy reasons and you probably
don't want to use it unless you really know what you are doing.
The current default is ``None`` **but will change to** ``True`` **in
v0.17** for the PIL and Tensor backends to be consistent.
"""
_v1_transform_cls = _transforms.RandomResizedCrop _v1_transform_cls = _transforms.RandomResizedCrop
def __init__( def __init__(
...@@ -164,7 +305,24 @@ ImageOrVideoTypeJIT = Union[datapoints._ImageTypeJIT, datapoints._VideoTypeJIT] ...@@ -164,7 +305,24 @@ ImageOrVideoTypeJIT = Union[datapoints._ImageTypeJIT, datapoints._VideoTypeJIT]
class FiveCrop(Transform): class FiveCrop(Transform):
""" """[BETA] Crop the given image/box/mask into four corners and the central crop.
.. betastatus:: FiveCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions.
.. Note::
This transform returns a tuple of images and there may be a mismatch in the number of
inputs and targets your Dataset returns. See below for an example of how to deal with
this.
Args:
size (sequence or int): Desired output size of the crop. If size is an ``int``
instead of sequence like (h, w), a square crop of size (size, size) is made.
If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
Example: Example:
>>> class BatchMultiCrop(transforms.Transform): >>> class BatchMultiCrop(transforms.Transform):
... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]):
...@@ -209,8 +367,27 @@ class FiveCrop(Transform): ...@@ -209,8 +367,27 @@ class FiveCrop(Transform):
class TenCrop(Transform): class TenCrop(Transform):
""" """[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of
these (horizontal flipping is used by default).
.. betastatus:: TenCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading
dimensions.
See :class:`~torchvision.transforms.v2.FiveCrop` for an example. See :class:`~torchvision.transforms.v2.FiveCrop` for an example.
.. Note::
This transform returns a tuple of images and there may be a mismatch in the number of
inputs and targets your Dataset returns. See :class:`~torchvision.transforms.v2.FiveCrop` for an
example of how to deal with this.
Args:
size (sequence or int): Desired output size of the crop. If size is an
int instead of sequence like (h, w), a square crop (size, size) is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
vertical_flip (bool): Use vertical flipping instead of horizontal
""" """
_v1_transform_cls = _transforms.TenCrop _v1_transform_cls = _transforms.TenCrop
...@@ -249,6 +426,46 @@ class TenCrop(Transform): ...@@ -249,6 +426,46 @@ class TenCrop(Transform):
class Pad(Transform): class Pad(Transform):
"""[BETA] Pad the given image/box/mask on all sides with the given "pad" value.
.. betastatus:: Pad transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric,
at most 3 leading dimensions for mode edge,
and an arbitrary number of leading dimensions for mode constant.
Args:
padding (int or sequence): Padding on each border. If a single int is provided this
is used to pad all borders. If sequence of length 2 is provided this is the padding
on left/right and top/bottom respectively. If a sequence of length 4 is provided
this is the padding for the left, top, right and bottom borders respectively.
.. note::
In torchscript mode padding as single int is not supported, use a sequence of
length 1: ``[padding, ]``.
fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
length 3, it is used to fill R, G, B channels respectively.
This value is only used when the padding_mode is constant.
Only number is supported for torch Tensor.
Only int or tuple value is supported for PIL Image.
padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
Default is constant.
- constant: pads with a constant value, this value is specified with fill
- edge: pads with the last value at the edge of the image.
If the input is a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2
- reflect: pads with reflection of image without repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
will result in [3, 2, 1, 2, 3, 4, 3, 2]
- symmetric: pads with reflection of image repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
will result in [2, 1, 1, 2, 3, 4, 4, 3]
"""
_v1_transform_cls = _transforms.Pad _v1_transform_cls = _transforms.Pad
def _extract_params_for_v1_transform(self) -> Dict[str, Any]: def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
...@@ -323,6 +540,34 @@ class RandomZoomOut(_RandomApplyTransform): ...@@ -323,6 +540,34 @@ class RandomZoomOut(_RandomApplyTransform):
class RandomRotation(Transform): class RandomRotation(Transform):
"""[BETA] Rotate the image/box/mask by angle.
.. betastatus:: RandomRotation transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
degrees (sequence or number): Range of degrees to select from.
If degrees is a number instead of sequence like (min, max), the range of degrees
will be (-degrees, +degrees).
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST`` and ``InterpolationMode.BILINEAR`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
expand (bool, optional): Optional expansion flag.
If true, expands the output to make it large enough to hold the entire rotated image.
If false or omitted, make the output image the same size as the input image.
Note that the expand flag assumes rotation around the center and no translation.
center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
Default is the center of the image.
fill (sequence or number): Pixel fill value for the area outside the rotated
image. Default is ``0``. If given a number, the value is used for all bands respectively.
.. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
"""
_v1_transform_cls = _transforms.RandomRotation _v1_transform_cls = _transforms.RandomRotation
def __init__( def __init__(
...@@ -363,6 +608,42 @@ class RandomRotation(Transform): ...@@ -363,6 +608,42 @@ class RandomRotation(Transform):
class RandomAffine(Transform): class RandomAffine(Transform):
"""[BETA] Random affine transformation of the image/box/mask keeping center invariant.
.. betastatus:: RandomAffine transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
degrees (sequence or number): Range of degrees to select from.
If degrees is a number instead of sequence like (min, max), the range of degrees
will be (-degrees, +degrees). Set to 0 to deactivate rotations.
translate (tuple, optional): tuple of maximum absolute fraction for horizontal
and vertical translations. For example translate=(a, b), then horizontal shift
is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
scale (tuple, optional): scaling factor interval, e.g. (a, b), then scale is
randomly sampled from the range a <= scale <= b. Will keep original scale by default.
shear (sequence or number, optional): Range of degrees to select from.
If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear)
will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the
range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
Will not apply shear by default.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
If input is Tensor, only ``InterpolationMode.NEAREST`` and ``InterpolationMode.BILINEAR`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
fill (sequence or number): Pixel fill value for the area outside the transformed
image. Default is ``0``. If given a number, the value is used for all bands respectively.
center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
Default is the center of the image.
.. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
"""
_v1_transform_cls = _transforms.RandomAffine _v1_transform_cls = _transforms.RandomAffine
def __init__( def __init__(
...@@ -443,6 +724,52 @@ class RandomAffine(Transform): ...@@ -443,6 +724,52 @@ class RandomAffine(Transform):
class RandomCrop(Transform): class RandomCrop(Transform):
"""[BETA] Crop the given image/box/mask at a random location.
.. betastatus:: RandomCrop transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions,
but if non-constant padding is used, the input is expected to have at most 2 leading dimensions.
Args:
size (sequence or int): Desired output size of the crop. If size is an
int instead of sequence like (h, w), a square crop (size, size) is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
padding (int or sequence, optional): Optional padding on each border
of the image. Default is None. If a single int is provided this
is used to pad all borders. If sequence of length 2 is provided this is the padding
on left/right and top/bottom respectively. If a sequence of length 4 is provided
this is the padding for the left, top, right and bottom borders respectively.
.. note::
In torchscript mode padding as single int is not supported, use a sequence of
length 1: ``[padding, ]``.
pad_if_needed (boolean): It will pad the image if smaller than the
desired size to avoid raising an exception. Since cropping is done
after padding, the padding seems to be done at a random offset.
fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
length 3, it is used to fill R, G, B channels respectively.
This value is only used when the padding_mode is constant.
Only number is supported for torch Tensor.
Only int or tuple value is supported for PIL Image.
padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric.
Default is constant.
- constant: pads with a constant value, this value is specified with fill
- edge: pads with the last value at the edge of the image.
If the input is a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2
- reflect: pads with reflection of image without repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
will result in [3, 2, 1, 2, 3, 4, 3, 2]
- symmetric: pads with reflection of image repeating the last value on the edge.
For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
will result in [2, 1, 1, 2, 3, 4, 4, 3]
"""
_v1_transform_cls = _transforms.RandomCrop _v1_transform_cls = _transforms.RandomCrop
def _extract_params_for_v1_transform(self) -> Dict[str, Any]: def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
...@@ -552,6 +879,25 @@ class RandomCrop(Transform): ...@@ -552,6 +879,25 @@ class RandomCrop(Transform):
class RandomPerspective(_RandomApplyTransform): class RandomPerspective(_RandomApplyTransform):
"""[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability.
.. betastatus:: RandomPerspective transform
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1.
Default is 0.5.
p (float): probability of the image being transformed. Default is 0.5.
interpolation (InterpolationMode): Desired interpolation enum defined by
:class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
If input is Tensor, only ``InterpolationMode.NEAREST`` and ``InterpolationMode.BILINEAR`` are supported.
The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
fill (sequence or number): Pixel fill value for the area outside the transformed
image. Default is ``0``. If given a number, the value is used for all bands respectively.
"""
_v1_transform_cls = _transforms.RandomPerspective _v1_transform_cls = _transforms.RandomPerspective
def __init__( def __init__(
......
...@@ -22,6 +22,27 @@ class ConvertBoundingBoxFormat(Transform): ...@@ -22,6 +22,27 @@ class ConvertBoundingBoxFormat(Transform):
class ConvertDtype(Transform): class ConvertDtype(Transform):
"""[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly
.. betastatus:: ConvertDtype transform
This transform does not support PIL Image.
Args:
dtype (torch.dtype): Desired data type of the output
.. note::
When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly.
If converted back and forth, this mismatch has no effect.
Raises:
RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as
well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to
overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range
of the integer ``dtype``.
"""
_v1_transform_cls = _transforms.ConvertImageDtype _v1_transform_cls = _transforms.ConvertImageDtype
_transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
......
...@@ -21,6 +21,16 @@ class Identity(Transform): ...@@ -21,6 +21,16 @@ class Identity(Transform):
class Lambda(Transform): class Lambda(Transform):
"""[BETA] Apply a user-defined lambda as a transform.
.. betastatus:: Lambda transform
This transform does not support torchscript.
Args:
lambd (function): Lambda/function to be used for transform.
"""
def __init__(self, lambd: Callable[[Any], Any], *types: Type): def __init__(self, lambd: Callable[[Any], Any], *types: Type):
super().__init__() super().__init__()
self.lambd = lambd self.lambd = lambd
...@@ -42,6 +52,26 @@ class Lambda(Transform): ...@@ -42,6 +52,26 @@ class Lambda(Transform):
class LinearTransformation(Transform): class LinearTransformation(Transform):
"""[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline.
.. betastatus:: LinearTransformation transform
This transform does not support PIL Image.
Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
subtract mean_vector from it which is then followed by computing the dot
product with the transformation matrix and then reshaping the tensor to its
original shape.
Applications:
whitening transformation: Suppose X is a column vector of zero-centered data.
Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
perform SVD on this matrix and pass it as transformation_matrix.
Args:
transformation_matrix (Tensor): tensor [D x D], D = C x H x W
mean_vector (Tensor): tensor [D], D = C x H x W
"""
_v1_transform_cls = _transforms.LinearTransformation _v1_transform_cls = _transforms.LinearTransformation
_transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
...@@ -105,6 +135,26 @@ class LinearTransformation(Transform): ...@@ -105,6 +135,26 @@ class LinearTransformation(Transform):
class Normalize(Transform): class Normalize(Transform):
"""[BETA] Normalize a tensor image with mean and standard deviation.
.. betastatus:: Normalize transform
This transform does not support PIL Image.
Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],...,std[n])`` for ``n``
channels, this transform will normalize each channel of the input
``torch.*Tensor`` i.e.,
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
.. note::
This transform acts out of place, i.e., it does not mutate the input tensor, unless ``inplace`` is set to ``True``.
Args:
mean (sequence): Sequence of means for each channel.
std (sequence): Sequence of standard deviations for each channel.
inplace (bool, optional): Bool to make this operation in-place.
"""
_v1_transform_cls = _transforms.Normalize _v1_transform_cls = _transforms.Normalize
_transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video) _transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video)
...@@ -125,6 +175,24 @@ class Normalize(Transform): ...@@ -125,6 +175,24 @@ class Normalize(Transform):
class GaussianBlur(Transform): class GaussianBlur(Transform):
"""[BETA] Blurs image with randomly chosen Gaussian blur.
.. betastatus:: GaussianBlur transform
If the image is torch Tensor, it is expected
to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
Args:
kernel_size (int or sequence): Size of the Gaussian kernel.
sigma (float or tuple of float (min, max)): Standard deviation to be used for
creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
of float (min, max), sigma is chosen uniformly at random to lie in the
given range.
Returns:
PIL Image or Tensor: Gaussian blurred version of the input image.
"""
_v1_transform_cls = _transforms.GaussianBlur _v1_transform_cls = _transforms.GaussianBlur
def __init__( def __init__(
......
...@@ -11,6 +11,15 @@ from torchvision.transforms.v2.utils import is_simple_tensor ...@@ -11,6 +11,15 @@ from torchvision.transforms.v2.utils import is_simple_tensor
class PILToTensor(Transform): class PILToTensor(Transform):
"""[BETA] Convert a ``PIL Image`` to a tensor of the same type.
.. betastatus:: PILToTensor transform
This transform does not support torchscript.
Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W).
"""
_transformed_types = (PIL.Image.Image,) _transformed_types = (PIL.Image.Image,)
def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor: def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor:
...@@ -27,6 +36,27 @@ class ToImageTensor(Transform): ...@@ -27,6 +36,27 @@ class ToImageTensor(Transform):
class ToImagePIL(Transform): class ToImagePIL(Transform):
"""[BETA] Convert a tensor or an ndarray to PIL Image.
.. betastatus:: ToImagePIL transform
This transform does not support torchscript.
Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape
H x W x C to a PIL Image while preserving the value range.
Args:
mode (`PIL.Image mode`_): color space and pixel depth of input data (optional).
If ``mode`` is ``None`` (default) there are some assumptions made about the input data:
- If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``.
- If the input has 3 channels, the ``mode`` is assumed to be ``RGB``.
- If the input has 2 channels, the ``mode`` is assumed to be ``LA``.
- If the input has 1 channel, the ``mode`` is determined by the data type (i.e. ``int``, ``float``,
``short``).
.. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes
"""
_transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray) _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray)
def __init__(self, mode: Optional[str] = None) -> None: def __init__(self, mode: Optional[str] = None) -> None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment