Unverified Commit a6b77598 authored by amyeroberts's avatar amyeroberts Committed by GitHub
Browse files

Add Image Processors (#19796)



* Add CLIP image processor

* Crop size as dict too

* Update warning

* Actually use logger this time

* Normalize doesn't change dtype of input

* Add perceiver image processor

* Tidy up

* Add DPT image processor

* Add Vilt image processor

* Tidy up

* Add poolformer image processor

* Tidy up

* Add LayoutLM v2 and v3 imsge processors

* Tidy up

* Add Flava image processor

* Tidy up

* Add deit image processor

* Tidy up

* Add ConvNext image processor

* Tidy up

* Add levit image processor

* Add segformer image processor

* Add in post processing

* Fix up

* Add ImageGPT image processor

* Fixup

* Add mobilevit image processor

* Tidy up

* Add postprocessing

* Fixup

* Add VideoMAE image processor

* Tidy up

* Add ImageGPT image processor

* Fixup

* Add ViT image processor

* Tidy up

* Add beit image processor

* Add mobilevit image processor

* Tidy up

* Add postprocessing

* Fixup

* Fix up

* Fix flava and remove tree module

* Fix image classification pipeline failing tests

* Update feature extractor in trainer scripts

* Update pad_if_smaller to accept tuple and int size

* Update for image segmentation pipeline

* Update src/transformers/models/perceiver/image_processing_perceiver.py
Co-authored-by: default avatarAlara Dirik <8944735+alaradirik@users.noreply.github.com>

* Update src/transformers/image_processing_utils.py
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/beit/image_processing_beit.py
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>

* PR comments - docstrings; remove accidentally added resize; var names

* Update docstrings

* Add exception if size is not in the right format

* Fix exception check

* Fix up

* Use shortest_edge in tuple in script
Co-authored-by: default avatarAlara Dirik <8944735+alaradirik@users.noreply.github.com>
Co-authored-by: default avatarNielsRogge <48327001+NielsRogge@users.noreply.github.com>
parent 2e3452af
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for DPT."""
import math
from typing import Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from transformers.utils import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import normalize, rescale, resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
is_batched,
is_torch_available,
is_torch_tensor,
to_numpy_array,
valid_images,
)
from ...utils import logging
if is_torch_available():
import torch
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
def get_resize_output_image_size(
input_image: np.ndarray, output_size: Union[int, Iterable[int]], keep_aspect_ratio: bool, multiple: int
) -> Tuple[int, int]:
def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None):
x = round(val / multiple) * multiple
if max_val is not None and x > max_val:
x = math.floor(val / multiple) * multiple
if x < min_val:
x = math.ceil(val / multiple) * multiple
return x
output_size = (output_size, output_size) if isinstance(output_size, int) else output_size
input_height, input_width = get_image_size(input_image)
output_height, output_width = output_size
# determine new height and width
scale_height = output_height / input_height
scale_width = output_width / input_width
if keep_aspect_ratio:
# scale as little as possible
if abs(1 - scale_width) < abs(1 - scale_height):
# fit width
scale_height = scale_width
else:
# fit height
scale_width = scale_height
new_height = constraint_to_multiple_of(scale_height * input_height, multiple=multiple)
new_width = constraint_to_multiple_of(scale_width * input_width, multiple=multiple)
return (new_height, new_width)
class DPTImageProcessor(BaseImageProcessor):
r"""
Constructs a DPT image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
Size of the image after resizing. Can be overidden by `size` in `preprocess`.
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
be overidden by `keep_aspect_ratio` in `preprocess`.
ensure_multiple_of (`int`, *optional*, defaults to `1`):
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
by `ensure_multiple_of` in `preprocess`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in
`preprocess`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overidden by `rescale_factor` in `preprocess`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
keep_aspect_ratio: bool = False,
ensure_multiple_of: int = 1,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 384, "width": 384}
size = get_size_dict(size)
self.do_resize = do_resize
self.size = size
self.keep_aspect_ratio = keep_aspect_ratio
self.ensure_multiple_of = ensure_multiple_of
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
keep_aspect_ratio: bool = False,
ensure_multiple_of: int = 1,
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image
is resized to the largest possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is
set, the image is resized to a size that is a multiple of this value.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Target size of the output image.
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
ensure_multiple_of (`int`, *optional*, defaults to `1`):
The image is resized to a size that is a multiple of this value.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Defines the resampling filter to use if resizing the image. Otherwise, the image is resized to size
specified in `size`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
output_size = get_resize_output_image_size(
image,
output_size=(size["height"], size["width"]),
keep_aspect_ratio=keep_aspect_ratio,
multiple=ensure_multiple_of,
)
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
image_mean (`float` or `List[float]`):
Image mean.
image_std (`float` or `List[float]`):
Image standard deviation.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: int = None,
keep_aspect_ratio: bool = None,
ensure_multiple_of: int = None,
resample: PILImageResampling = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after reszing. If `keep_aspect_ratio` is `True`, the image is resized to the largest
possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is set, the image is
resized to a size that is a multiple of this value.
keep_aspect_ratio (`bool`, *optional*, defaults to `self.keep_aspect_ratio`):
Whether to keep the aspect ratio of the image. If False, the image will be resized to (size, size). If
True, the image will be resized to keep the aspect ratio and the size will be the maximum possible.
ensure_multiple_of (`int`, *optional*, defaults to `self.ensure_multiple_of`):
Ensure that the image size is a multiple of this value.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size)
keep_aspect_ratio = keep_aspect_ratio if keep_aspect_ratio is not None else self.keep_aspect_ratio
ensure_multiple_of = ensure_multiple_of if ensure_multiple_of is not None else self.ensure_multiple_of
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None or resample is None:
raise ValueError("Size and resample must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
if do_normalize:
images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
"""
Args:
Converts the output of [`DPTForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
outputs ([`DPTForSemanticSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
predictions will not be resized.
Returns:
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
"""
# TODO: add support for other frameworks
logits = outputs.logits
# Resize logits and compute semantic segmentation maps
if target_sizes is not None:
if len(logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
if is_torch_tensor(target_sizes):
target_sizes = target_sizes.numpy()
semantic_segmentation = []
for idx in range(len(logits)):
resized_logits = torch.nn.functional.interpolate(
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
semantic_segmentation = logits.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
...@@ -14,344 +14,10 @@ ...@@ -14,344 +14,10 @@
# limitations under the License. # limitations under the License.
"""Feature extractor class for FLAVA.""" """Feature extractor class for FLAVA."""
import math from ...utils import logging
import random from .image_processing_flava import FlavaImageProcessor
from functools import lru_cache
from typing import Any, List, Optional, Tuple, Union
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
from ...utils import TensorType, logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
FlavaFeatureExtractor = FlavaImageProcessor
# These values are taken from CLIP
FLAVA_IMAGE_MEAN = [0.48145466, 0.4578275, 0.40821073]
FLAVA_IMAGE_STD = [0.26862954, 0.26130258, 0.27577711]
FLAVA_CODEBOOK_MEAN = [0.0, 0.0, 0.0]
FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
LOGIT_LAPLACE_EPS: float = 0.1
# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
class FlavaMaskingGenerator:
def __init__(
self,
input_size: Union[int, Tuple[int, int]] = 14,
total_mask_patches: int = 75,
mask_group_max_patches: Optional[int] = None,
mask_group_min_patches: int = 16,
mask_group_min_aspect_ratio: Optional[float] = 0.3,
mask_group_max_aspect_ratio: float = None,
):
if not isinstance(input_size, tuple):
input_size = (input_size,) * 2
self.height, self.width = input_size
self.num_patches = self.height * self.width
self.total_mask_patches = total_mask_patches
self.mask_group_min_patches = mask_group_min_patches
self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches
mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio
self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
def __repr__(self):
repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
self.height,
self.width,
self.mask_group_min_patches,
self.mask_group_max_patches,
self.total_mask_patches,
self.log_aspect_ratio[0],
self.log_aspect_ratio[1],
)
return repr_str
def get_shape(self):
return self.height, self.width
def _mask(self, mask, max_mask_patches):
delta = 0
for _attempt in range(10):
target_area = random.uniform(self.mask_group_min_patches, max_mask_patches)
aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
height = int(round(math.sqrt(target_area * aspect_ratio)))
width = int(round(math.sqrt(target_area / aspect_ratio)))
if width < self.width and height < self.height:
top = random.randint(0, self.height - height)
left = random.randint(0, self.width - width)
num_masked = mask[top : top + height, left : left + width].sum()
# Overlap
if 0 < height * width - num_masked <= max_mask_patches:
for i in range(top, top + height):
for j in range(left, left + width):
if mask[i, j] == 0:
mask[i, j] = 1
delta += 1
if delta > 0:
break
return delta
def __call__(self):
mask = np.zeros(shape=self.get_shape(), dtype=int)
mask_count = 0
while mask_count < self.total_mask_patches:
max_mask_patches = self.total_mask_patches - mask_count
max_mask_patches = min(max_mask_patches, self.mask_group_max_patches)
delta = self._mask(mask, max_mask_patches)
if delta == 0:
break
else:
mask_count += delta
return mask
class FlavaFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a FLAVA feature extractor.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int`, *optional*, defaults to 224):
Resize the input to the given size. Only has an effect if `do_resize` is set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
image is padded with 0's and then center cropped.
crop_size (`int`, *optional*, defaults to 224):
Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with `image_mean` and `image_std`.
image_mean (`Tuple[float, float, float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`Tuple[float, float, float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
input_size_patches (`int`, *optional*, defaults to 14):
Number of patches in the image in height and width direction. 14x14 = 196 total patches.
total_mask_patches (`int`, *optional*, defaults to 75):
Total number of patches that should be masked.
mask_group_min_patches (`int`, *optional*, defaults to 16):
Minimum number of patches that should be masked.
mask_group_max_patches (`int`, *optional*, defaults to None):
Maximum number of patches that should be masked.
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
Minimum aspect ratio of the mask window.
mask_group_max_aspect_ratio (`float`, *optional*, defaults to None):
Maximum aspect ratio of the mask window
codebook_do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input for codebook to a certain `codebook_size`.
codebook_size (`int`, *optional*, defaults to 224):
Resize the input for codebook to the given size. Only has an effect if `codebook_do_resize` is set to
`True`.
codebook_resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input for codebook at the center. If the input size is smaller than
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped.
codebook_crop_size (`int`, *optional*, defaults to 224):
Desired output size for codebook input when applying center-cropping. Only has an effect if
`codebook_do_center_crop` is set to `True`.
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`.
codebook_image_mean (`Tuple[float, float, float]`, *optional*, defaults to `[0, 0, 0]`):
The sequence of means for each channel, to be used when normalizing images for codebook.
codebook_image_std (`Tuple[float, float, float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images for codebook.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Union[int, Tuple[int, int]] = 224,
resample: int = PILImageResampling.BICUBIC,
do_center_crop: bool = True,
crop_size: Union[int, Tuple[int, int]] = 224,
do_normalize: bool = True,
image_mean: Tuple[float, float, float] = FLAVA_IMAGE_MEAN,
image_std: Tuple[float, float, float] = FLAVA_IMAGE_STD,
# Mask related params
input_size_patches: int = 14,
total_mask_patches: int = 75,
mask_group_min_patches: int = 16,
mask_group_max_patches: Optional[int] = None,
mask_group_min_aspect_ratio: float = 0.3,
mask_group_max_aspect_ratio: Optional[float] = None,
# Codebook related params
codebook_do_resize: bool = True,
codebook_size: bool = 112,
codebook_resample: int = PILImageResampling.LANCZOS,
codebook_do_center_crop: bool = True,
codebook_crop_size: int = 112,
codebook_do_map_pixels: bool = True,
codebook_do_normalize: bool = True,
codebook_image_mean: Tuple[float, float, float] = FLAVA_CODEBOOK_MEAN,
codebook_image_std: Tuple[float, float, float] = FLAVA_CODEBOOK_STD,
**kwargs: Any,
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.input_size_patches = input_size_patches
self.total_mask_patches = total_mask_patches
self.mask_group_min_patches = mask_group_min_patches
self.mask_group_max_patches = mask_group_max_patches
self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio
self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio
self.codebook_do_resize = codebook_do_resize
self.codebook_size = codebook_size
self.codebook_resample = codebook_resample
self.codebook_do_center_crop = codebook_do_center_crop
self.codebook_crop_size = codebook_crop_size
self.codebook_do_map_pixels = codebook_do_map_pixels
self.codebook_do_normalize = codebook_do_normalize
self.codebook_image_mean = codebook_image_mean
self.codebook_image_std = codebook_image_std
@property
@lru_cache()
def masking_generator(self):
return FlavaMaskingGenerator(
input_size=self.input_size_patches,
total_mask_patches=self.total_mask_patches,
mask_group_min_patches=self.mask_group_min_patches,
mask_group_max_patches=self.mask_group_max_patches,
mask_group_min_aspect_ratio=self.mask_group_min_aspect_ratio,
mask_group_max_aspect_ratio=self.mask_group_max_aspect_ratio,
)
def map_pixels(self, x):
return (1 - 2 * LOGIT_LAPLACE_EPS) * x + LOGIT_LAPLACE_EPS
def __call__(
self,
images: Union[
Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa
],
return_image_mask: Optional[bool] = None,
return_codebook_pixels: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs: Any
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_image_mask (`bool`, *optional*, defaults to None):
If True, the processor will return `bool_masked_pos` suggesting masks for image's patch version.
return_codebook_pixels (`bool`, *optional*, defaults to None):
If True, the processor will return `codebook_pixel_values` providing image pixels to be used with the
default FLAVA codebook. Used in pretraining by Masked Image Modeling (MIM) loss.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model.
"""
# Input type checking for clearer error
if isinstance(images, (list, tuple)) and len(images) != 0:
self._ensure_format_supported(images[0])
else:
self._ensure_format_supported(images)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
images_for_codebook = images
# transformations (resizing + center cropping + normalization)
if self.do_resize and self.size is not None and self.resample is not None:
images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images]
if self.do_center_crop and self.crop_size is not None:
images = [self.center_crop(image, self.crop_size) for image in images]
if self.do_normalize:
images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
if return_codebook_pixels:
images = images_for_codebook
if self.codebook_do_resize and self.codebook_size is not None and self.codebook_resample is not None:
images = [
self.resize(image=image, size=self.codebook_size, resample=self.codebook_resample)
for image in images
]
if self.codebook_do_center_crop and self.codebook_crop_size is not None:
images = [self.center_crop(image, self.codebook_crop_size) for image in images]
if self.codebook_do_normalize:
images = [
self.normalize(image=image, mean=self.codebook_image_mean, std=self.codebook_image_std)
for image in images
]
if self.codebook_do_map_pixels:
images = [self.map_pixels(image) for image in images]
data["codebook_pixel_values"] = images
if return_image_mask:
masks = [self.masking_generator() for _ in images]
data["bool_masked_pos"] = masks
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Flava."""
import math
import random
from functools import lru_cache
from typing import Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from transformers.utils import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import center_crop, normalize, rescale, resize, to_channel_dimension_format
from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, is_batched, to_numpy_array, valid_images
from ...utils import logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
# These values are taken from CLIP
FLAVA_IMAGE_MEAN = [0.48145466, 0.4578275, 0.40821073]
FLAVA_IMAGE_STD = [0.26862954, 0.26130258, 0.27577711]
FLAVA_CODEBOOK_MEAN = [0.0, 0.0, 0.0]
FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
LOGIT_LAPLACE_EPS: float = 0.1
# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
class FlavaMaskingGenerator:
def __init__(
self,
input_size: Union[int, Tuple[int, int]] = 14,
total_mask_patches: int = 75,
mask_group_max_patches: Optional[int] = None,
mask_group_min_patches: int = 16,
mask_group_min_aspect_ratio: Optional[float] = 0.3,
mask_group_max_aspect_ratio: float = None,
):
if not isinstance(input_size, tuple):
input_size = (input_size,) * 2
self.height, self.width = input_size
self.num_patches = self.height * self.width
self.total_mask_patches = total_mask_patches
self.mask_group_min_patches = mask_group_min_patches
self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches
mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio
self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
def __repr__(self):
repr_str = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
self.height,
self.width,
self.mask_group_min_patches,
self.mask_group_max_patches,
self.total_mask_patches,
self.log_aspect_ratio[0],
self.log_aspect_ratio[1],
)
return repr_str
def get_shape(self):
return self.height, self.width
def _mask(self, mask, max_mask_patches):
delta = 0
for _attempt in range(10):
target_area = random.uniform(self.mask_group_min_patches, max_mask_patches)
aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
height = int(round(math.sqrt(target_area * aspect_ratio)))
width = int(round(math.sqrt(target_area / aspect_ratio)))
if width < self.width and height < self.height:
top = random.randint(0, self.height - height)
left = random.randint(0, self.width - width)
num_masked = mask[top : top + height, left : left + width].sum()
# Overlap
if 0 < height * width - num_masked <= max_mask_patches:
for i in range(top, top + height):
for j in range(left, left + width):
if mask[i, j] == 0:
mask[i, j] = 1
delta += 1
if delta > 0:
break
return delta
def __call__(self):
mask = np.zeros(shape=self.get_shape(), dtype=int)
mask_count = 0
while mask_count < self.total_mask_patches:
max_mask_patches = self.total_mask_patches - mask_count
max_mask_patches = min(max_mask_patches, self.mask_group_max_patches)
delta = self._mask(mask, max_mask_patches)
if delta == 0:
break
else:
mask_count += delta
return mask
class FlavaImageProcessor(BaseImageProcessor):
r"""
Constructs a Flava image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after resizing. Can be overridden by the `size` parameter in `preprocess`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in
`preprocess`.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the images. Can be overridden by the `do_center_crop` parameter in `preprocess`.
crop_size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
Size of image after the center crop `(crop_size["height"], crop_size["width"])`. Can be overridden by the
`crop_size` parameter in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in `preprocess`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in
`preprocess`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in `preprocess`.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
return_image_mask (`bool`, *optional*, defaults to `False`):
Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
input_size_patches (`int`, *optional*, defaults to 14):
Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
by the `input_size_patches` parameter in `preprocess`.
total_mask_patches (`int`, *optional*, defaults to 75):
Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
`preprocess`.
mask_group_min_patches (`int`, *optional*, defaults to 16):
Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
parameter in `preprocess`.
mask_group_max_patches (`int`, *optional*):
Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
parameter in `preprocess`.
mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
in `preprocess`.
mask_group_max_aspect_ratio (`float`, *optional*):
Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
in `preprocess`.
codebook_do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize`
parameter in `preprocess`. `codebook_size`.
codebook_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
`preprocess`.
codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
parameter in `preprocess`.
codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input for codebook at the center. If the input size is smaller than
`codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
overridden by the `codebook_do_center_crop` parameter in `preprocess`.
codebook_crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Desired output size for codebook input when applying center-cropping. Can be overridden by the
`codebook_crop_size` parameter in `preprocess`.
codebook_do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
overridden by the `codebook_do_rescale` parameter in `preprocess`.
codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
`codebook_rescale_factor` parameter in `preprocess`.
codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
`codebook_do_map_pixels` parameter in `preprocess`.
codebook_do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
be overridden by the `codebook_do_normalize` parameter in `preprocess`.
codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
by the `codebook_image_mean` parameter in `preprocess`.
codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
be overridden by the `codebook_image_std` parameter in `preprocess`.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, Iterable[float]]] = None,
image_std: Optional[Union[float, Iterable[float]]] = None,
# Mask related params
return_image_mask: bool = False,
input_size_patches: int = 14,
total_mask_patches: int = 75,
mask_group_min_patches: int = 16,
mask_group_max_patches: Optional[int] = None,
mask_group_min_aspect_ratio: float = 0.3,
mask_group_max_aspect_ratio: Optional[float] = None,
# Codebook related params
return_codebook_pixels: bool = False,
codebook_do_resize: bool = True,
codebook_size: bool = None,
codebook_resample: int = PILImageResampling.LANCZOS,
codebook_do_center_crop: bool = True,
codebook_crop_size: int = None,
codebook_do_rescale: bool = True,
codebook_rescale_factor: Union[int, float] = 1 / 255,
codebook_do_map_pixels: bool = True,
codebook_do_normalize: bool = True,
codebook_image_mean: Optional[Union[float, Iterable[float]]] = None,
codebook_image_std: Optional[Union[float, Iterable[float]]] = None,
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size)
codebook_size = codebook_size if codebook_size is not None else {"height": 112, "width": 112}
codebook_size = get_size_dict(codebook_size)
codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else {"height": 112, "width": 112}
codebook_crop_size = get_size_dict(codebook_crop_size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else FLAVA_IMAGE_MEAN
self.image_std = image_std if image_std is not None else FLAVA_IMAGE_STD
self.return_image_mask = return_image_mask
self.input_size_patches = input_size_patches
self.total_mask_patches = total_mask_patches
self.mask_group_min_patches = mask_group_min_patches
self.mask_group_max_patches = mask_group_max_patches
self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio
self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio
self.return_codebook_pixels = return_codebook_pixels
self.codebook_do_resize = codebook_do_resize
self.codebook_size = codebook_size
self.codebook_resample = codebook_resample
self.codebook_do_center_crop = codebook_do_center_crop
self.codebook_crop_size = codebook_crop_size
self.codebook_do_rescale = codebook_do_rescale
self.codebook_rescale_factor = codebook_rescale_factor
self.codebook_do_map_pixels = codebook_do_map_pixels
self.codebook_do_normalize = codebook_do_normalize
self.codebook_image_mean = codebook_image_mean
self.codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else FLAVA_CODEBOOK_MEAN
self.codebook_image_std = codebook_image_std if codebook_image_std is not None else FLAVA_CODEBOOK_STD
@lru_cache()
def masking_generator(
self,
input_size_patches,
total_mask_patches,
mask_group_min_patches,
mask_group_max_patches,
mask_group_min_aspect_ratio,
mask_group_max_aspect_ratio,
) -> FlavaMaskingGenerator:
return FlavaMaskingGenerator(
input_size=input_size_patches,
total_mask_patches=total_mask_patches,
mask_group_min_patches=mask_group_min_patches,
mask_group_max_patches=mask_group_max_patches,
mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must contain 'height' and 'width' keys. Got {size.keys()}")
return resize(
image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
)
def center_crop(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along
any edge, the image is padded with 0's and then center cropped.
Args:
image (`np.ndarray`):
Image to center crop.
size (`Dict[str, int]`):
Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
image_mean (`float` or `List[float]`):
Image mean.
image_std (`float` or `List[float]`):
Image standard deviation.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def map_pixels(self, image: np.ndarray) -> np.ndarray:
return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_map_pixels: bool = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
) -> np.ndarray:
"""Preprocesses a single image."""
if do_resize and size is None or resample is None:
raise ValueError("Size and resample must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
image = to_numpy_array(image)
if do_resize:
image = self.resize(image=image, size=size, resample=resample)
if do_center_crop:
image = self.center_crop(image=image, size=crop_size)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor)
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std)
if do_map_pixels:
image = self.map_pixels(image)
if data_format is not None:
image = to_channel_dimension_format(image, data_format)
return image
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: Optional[bool] = None,
crop_size: Optional[Dict[str, int]] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
# Mask related params
return_image_mask: Optional[bool] = None,
input_size_patches: Optional[int] = None,
total_mask_patches: Optional[int] = None,
mask_group_min_patches: Optional[int] = None,
mask_group_max_patches: Optional[int] = None,
mask_group_min_aspect_ratio: Optional[float] = None,
mask_group_max_aspect_ratio: Optional[float] = None,
# Codebook related params
return_codebook_pixels: Optional[bool] = None,
codebook_do_resize: Optional[bool] = None,
codebook_size: Optional[Dict[str, int]] = None,
codebook_resample: Optional[int] = None,
codebook_do_center_crop: Optional[bool] = None,
codebook_crop_size: Optional[Dict[str, int]] = None,
codebook_do_rescale: Optional[bool] = None,
codebook_rescale_factor: Optional[float] = None,
codebook_do_map_pixels: Optional[bool] = None,
codebook_do_normalize: Optional[bool] = None,
codebook_image_mean: Optional[Iterable[float]] = None,
codebook_image_std: Optional[Iterable[float]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
has an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
return_image_mask (`bool`, *optional*, defaults to `self.return_image_mask`):
Whether to return the image mask.
input_size_patches (`int`, *optional*, defaults to `self.input_size_patches`):
Size of the patches to extract from the image.
total_mask_patches (`int`, *optional*, defaults to `self.total_mask_patches`):
Total number of patches to extract from the image.
mask_group_min_patches (`int`, *optional*, defaults to `self.mask_group_min_patches`):
Minimum number of patches to extract from the image.
mask_group_max_patches (`int`, *optional*, defaults to `self.mask_group_max_patches`):
Maximum number of patches to extract from the image.
mask_group_min_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_min_aspect_ratio`):
Minimum aspect ratio of the patches to extract from the image.
mask_group_max_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_max_aspect_ratio`):
Maximum aspect ratio of the patches to extract from the image.
return_codebook_pixels (`bool`, *optional*, defaults to `self.return_codebook_pixels`):
Whether to return the codebook pixels.
codebook_do_resize (`bool`, *optional*, defaults to `self.codebook_do_resize`):
Whether to resize the codebook pixels.
codebook_size (`Dict[str, int]`, *optional*, defaults to `self.codebook_size`):
Size of the codebook pixels.
codebook_resample (`int`, *optional*, defaults to `self.codebook_resample`):
Resampling filter to use if resizing the codebook pixels. This can be one of the enum
`PILImageResampling`, Only has an effect if `codebook_do_resize` is set to `True`.
codebook_do_center_crop (`bool`, *optional*, defaults to `self.codebook_do_center_crop`):
Whether to center crop the codebook pixels.
codebook_crop_size (`Dict[str, int]`, *optional*, defaults to `self.codebook_crop_size`):
Size of the center crop of the codebook pixels. Only has an effect if `codebook_do_center_crop` is set
to `True`.
codebook_do_rescale (`bool`, *optional*, defaults to `self.codebook_do_rescale`):
Whether to rescale the codebook pixels values between [0 - 1].
codebook_rescale_factor (`float`, *optional*, defaults to `self.codebook_rescale_factor`):
Rescale factor to rescale the codebook pixels by if `codebook_do_rescale` is set to `True`.
codebook_do_map_pixels (`bool`, *optional*, defaults to `self.codebook_do_map_pixels`):
Whether to map the codebook pixels values.
codebook_do_normalize (`bool`, *optional*, defaults to `self.codebook_do_normalize`):
Whether to normalize the codebook pixels.
codebook_image_mean (`float` or `List[float]`, *optional*, defaults to `self.codebook_image_mean`):
Codebook pixels mean to normalize the codebook pixels by if `codebook_do_normalize` is set to `True`.
codebook_image_std (`float` or `List[float]`, *optional*, defaults to `self.codebook_image_std`):
Codebook pixels standard deviation to normalize the codebook pixels by if `codebook_do_normalize` is
set to `True`.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size)
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size)
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
return_image_mask = return_image_mask if return_image_mask is not None else self.return_image_mask
input_size_patches = input_size_patches if input_size_patches is not None else self.input_size_patches
total_mask_patches = total_mask_patches if total_mask_patches is not None else self.total_mask_patches
mask_group_min_patches = (
mask_group_min_patches if mask_group_min_patches is not None else self.mask_group_min_patches
)
mask_group_max_patches = (
mask_group_max_patches if mask_group_max_patches is not None else self.mask_group_max_patches
)
mask_group_min_aspect_ratio = (
mask_group_min_aspect_ratio
if mask_group_min_aspect_ratio is not None
else self.mask_group_min_aspect_ratio
)
mask_group_max_aspect_ratio = (
mask_group_max_aspect_ratio
if mask_group_max_aspect_ratio is not None
else self.mask_group_max_aspect_ratio
)
return_codebook_pixels = (
return_codebook_pixels if return_codebook_pixels is not None else self.return_codebook_pixels
)
codebook_do_resize = codebook_do_resize if codebook_do_resize is not None else self.codebook_do_resize
codebook_size = codebook_size if codebook_size is not None else self.codebook_size
codebook_size = get_size_dict(codebook_size)
codebook_resample = codebook_resample if codebook_resample is not None else self.codebook_resample
codebook_do_rescale = codebook_do_rescale if codebook_do_rescale is not None else self.codebook_do_rescale
codebook_rescale_factor = (
codebook_rescale_factor if codebook_rescale_factor is not None else self.codebook_rescale_factor
)
codebook_do_center_crop = (
codebook_do_center_crop if codebook_do_center_crop is not None else self.codebook_do_center_crop
)
codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else self.codebook_crop_size
codebook_crop_size = get_size_dict(codebook_crop_size)
codebook_do_map_pixels = (
codebook_do_map_pixels if codebook_do_map_pixels is not None else self.codebook_do_map_pixels
)
codebook_do_normalize = (
codebook_do_normalize if codebook_do_normalize is not None else self.codebook_do_normalize
)
codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else self.codebook_image_mean
codebook_image_std = codebook_image_std if codebook_image_std is not None else self.codebook_image_std
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
processed_images = [
self._preprocess_image(
image=img,
do_resize=do_resize,
size=size,
resample=resample,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_map_pixels=False,
data_format=data_format,
)
for img in images
]
data = {"pixel_values": processed_images}
if return_codebook_pixels:
codebook_images = [
self._preprocess_image(
image=img,
do_resize=codebook_do_resize,
size=codebook_size,
resample=codebook_resample,
do_center_crop=codebook_do_center_crop,
crop_size=codebook_crop_size,
do_rescale=codebook_do_rescale,
rescale_factor=codebook_rescale_factor,
do_normalize=codebook_do_normalize,
image_mean=codebook_image_mean,
image_std=codebook_image_std,
do_map_pixels=codebook_do_map_pixels,
data_format=data_format,
)
for img in images
]
data["codebook_pixel_values"] = codebook_images
if return_image_mask:
mask_generator = self.masking_generator(
input_size_patches=input_size_patches,
total_mask_patches=total_mask_patches,
mask_group_min_patches=mask_group_min_patches,
mask_group_max_patches=mask_group_max_patches,
mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
)
masks = [mask_generator() for _ in images]
data["bool_masked_pos"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
...@@ -37,16 +37,16 @@ class GLPNImageProcessor(BaseImageProcessor): ...@@ -37,16 +37,16 @@ class GLPNImageProcessor(BaseImageProcessor):
Args: Args:
do_resize (`bool`, *optional*, defaults to `True`): do_resize (`bool`, *optional*, defaults to `True`):
Set the class default for the `do_resize` parameter. Controls whether to resize the image's (height, width) Whether to resize the image's (height, width) dimensions, rounding them down to the closest multiple of
dimensions, rounding them down to the closest multiple of `size_divisor`. `size_divisor`. Can be overridden by `do_resize` in `preprocess`.
size_divisor (`int`, *optional*, defaults to 32): size_divisor (`int`, *optional*, defaults to 32):
Set the class default for the `size_divisor` parameter. When `do_resize` is `True`, images are resized so When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest
their height and width are rounded down to the closest multiple of `size_divisor`. multiple of `size_divisor`. Can be overridden by `size_divisor` in `preprocess`.
resample (`PIL.Image` resampling filter, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`): resample (`PIL.Image` resampling filter, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
Set the class default for `resample`. Defines the resampling filter to use if resizing the image. Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`): do_rescale (`bool`, *optional*, defaults to `True`):
Set the class default for the `do_rescale` parameter. Controls whether or not to apply the scaling factor Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Can be
(to make pixel values floats between 0. and 1.). overridden by `do_rescale` in `preprocess`.
""" """
model_input_names = ["pixel_values"] model_input_names = ["pixel_values"]
...@@ -81,7 +81,7 @@ class GLPNImageProcessor(BaseImageProcessor): ...@@ -81,7 +81,7 @@ class GLPNImageProcessor(BaseImageProcessor):
`size_divisor`. `size_divisor`.
resample: resample:
`PIL.Image` resampling filter to use when resizing the image e.g. `PIL.Image.Resampling.BILINEAR`. `PIL.Image` resampling filter to use when resizing the image e.g. `PIL.Image.Resampling.BILINEAR`.
data_format (`ChannelDimension`, *optional*): data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If `None`, the channel dimension format of the input The channel dimension format for the output image. If `None`, the channel dimension format of the input
image is used. Can be one of: image is used. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
...@@ -108,7 +108,7 @@ class GLPNImageProcessor(BaseImageProcessor): ...@@ -108,7 +108,7 @@ class GLPNImageProcessor(BaseImageProcessor):
The image to rescale. The image to rescale.
scale (`float`): scale (`float`):
The scaling factor to rescale pixel values by. The scaling factor to rescale pixel values by.
data_format (`ChannelDimension`, *optional*): data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the output image. If `None`, the channel dimension format of the input The channel dimension format for the output image. If `None`, the channel dimension format of the input
image is used. Can be one of: image is used. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
...@@ -146,14 +146,14 @@ class GLPNImageProcessor(BaseImageProcessor): ...@@ -146,14 +146,14 @@ class GLPNImageProcessor(BaseImageProcessor):
has an effect if `do_resize` is set to `True`. has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
return_tensors (`str`, *optional*): return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of: The type of tensors to return. Can be one of:
- `None`: Return a list of `np.ndarray`. - `None`: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of: The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
......
...@@ -14,168 +14,11 @@ ...@@ -14,168 +14,11 @@
# limitations under the License. # limitations under the License.
"""Feature extractor class for ImageGPT.""" """Feature extractor class for ImageGPT."""
from typing import List, Optional, Union from ...utils import logging
from .image_processing_imagegpt import ImageGPTImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
from ...utils import TensorType, logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
def squared_euclidean_distance(a, b): ImageGPTFeatureExtractor = ImageGPTImageProcessor
b = b.T
a2 = np.sum(np.square(a), axis=1)
b2 = np.sum(np.square(b), axis=0)
ab = np.matmul(a, b)
d = a2[:, None] - 2 * ab + b2[None, :]
return d
def color_quantize(x, clusters):
x = x.reshape(-1, 3)
d = squared_euclidean_distance(x, clusters)
return np.argmin(d, axis=1)
class ImageGPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs an ImageGPT feature extractor. This feature extractor can be used to resize images to a smaller
resolution (such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel
values" (color clusters).
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
clusters (`np.ndarray`):
The color clusters to use, as a `np.ndarray` of shape `(n_clusters, 3)`.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 32):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is
set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input to the range between -1 and +1.
"""
model_input_names = ["input_ids"]
def __init__(
self, clusters, do_resize=True, size=32, resample=PILImageResampling.BILINEAR, do_normalize=True, **kwargs
):
super().__init__(**kwargs)
self.clusters = np.asarray(clusters)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_normalize = do_normalize
def normalize(self, image):
"""
Normalizes `image` into the range -1 to +1.
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to normalize.
Returns:
`np.ndarray`: The normalized image.
"""
image = self.to_numpy_array(image, rescale=False, channel_first=False)
return image / 127.5 - 1
def __call__(
self,
images: Union[
Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa
],
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- Input IDs to be fed to a model, of shape `(batch_size, height * width)`.
"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# transformations (resizing + normalization)
if self.do_resize and self.size is not None:
images = [self.resize(image, size=self.size, resample=self.resample) for image in images]
if self.do_normalize:
images = [self.normalize(image) for image in images]
# color quantize from (batch_size, height, width, 3) to (batch_size, height, width)
images = np.array(images)
images = color_quantize(images, self.clusters).reshape(images.shape[:-1])
# flatten to (batch_size, height*width)
batch_size = images.shape[0]
images = images.reshape(batch_size, -1)
# return as BatchFeature
data = {"input_ids": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for ImageGPT."""
from typing import Dict, List, Optional, Union
import numpy as np
from transformers.utils import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import rescale, resize, to_channel_dimension_format
from ...image_utils import ChannelDimension, ImageInput, PILImageResampling, is_batched, to_numpy_array, valid_images
from ...utils import logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
def squared_euclidean_distance(a, b):
b = b.T
a2 = np.sum(np.square(a), axis=1)
b2 = np.sum(np.square(b), axis=0)
ab = np.matmul(a, b)
d = a2[:, None] - 2 * ab + b2[None, :]
return d
def color_quantize(x, clusters):
x = x.reshape(-1, 3)
d = squared_euclidean_distance(x, clusters)
return np.argmin(d, axis=1)
class ImageGPTImageProcessor(BaseImageProcessor):
r"""
Constructs a ImageGPT image processor. This image processor can be used to resize images to a smaller resolution
(such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel values"
(color clusters).
Args:
clusters (`np.ndarray`, *optional*):
The color clusters to use, as a `np.ndarray` of shape `(n_clusters, 3)` when color quantizing. Can be
overriden by `clusters` in `preprocess`.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by
`do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`):
Size of the image after resizing. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image pixel value to between [-1, 1]. Can be overridden by `do_normalize` in
`preprocess`.
do_color_quantize (`bool`, *optional*, defaults to `True`):
Whether to color quantize the image. Can be overridden by `do_color_quantize` in `preprocess`.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
# clusters is a first argument to maintain backwards compatibility with the old ImageGPTFeatureExtractor
clusters: Optional[np.ndarray] = None,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_normalize: bool = True,
do_color_quantize: bool = True,
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 256, "width": 256}
size = get_size_dict(size)
self.clusters = clusters
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_normalize = do_normalize
self.do_color_quantize = do_color_quantize
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image to (size["height"], size["width"]).
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"Size dictionary must contain both height and width keys. Got {size.keys()}")
return resize(
image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
)
def normalize(
self,
image: np.ndarray,
data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Normalizes an images' pixel values to between [-1, 1].
Args:
image (`np.ndarray`):
Image to normalize.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
image = rescale(image=image, scale=1 / 127.5, data_format=data_format)
image = image - 1
return image
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_normalize: bool = None,
do_color_quantize: Optional[bool] = None,
clusters: Optional[Union[int, List[int]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
has an effect if `do_resize` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image
do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`):
Whether to color quantize the image.
clusters (`np.ndarray`, *optional*, defaults to `self.clusters`):
Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if
`do_color_quantize` is set to `True`.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Only has an effect if `do_color_quantize` is set to `False`.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size)
resample = resample if resample is not None else self.resample
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize
clusters = clusters if clusters is not None else self.clusters
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None or resample is None:
raise ValueError("Size and resample must be specified if do_resize is True.")
if do_color_quantize and clusters is None:
raise ValueError("Clusters must be specified if do_color_quantize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
if do_normalize:
images = [self.normalize(image=image) for image in images]
if do_color_quantize:
images = [to_channel_dimension_format(image, ChannelDimension.LAST) for image in images]
# color quantize from (batch_size, height, width, 3) to (batch_size, height, width)
images = np.array(images)
clusters = np.array(clusters)
images = color_quantize(images, clusters).reshape(images.shape[:-1])
# flatten to (batch_size, height*width)
batch_size = images.shape[0]
images = images.reshape(batch_size, -1)
# We need to convert back to a list of images to keep consistent behaviour across processors.
images = list(images)
else:
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"input_ids": images}
return BatchFeature(data=data, tensor_type=return_tensors)
...@@ -992,11 +992,12 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel): ...@@ -992,11 +992,12 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
... ) ... )
>>> clusters = feature_extractor.clusters >>> clusters = feature_extractor.clusters
>>> n_px = feature_extractor.size >>> height = feature_extractor.size["height"]
>>> width = feature_extractor.size["width"]
>>> samples = output[:, 1:].cpu().detach().numpy() >>> samples = output[:, 1:].cpu().detach().numpy()
>>> samples_img = [ >>> samples_img = [
... np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [n_px, n_px, 3]).astype(np.uint8) for s in samples ... np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
... ] # convert color cluster tokens back to pixels ... ] # convert color cluster tokens back to pixels
>>> f, axes = plt.subplots(1, batch_size, dpi=300) >>> f, axes = plt.subplots(1, batch_size, dpi=300)
......
...@@ -16,226 +16,10 @@ ...@@ -16,226 +16,10 @@
Feature extractor class for LayoutLMv2. Feature extractor class for LayoutLMv2.
""" """
from typing import List, Optional, Union from ...utils import logging
from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
from ...utils import TensorType, is_pytesseract_available, logging, requires_backends
# soft dependency
if is_pytesseract_available():
import pytesseract
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
ImageInput = Union[ LayoutLMv2FeatureExtractor = LayoutLMv2ImageProcessor
Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa
]
def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
# apply OCR
data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
# turn coordinates into (left, top, left+width, top+height) format
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
image_width, image_height = image.size
# finally, normalize the bounding boxes
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
return words, normalized_boxes
class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a LayoutLMv2 feature extractor. This can be used to resize document images to the same size, as well as
to apply OCR on them in order to get a list of words and normalized bounding boxes.
This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
of the main methods. Users should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is
set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'.
<Tip>
LayoutLMv2FeatureExtractor uses Google's Tesseract OCR engine under the hood.
</Tip>"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize=True,
size=224,
resample=PILImageResampling.BILINEAR,
apply_ocr=True,
ocr_lang=None,
tesseract_config="",
**kwargs
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **words** -- Optional words as identified by Tesseract OCR (only when [`LayoutLMv2FeatureExtractor`] was
initialized with `apply_ocr` set to `True`).
- **boxes** -- Optional bounding boxes as identified by Tesseract OCR, normalized based on the image size
(only when [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`).
Examples:
```python
>>> from transformers import LayoutLMv2FeatureExtractor
>>> from PIL import Image
>>> # Document can be a png, jpg, etc. PDFs must be converted to images.
>>> image = Image.open(name_of_your_document).convert("RGB")
>>> # option 1: with apply_ocr=True (default)
>>> feature_extractor = LayoutLMv2FeatureExtractor()
>>> encoding = feature_extractor(image, return_tensors="pt")
>>> print(encoding.keys())
>>> # dict_keys(['pixel_values', 'words', 'boxes'])
>>> # option 2: with apply_ocr=False
>>> feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
>>> encoding = feature_extractor(image, return_tensors="pt")
>>> print(encoding.keys())
>>> # dict_keys(['pixel_values'])
```"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples), "
f"but is of type {type(images)}."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# Tesseract OCR to get words + normalized bounding boxes
if self.apply_ocr:
requires_backends(self, "pytesseract")
words_batch = []
boxes_batch = []
for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
words_batch.append(words)
boxes_batch.append(boxes)
# transformations (resizing)
if self.do_resize and self.size is not None:
images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images]
images = [self.to_numpy_array(image, rescale=False) for image in images]
# flip color channels from RGB to BGR (as Detectron2 requires this)
images = [image[::-1, :, :] for image in images]
# return as BatchFeature
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if self.apply_ocr:
encoded_inputs["words"] = words_batch
encoded_inputs["boxes"] = boxes_batch
return encoded_inputs
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for LayoutLMv2."""
from typing import Dict, Optional, Union
import numpy as np
from transformers.utils import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import resize, to_channel_dimension_format, to_pil_image
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_batched,
to_numpy_array,
valid_images,
)
from ...utils import is_pytesseract_available, logging, requires_backends
if is_vision_available():
import PIL
# soft dependency
if is_pytesseract_available():
import pytesseract
logger = logging.get_logger(__name__)
def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
def apply_tesseract(image: np.ndarray, lang: Optional[str], tesseract_config: Optional[str] = None):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
tesseract_config = tesseract_config if tesseract_config is not None else ""
# apply OCR
pil_image = to_pil_image(image)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
# turn coordinates into (left, top, left+width, top+height) format
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
# finally, normalize the bounding boxes
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
return words, normalized_boxes
def flip_channel_order(image: np.ndarray, data_format: Optional[ChannelDimension] = None) -> np.ndarray:
input_data_format = infer_channel_dimension_format(image)
if input_data_format == ChannelDimension.LAST:
image = image[..., ::-1]
elif input_data_format == ChannelDimension.FIRST:
image = image[:, ::-1, ...]
else:
raise ValueError(f"Unsupported channel dimension: {input_data_format}")
if data_format is not None:
image = to_channel_dimension_format(image, data_format)
return image
class LayoutLMv2ImageProcessor(BaseImageProcessor):
r"""
Constructs a LayoutLMv2 image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
overridden by `do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after resizing. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
`apply_ocr` in `preprocess`.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by `ocr_lang` in `preprocess`.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by `tesseract_config` in `preprocess`.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
apply_ocr: bool = True,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = "",
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
apply_ocr: bool = None,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Desired size of the output image after resizing.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image` resampling
filter. Only has an effect if `do_resize` is set to `True`.
apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size)
resample = resample if resample is not None else self.resample
apply_ocr = apply_ocr if apply_ocr is not None else self.apply_ocr
ocr_lang = ocr_lang if ocr_lang is not None else self.ocr_lang
tesseract_config = tesseract_config if tesseract_config is not None else self.tesseract_config
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if apply_ocr:
requires_backends(self, "pytesseract")
words_batch = []
boxes_batch = []
for image in images:
words, boxes = apply_tesseract(image, ocr_lang, tesseract_config)
words_batch.append(words)
boxes_batch.append(boxes)
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
# flip color channels from RGB to BGR (as Detectron2 requires this)
images = [flip_channel_order(image) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if apply_ocr:
data["words"] = words_batch
data["boxes"] = boxes_batch
return data
...@@ -16,235 +16,11 @@ ...@@ -16,235 +16,11 @@
Feature extractor class for LayoutLMv3. Feature extractor class for LayoutLMv3.
""" """
from typing import List, Optional, Union from ...utils import logging
from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageFeatureExtractionMixin, is_torch_tensor
from ...utils import TensorType, is_pytesseract_available, logging, requires_backends
# soft dependency
if is_pytesseract_available():
import pytesseract
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
ImageInput = Union[
Image.Image, np.ndarray, "torch.Tensor", List[Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa
]
def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
# apply OCR
data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
# turn coordinates into (left, top, left+width, top+height) format
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
image_width, image_height = image.size
# finally, normalize the bounding boxes
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
return words, normalized_boxes
class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
r"""
Constructs a LayoutLMv3 feature extractor. This can be used to resize + normalize document images, as well as to
apply OCR on them in order to get a list of words and normalized bounding boxes.
This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
of the main methods. Users should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is
set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'.
<Tip>
LayoutLMv3FeatureExtractor uses Google's Tesseract OCR engine under the hood.
</Tip>"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize=True,
size=224,
resample=PILImageResampling.BILINEAR,
do_normalize=True,
image_mean=None,
image_std=None,
apply_ocr=True,
ocr_lang=None,
tesseract_config="",
**kwargs
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **words** -- Optional words as identified by Tesseract OCR (only when [`LayoutLMv3FeatureExtractor`] was
initialized with `apply_ocr` set to `True`).
- **boxes** -- Optional bounding boxes as identified by Tesseract OCR, normalized based on the image size
(only when [`LayoutLMv3FeatureExtractor`] was initialized with `apply_ocr` set to `True`).
Examples:
```python
>>> from transformers import LayoutLMv3FeatureExtractor
>>> from PIL import Image
>>> # Document can be a png, jpg, etc. PDFs must be converted to images.
>>> image = Image.open(name_of_your_document).convert("RGB")
>>> # option 1: with apply_ocr=True (default)
>>> feature_extractor = LayoutLMv3FeatureExtractor()
>>> encoding = feature_extractor(image, return_tensors="pt")
>>> print(encoding.keys())
>>> # dict_keys(['pixel_values', 'words', 'boxes'])
>>> # option 2: with apply_ocr=False
>>> feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
>>> encoding = feature_extractor(image, return_tensors="pt")
>>> print(encoding.keys())
>>> # dict_keys(['pixel_values'])
```"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples), "
f"but is of type {type(images)}."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# Tesseract OCR to get words + normalized bounding boxes
if self.apply_ocr:
requires_backends(self, "pytesseract")
words_batch = []
boxes_batch = []
for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
words_batch.append(words)
boxes_batch.append(boxes)
# transformations (resizing + normalization)
if self.do_resize and self.size is not None:
images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images]
if self.do_normalize:
images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if self.apply_ocr:
encoded_inputs["words"] = words_batch
encoded_inputs["boxes"] = boxes_batch
return encoded_inputs LayoutLMv3FeatureExtractor = LayoutLMv3ImageProcessor
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for LayoutLMv3."""
from typing import Dict, Iterable, Optional, Union
import numpy as np
from transformers.utils import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import normalize, rescale, resize, to_channel_dimension_format, to_pil_image
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_batched,
to_numpy_array,
valid_images,
)
from ...utils import is_pytesseract_available, logging, requires_backends
if is_vision_available():
import PIL
# soft dependency
if is_pytesseract_available():
import pytesseract
logger = logging.get_logger(__name__)
def normalize_box(box, width, height):
return [
int(1000 * (box[0] / width)),
int(1000 * (box[1] / height)),
int(1000 * (box[2] / width)),
int(1000 * (box[3] / height)),
]
def apply_tesseract(image: np.ndarray, lang: Optional[str], tesseract_config: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
# apply OCR
pil_image = to_pil_image(image)
image_width, image_height = pil_image.size
data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
# filter empty words and corresponding coordinates
irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
# turn coordinates into (left, top, left+width, top+height) format
actual_boxes = []
for x, y, w, h in zip(left, top, width, height):
actual_box = [x, y, x + w, y + h]
actual_boxes.append(actual_box)
# finally, normalize the bounding boxes
normalized_boxes = []
for box in actual_boxes:
normalized_boxes.append(normalize_box(box, image_width, image_height))
assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
return words, normalized_boxes
def flip_channel_order(image: np.ndarray, data_format: Optional[ChannelDimension] = None) -> np.ndarray:
input_data_format = infer_channel_dimension_format(image)
if input_data_format == ChannelDimension.LAST:
image = image[..., ::-1]
elif input_data_format == ChannelDimension.FIRST:
image = image[:, ::-1, ...]
else:
raise ValueError(f"Unsupported channel dimension: {input_data_format}")
if data_format is not None:
image = to_channel_dimension_format(image, data_format)
return image
class LayoutLMv3ImageProcessor(BaseImageProcessor):
r"""
Constructs a LayoutLMv3 image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
overridden by `do_resize` in `preprocess`.
size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after resizing. Can be overridden by `size` in `preprocess`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image's pixel values by the specified `rescale_value`. Can be overridden by
`do_rescale` in `preprocess`.
rescale_factor (`float`, *optional*, defaults to 1 / 255):
Value by which the image's pixel values are rescaled. Can be overridden by `rescale_factor` in
`preprocess`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
the `apply_ocr` parameter in the `preprocess` method.
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
`preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_value: float = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, Iterable[float]] = None,
image_std: Union[float, Iterable[float]] = None,
apply_ocr: bool = True,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = "",
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_value
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image to (size["height"], size["width"]) dimensions.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def normalize(
self,
image: np.ndarray,
mean: Union[float, Iterable[float]],
std: Union[float, Iterable[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Normalize an image.
Args:
image (`np.ndarray`):
Image to normalize.
mean (`float` or `Iterable[float]`):
Mean values to be used for normalization.
std (`float` or `Iterable[float]`):
Standard deviation values to be used for normalization.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample=None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Union[float, Iterable[float]] = None,
image_std: Union[float, Iterable[float]] = None,
apply_ocr: bool = None,
ocr_lang: Optional[str] = None,
tesseract_config: Optional[str] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Desired size of the output image after applying `resize`.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` filters.
Only has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image pixel values between [0, 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to apply to the image pixel values. Only has an effect if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `Iterable[float]`, *optional*, defaults to `self.image_mean`):
Mean values to be used for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `Iterable[float]`, *optional*, defaults to `self.image_std`):
Standard deviation values to be used for normalization. Only has an effect if `do_normalize` is set to
`True`.
apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size)
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
apply_ocr = apply_ocr if apply_ocr is not None else self.apply_ocr
ocr_lang = ocr_lang if ocr_lang is not None else self.ocr_lang
tesseract_config = tesseract_config if tesseract_config is not None else self.tesseract_config
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("If do_normalize is True, image_mean and image_std must be specified.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
# Tesseract OCR to get words + normalized bounding boxes
if apply_ocr:
requires_backends(self, "pytesseract")
words_batch = []
boxes_batch = []
for image in images:
words, boxes = apply_tesseract(image, ocr_lang, tesseract_config)
words_batch.append(words)
boxes_batch.append(boxes)
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
if do_normalize:
images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
# flip color channels from RGB to BGR (as Detectron2 requires this)
images = [flip_channel_order(image) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if apply_ocr:
data["words"] = words_batch
data["boxes"] = boxes_batch
return data
...@@ -14,148 +14,12 @@ ...@@ -14,148 +14,12 @@
# limitations under the License. # limitations under the License.
"""Feature extractor class for LeViT.""" """Feature extractor class for LeViT."""
from typing import Optional, Union from ...utils import logging
from .image_processing_levit import LevitImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ImageFeatureExtractionMixin,
ImageInput,
is_torch_tensor,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class LevitFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): # Feature extractor for Levit is being replaced by image processor
r""" LevitFeatureExtractor = LevitImageProcessor
Constructs a LeViT feature extractor.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the shortest edge of the input to int(256/224 *`size`).
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then shorter side of input will be resized to 'size'.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether or not to center crop the input to `size`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
image_mean (`List[int]`, defaults to `[0.229, 0.224, 0.225]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`List[int]`, defaults to `[0.485, 0.456, 0.406]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize=True,
size=224,
resample=PILImageResampling.BICUBIC,
do_center_crop=True,
do_normalize=True,
image_mean=IMAGENET_DEFAULT_MEAN,
image_std=IMAGENET_DEFAULT_STD,
**kwargs
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# transformations (resizing + center cropping + normalization)
if self.do_resize and self.size is not None:
size_ = int((256 / 224) * self.size)
images = [
self.resize(image=image, size=size_, resample=self.resample, default_to_square=False)
for image in images
]
if self.do_center_crop:
images = [self.center_crop(image=image, size=self.size) for image in images]
if self.do_normalize:
images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for LeViT."""
from typing import Dict, Iterable, List, Optional, Union
import numpy as np
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
center_crop,
get_resize_output_image_size,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
is_batched,
to_numpy_array,
valid_images,
)
from ...utils import logging
logger = logging.get_logger(__name__)
class LevitImageProcessor(BaseImageProcessor):
r"""
Constructs a LeViT image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Wwhether to resize the shortest edge of the input to int(256/224 *`size`). Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
Size of the output image after resizing. If size is a dict with keys "width" and "height", the image will
be resized to `(size["height"], size["width"])`. If size is a dict with key "shortest_edge", the shortest
edge value `c` is rescaled to `int(c * (256/224))`. The smaller edge of the image will be matched to this
value i.e, if height > width, then image will be rescaled to `(size["shortest_egde"] * height / width,
size["shortest_egde"])`. Can be overridden by the `size` parameter in the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether or not to center crop the input to `(crop_size["height"], crop_size["width"])`. Can be overridden
by the `do_center_crop` parameter in the `preprocess` method.
crop_size (`Dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
Desired image size after `center_crop`. Can be overridden by the `crop_size` parameter in the `preprocess`
method.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`List[int]`, defaults to `[0.229, 0.224, 0.225]`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`List[int]`, defaults to `[0.485, 0.456, 0.406]`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_MEAN,
image_std: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_STD,
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image.
If size is a dict with keys "width" and "height", the image will be resized to `(size["height"],
size["width"])`.
If size is a dict with key "shortest_edge", the shortest edge value `c` is rescaled to `int(c * (256/224))`.
The smaller edge of the image will be matched to this value i.e, if height > width, then image will be rescaled
to `(size["shortest_egde"] * height / width, size["shortest_egde"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image after resizing. If size is a dict with keys "width" and "height", the image
will be resized to (height, width). If size is a dict with key "shortest_edge", the shortest edge value
`c` is rescaled to int(`c` * (256/224)). The smaller edge of the image will be matched to this value
i.e, if height > width, then image will be rescaled to (size * height / width, size).
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size_dict = get_size_dict(size, default_to_square=False)
# size_dict is a dict with either keys "height" and "width" or "shortest_edge"
if "shortest_edge" in size:
shortest_edge = int((256 / 224) * size["shortest_edge"])
output_size = get_resize_output_image_size(image, size=shortest_edge, default_to_square=False)
size_dict = {"height": output_size[0], "width": output_size[1]}
if "height" not in size_dict or "width" not in size_dict:
raise ValueError(
f"Size dict must have keys 'height' and 'width' or 'shortest_edge'. Got {size_dict.keys()}"
)
return resize(
image, size=(size_dict["height"], size_dict["width"]), resample=resample, data_format=data_format, **kwargs
)
def center_crop(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Center crop an image.
Args:
image (`np.ndarray`):
Image to center crop.
size (`Dict[str, int]`):
Dict `{"height": int, "width": int}` specifying the size of the output image after cropping.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
mean (`float` or `List[float]`):
Image mean.
std (`float` or `List[float]`):
Image standard deviation.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def preprocess(
self,
images: ImageInput,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = None,
do_center_crop: Optional[bool] = None,
crop_size: Optional[Dict[str, int]] = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, Iterable[float]]] = None,
image_std: Optional[Union[float, Iterable[float]]] = None,
return_tensors: Optional[TensorType] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> BatchFeature:
"""
Preprocess an image or batch of images to be used as input to a LeViT model.
Args:
images (`ImageInput`):
Image or batch of images to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the output image after resizing. If size is a dict with keys "width" and "height", the image
will be resized to (height, width). If size is a dict with key "shortest_edge", the shortest edge value
`c` is rescaled to int(`c` * (256/224)). The smaller edge of the image will be matched to this value
i.e, if height > width, then image will be rescaled to (size * height / width, size).
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resiizing the image.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the output image after center cropping. Crops images to (crop_size["height"],
crop_size["width"]).
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image pixel values by `rescaling_factor` - typical to values between 0 and 1.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Factor to rescale the image pixel values by.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image pixel values by `image_mean` and `image_std`.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Mean to normalize the image pixel values by.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Standard deviation to normalize the image pixel values by.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`str` or `ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size)
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
if do_center_crop and crop_size is None:
raise ValueError("Crop size must be specified if do_center_crop is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_resize:
images = [self.resize(image, size, resample) for image in images]
if do_center_crop:
images = [self.center_crop(image, crop_size) for image in images]
if do_rescale:
images = [self.rescale(image, rescale_factor) for image in images]
if do_normalize:
images = [self.normalize(image, image_mean, image_std) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
...@@ -14,189 +14,11 @@ ...@@ -14,189 +14,11 @@
# limitations under the License. # limitations under the License.
"""Feature extractor class for MobileViT.""" """Feature extractor class for MobileViT."""
from typing import List, Optional, Tuple, Union from ...utils import logging
from .image_processing_mobilevit import MobileViTImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import ImageFeatureExtractionMixin, ImageInput, is_torch_tensor
from ...utils import TensorType, is_torch_available, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class MobileViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): MobileViTFeatureExtractor = MobileViTImageProcessor
r"""
Constructs a MobileViT feature extractor.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 288):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to match the shorter side. Only has an effect if
`do_resize` is set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
image is padded with 0's and then center cropped.
crop_size (`int`, *optional*, defaults to 256):
Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
do_flip_channel_order (`bool`, *optional*, defaults to `True`):
Whether to flip the color channels from RGB to BGR.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize=True,
size=288,
resample=PILImageResampling.BILINEAR,
do_center_crop=True,
crop_size=256,
do_flip_channel_order=True,
**kwargs
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_flip_channel_order = do_flip_channel_order
def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# transformations (resizing + normalization)
if self.do_resize and self.size is not None:
images = [
self.resize(image=image, size=self.size, resample=self.resample, default_to_square=False)
for image in images
]
if self.do_center_crop and self.crop_size is not None:
images = [self.center_crop(image, self.crop_size) for image in images]
images = [self.to_numpy_array(image) for image in images]
# the pretrained checkpoints assume images are BGR, not RGB
if self.do_flip_channel_order:
images = [self.flip_channel_order(image) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
"""
Converts the output of [`MobileViTForSemanticSegmentation`] into semantic segmentation maps. Only supports
PyTorch.
Args:
outputs ([`MobileViTForSemanticSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple]`, *optional*):
A list of length `batch_size`, where each item is a `Tuple[int, int]` corresponding to the requested
final size (height, width) of each prediction. If left to None, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
`torch.Tensor` correspond to a semantic class id.
"""
logits = outputs.logits
# Resize logits and compute semantic segmentation maps
if target_sizes is not None:
if len(logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
if is_torch_tensor(target_sizes):
target_sizes = target_sizes.numpy()
semantic_segmentation = []
for idx in range(len(logits)):
resized_logits = torch.nn.functional.interpolate(
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
semantic_segmentation = logits.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for MobileViT."""
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from transformers.utils import is_torch_available, is_torch_tensor, is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import center_crop, get_resize_output_image_size, rescale, resize, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_batched,
to_numpy_array,
valid_images,
)
from ...utils import logging
if is_vision_available():
import PIL
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
def flip_channel_order(image: np.ndarray, data_format: Optional[ChannelDimension]) -> np.ndarray:
"""
Flip the color channels from RGB to BGR or vice versa.
Args:
image (`np.ndarray`):
The image, represented as a numpy array.
data_format (`ChannelDimension`, *`optional`*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
Returns:
`np.ndarray`: The image with the flipped color channels.
"""
input_data_format = infer_channel_dimension_format(image)
if input_data_format == ChannelDimension.LAST:
image = image[..., ::-1]
elif input_data_format == ChannelDimension.FIRST:
image = image[:, ::-1, ...]
else:
raise ValueError(f"Invalid input channel dimension format: {input_data_format}")
if data_format is not None:
image = to_channel_dimension_format(image, data_format)
return image
class MobileViTImageProcessor(BaseImageProcessor):
r"""
Constructs a MobileViT image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
Controls the size of the output image after resizing. Can be overridden by the `size` parameter in the
`preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Defines the resampling filter to use if resizing the image. Can be overridden by the `resample` parameter
in the `preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
image is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in
the `preprocess` method.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
Desired output size `(size["height"], size["width"])` when applying center-cropping. Can be overridden by
the `crop_size` parameter in the `preprocess` method.
do_flip_channel_order (`bool`, *optional*, defaults to `True`):
Whether to flip the color channels from RGB to BGR. Can be overridden by the `do_flip_channel_order`
parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_flip_channel_order: bool = True,
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256}
crop_size = get_size_dict(crop_size)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_flip_channel_order = do_flip_channel_order
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PIL.Image.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Controls the size of the output image. The shortest edge of the image will be resized to
`size["shortest_edge"]` while maintaining the aspect ratio.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" not in size:
raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False)
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
def center_crop(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Center crop an image to size `(size["height], size["width"])`. If the input size is smaller than `size` along
any edge, the image is padded with 0's and then center cropped.
Args:
image (`np.ndarray`):
Image to center crop.
size (`Dict[str, int]`):
Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def flip_channel_order(
self, image: np.ndarray, data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Flip the color channels from RGB to BGR or vice versa.
Args:
image (`np.ndarray`):
The image, represented as a numpy array.
data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return flip_channel_order(image, data_format=data_format)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_flip_channel_order: bool = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image by rescale factor.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the center crop if `do_center_crop` is set to `True`.
do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`):
Whether to flip the channel order of the image.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
do_flip_channel_order = (
do_flip_channel_order if do_flip_channel_order is not None else self.do_flip_channel_order
)
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size)
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_center_crop and crop_size is None:
raise ValueError("Crop size must be specified if do_center_crop is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
if do_center_crop:
images = [self.center_crop(image=image, size=crop_size) for image in images]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
# the pretrained checkpoints assume images are BGR, not RGB
if do_flip_channel_order:
images = [self.flip_channel_order(image=image) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
"""
Args:
Converts the output of [`MobileViTForSemanticSegmentation`] into semantic segmentation maps. Only supports
PyTorch.
outputs ([`MobileViTForSemanticSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple]`, *optional*):
A list of length `batch_size`, where each item is a `Tuple[int, int]` corresponding to the requested
final size (height, width) of each prediction. If left to None, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
`torch.Tensor` correspond to a semantic class id.
"""
# TODO: add support for other frameworks
logits = outputs.logits
# Resize logits and compute semantic segmentation maps
if target_sizes is not None:
if len(logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
if is_torch_tensor(target_sizes):
target_sizes = target_sizes.numpy()
semantic_segmentation = []
for idx in range(len(logits)):
resized_logits = torch.nn.functional.interpolate(
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
semantic_segmentation = logits.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
...@@ -14,179 +14,11 @@ ...@@ -14,179 +14,11 @@
# limitations under the License. # limitations under the License.
"""Feature extractor class for Perceiver.""" """Feature extractor class for Perceiver."""
from typing import Optional, Union from ...utils import logging
from .image_processing_perceiver import PerceiverImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ImageFeatureExtractionMixin,
ImageInput,
is_torch_tensor,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): PerceiverFeatureExtractor = PerceiverImageProcessor
r"""
Constructs a Perceiver feature extractor.
This feature extractor inherits from [`ImageFeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
image is padded with 0's and then center cropped.
crop_size (`int`, *optional*, defaults to 256):
Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is
set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with `image_mean` and `image_std`.
image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_center_crop=True,
crop_size=256,
do_resize=True,
size=224,
resample=PILImageResampling.BICUBIC,
do_normalize=True,
image_mean=None,
image_std=None,
**kwargs
):
super().__init__(**kwargs)
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
def center_crop(self, image):
"""
Crops `image` to *self.crop_size* using a center crop. Note that if the image is too small to be cropped to the
size given, it will be padded (so the returned result has the size asked).
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to resize.
"""
if isinstance(image, Image.Image):
image = self.to_numpy_array(image)
image_height, image_width = image.shape[-2:]
padded_center_crop_size = (
(self.size / (self.crop_size)) * np.minimum(image_height, image_width).astype(np.float32)
).astype(np.int32)
offset_height = ((image_height - padded_center_crop_size) + 1) // 2
offset_width = ((image_width - padded_center_crop_size) + 1) // 2
crop_window = [offset_height, offset_width, padded_center_crop_size, padded_center_crop_size]
image = image[
:, crop_window[0] : crop_window[0] + crop_window[2], crop_window[1] : crop_window[1] + crop_window[3]
]
return image
def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example),"
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# transformations (center cropping + resizing + normalization)
if self.do_center_crop and self.crop_size is not None:
images = [self.center_crop(image) for image in images]
if self.do_resize and self.size is not None and self.resample is not None:
images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images]
if self.do_normalize:
images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Perceiver."""
from typing import Dict, List, Optional, Union
import numpy as np
from transformers.utils import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import center_crop, normalize, rescale, resize, to_channel_dimension_format
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
is_batched,
to_numpy_array,
valid_images,
)
from ...utils import logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class PerceiverImageProcessor(BaseImageProcessor):
r"""
Constructs a Perceiver image processor.
Args:
do_center_crop (`bool`, `optional`, defaults to `True`):
Whether or not to center crop the image. If the input size if smaller than `crop_size` along any edge, the
image will be padded with zeros and then center cropped. Can be overridden by the `do_center_crop`
parameter in the `preprocess` method.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
Desired output size when applying center-cropping. Can be overridden by the `crop_size` parameter in the
`preprocess` method.
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image to `(size["height"], size["width"])`. Can be overridden by the `do_resize`
parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after resizing. Can be overridden by the `size` parameter in the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Defines the resampling filter to use if resizing the image. Can be overridden by the `resample` parameter
in the `preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
in the `preprocess` method.
do_normalize:
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs
) -> None:
super().__init__(**kwargs)
crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256}
crop_size = get_size_dict(crop_size)
size = size if size is not None else {"height": 224, "width": 224}
size = get_size_dict(size)
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
def center_crop(
self,
image: np.ndarray,
crop_size: Dict[str, int],
size: Optional[int] = None,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Center crop an image to `(size["height"] / crop_size["height"] * min_dim, size["width"] / crop_size["width"] *
min_dim)`. Where `min_dim = min(size["height"], size["width"])`.
If the input size is smaller than `crop_size` along any edge, the image will be padded with zeros and then
center cropped.
Args:
image (`np.ndarray`):
Image to center crop.
crop_size (`Dict[str, int]`):
Desired output size after applying the center crop.
size (`Dict[str, int]`, *optional*):
Size of the image after resizing. If not provided, the self.size attribute will be used.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = self.size if size is None else size
size = get_size_dict(size)
crop_size = get_size_dict(crop_size)
height, width = get_image_size(image)
min_dim = min(height, width)
cropped_height = (size["height"] / crop_size["height"]) * min_dim
cropped_width = (size["width"] / crop_size["width"]) * min_dim
return center_crop(image, size=(cropped_height, cropped_width), data_format=data_format, **kwargs)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PIL.Image.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
return resize(
image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
mean (`float` or `List[float]`):
Image mean.
std (`float` or `List[float]`):
Image standard deviation.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def preprocess(
self,
images: ImageInput,
do_center_crop: Optional[bool] = None,
crop_size: Optional[Dict[str, int]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample: PILImageResampling = None,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image to `crop_size`.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Desired output size after applying the center crop.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after resizing.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
has an effect if `do_resize` is set to `True`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size)
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
size = get_size_dict(size)
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_center_crop and crop_size is None:
raise ValueError("If `do_center_crop` is set to `True`, `crop_size` must be provided.")
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and image standard deviation must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_center_crop:
images = [self.center_crop(image, crop_size, size=size) for image in images]
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
if do_normalize:
images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
...@@ -14,161 +14,11 @@ ...@@ -14,161 +14,11 @@
# limitations under the License. # limitations under the License.
"""Feature extractor class for PoolFormer.""" """Feature extractor class for PoolFormer."""
import math from ...utils import logging
from typing import Optional, Union from .image_processing_poolformer import PoolFormerImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ImageFeatureExtractionMixin,
ImageInput,
is_torch_tensor,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class PoolFormerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): PoolFormerFeatureExtractor = PoolFormerImageProcessor
r"""
Constructs a PoolFormer feature extractor.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
do_resize_and_center_crop (`bool`, *optional*, defaults to `True`):
Whether to resize the shortest edge of the image and center crop the input to a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 224):
Center crop the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be center cropped to (size, size). Only has an effect if
`do_resize_and_center_crop` is set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BICUBIC`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
crop_pct (`float`, *optional*, defaults to `0.9`):
The percentage of the image to crop from the center. Only has an effect if `do_resize_and_center_crop` is
set to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with `image_mean` and `image_std`.
image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images.
image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize_and_center_crop=True,
size=224,
resample=PILImageResampling.BICUBIC,
crop_pct=0.9,
do_normalize=True,
image_mean=None,
image_std=None,
**kwargs
):
super().__init__(**kwargs)
self.do_resize_and_center_crop = do_resize_and_center_crop
self.size = size
self.resample = resample
self.crop_pct = crop_pct
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s).
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
"""
# Input type checking for clearer error
valid_images = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example), "
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
# transformations (resizing + center cropping + normalization)
if self.do_resize_and_center_crop and self.size is not None and self.crop_pct is not None:
if isinstance(self.size, (tuple, list)):
assert len(self.size) == 2
if self.size[-1] == self.size[-2]:
scale_size = int(math.floor(self.size[0] / self.crop_pct))
else:
scale_size = tuple([int(x / self.crop_pct) for x in self.size])
else:
scale_size = int(math.floor(self.size / self.crop_pct))
# resize shortest edge of the image
images = [
self.resize(image=image, size=scale_size, resample=self.resample, default_to_square=False)
for image in images
]
# center crop
images = [self.center_crop(image, size=self.size) for image in images]
if self.do_normalize:
images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for PoolFormer."""
import math
from typing import Dict, List, Optional, Union
import numpy as np
from transformers import is_vision_available
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
center_crop,
get_resize_output_image_size,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
is_batched,
to_numpy_array,
valid_images,
)
from ...utils import logging
if is_vision_available():
import PIL
logger = logging.get_logger(__name__)
class PoolFormerImageProcessor(BaseImageProcessor):
r"""
Constructs a PoolFormer image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 256}`):
Size of the image after resizing. Can be overridden by `size` in the `preprocess` method. If crop_pct is
unset:
- size is `{"height": h, "width": w}`: the image is resized to `(h, w)`.
- size is `{"shortest_edge": s}`: the shortest edge of the image is resized to s whilst maintaining the
aspect ratio.
If crop_pct is set:
- size is `{"height": h, "width": w}`: the image is resized to `(int(floor(h/crop_pct)),
int(floor(w/crop_pct)))`
- size is `{"height": c, "width": c}`: the shortest edge of the image is resized to `int(floor(c/crop_pct)`
whilst maintaining the aspect ratio.
- size is `{"shortest_edge": c}`: the shortest edge of the image is resized to `int(floor(c/crop_pct)`
whilst maintaining the aspect ratio.
crop_pct (`float`, *optional*, defaults to `0.9`):
Percentage of the image to crop from the center. Can be overridden by `crop_pct` in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in the `preprocess`
method.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 256}`):
Size of the image after applying center crop. Only has an effect if `do_center_crop` is set to `True`. Can
be overridden by the `crop_size` parameter in the `preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
crop_pct: int = 0.9,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
rescale_factor: Union[int, float] = 1 / 255,
do_rescale: bool = True,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 256}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256}
crop_size = get_size_dict(crop_size)
self.do_resize = do_resize
self.size = size
self.crop_pct = crop_pct
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
crop_pct: Optional[float] = None,
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Resize an image.
If crop_pct is unset:
- size is `{"height": h, "width": w}`: the image is resized to `(h, w)`.
- size is `{"shortest_edge": s}`: the shortest edge of the image is resized to s whilst maintaining the
aspect ratio.
if crop_pct is set:
- size is `{"height": h, "width": w}`: the image is resized to `(int(floor(h/crop_pct)),
int(floor(w/crop_pct)))`
- size is `{"height": c, "width": c}`: the shortest edge of the image is resized to `int(floor(c/crop_pct)`
whilst maintaining the aspect ratio.
- size is `{"shortest_edge": c}`: the shortest edge of the image is resized to `int(floor(c/crop_pct)`
whilst maintaining the aspect ratio.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
crop_pct (`float`, *optional*):
Percentage of the image that will be cropped from the center. If set, the image is resized
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" not in size and ("height" not in size or "width" not in size):
raise ValueError(f"size must contain 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
if crop_pct is not None:
if "shortest_edge" in size:
scale_size = int(math.floor(size["shortest_edge"] / crop_pct))
elif "height" in size and "width" in size:
if size["height"] == size["width"]:
scale_size = int(math.floor(size["height"] / crop_pct))
else:
scale_size = (
int(math.floor(size["height"] / crop_pct)),
int(math.floor(size["width"] / crop_pct)),
)
else:
raise ValueError("Invalid size for resize: {}".format(size))
output_size = get_resize_output_image_size(image, size=scale_size, default_to_square=False)
else:
if "shortest_edge" in size:
output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False)
elif "height" in size and "width" in size:
output_size = (size["height"], size["width"])
else:
raise ValueError("Invalid size for resize: {}".format(size))
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
def center_crop(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Center crop an image to (size["height"], size["width"]). If the input size is smaller than `crop_size` along
any edge, the image is padded with 0's and then center cropped.
Args:
image (`np.ndarray`):
Image to center crop.
size (`Dict[str, int]`):
Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
image_mean (`float` or `List[float]`):
Image mean.
image_std (`float` or `List[float]`):
Image standard deviation.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
crop_pct: int = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
) -> PIL.Image.Image:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after applying resize.
crop_pct (`float`, *optional*, defaults to `self.crop_pct`):
Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`.
resample (`int`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
has an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the image after applying center crop.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values between [0 - 1].
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
crop_pct = crop_pct if crop_pct is not None else self.crop_pct
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size)
if not is_batched(images):
images = [images]
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None or resample is None:
raise ValueError("Size and resample must be specified if do_resize is True.")
if do_center_crop and crop_pct is None:
raise ValueError("Crop_pct must be specified if do_center_crop is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if do_resize:
images = [self.resize(image=image, size=size, crop_pct=crop_pct, resample=resample) for image in images]
if do_center_crop:
images = [self.center_crop(image=image, size=crop_size) for image in images]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
if do_normalize:
images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
...@@ -14,248 +14,11 @@ ...@@ -14,248 +14,11 @@
# limitations under the License. # limitations under the License.
"""Feature extractor class for SegFormer.""" """Feature extractor class for SegFormer."""
from typing import List, Optional, Tuple, Union from ...utils import logging
from .image_processing_segformer import SegformerImageProcessor
import numpy as np
from PIL import Image
from transformers.image_utils import PILImageResampling
from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ImageFeatureExtractionMixin,
ImageInput,
is_torch_tensor,
)
from ...utils import TensorType, is_torch_available, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class SegformerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): SegformerFeatureExtractor = SegformerImageProcessor
r"""
Constructs a SegFormer feature extractor.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the input based on a certain `size`.
size (`int` or `Tuple(int)`, *optional*, defaults to 512):
Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is
set to `True`.
resample (`int`, *optional*, defaults to `PIL.Image.Resampling.BILINEAR`):
An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
`PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
`PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
to `True`.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the input with mean and standard deviation.
image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
ImageNet std.
reduce_labels (`bool`, *optional*, defaults to `False`):
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
background label will be replaced by 255.
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_resize=True,
size=512,
resample=PILImageResampling.BILINEAR,
do_normalize=True,
image_mean=None,
image_std=None,
reduce_labels=False,
**kwargs
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.reduce_labels = reduce_labels
def __call__(
self,
images: ImageInput,
segmentation_maps: ImageInput = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> BatchFeature:
"""
Main method to prepare for the model one or several image(s) and optional corresponding segmentation maps.
<Tip warning={true}>
NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
PIL images.
</Tip>
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is
the number of channels, H and W are image height and width.
segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
Optionally, the corresponding semantic segmentation maps with the pixel-wise annotations.
return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **labels** -- Optional labels to be fed to a model (when `segmentation_maps` are provided)
"""
# Input type checking for clearer error
valid_images = False
valid_segmentation_maps = False
# Check that images has a valid type
if isinstance(images, (Image.Image, np.ndarray)) or is_torch_tensor(images):
valid_images = True
elif isinstance(images, (list, tuple)):
if len(images) == 0 or isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]):
valid_images = True
if not valid_images:
raise ValueError(
"Images must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single example),"
"`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of examples)."
)
# Check that segmentation maps has a valid type
if segmentation_maps is not None:
if isinstance(segmentation_maps, (Image.Image, np.ndarray)) or is_torch_tensor(segmentation_maps):
valid_segmentation_maps = True
elif isinstance(segmentation_maps, (list, tuple)):
if (
len(segmentation_maps) == 0
or isinstance(segmentation_maps[0], (Image.Image, np.ndarray))
or is_torch_tensor(segmentation_maps[0])
):
valid_segmentation_maps = True
if not valid_segmentation_maps:
raise ValueError(
"Segmentation maps must of type `PIL.Image.Image`, `np.ndarray` or `torch.Tensor` (single"
" example),`List[PIL.Image.Image]`, `List[np.ndarray]` or `List[torch.Tensor]` (batch of"
" examples)."
)
is_batched = bool(
isinstance(images, (list, tuple))
and (isinstance(images[0], (Image.Image, np.ndarray)) or is_torch_tensor(images[0]))
)
if not is_batched:
images = [images]
if segmentation_maps is not None:
segmentation_maps = [segmentation_maps]
# reduce zero label if needed
if self.reduce_labels:
if segmentation_maps is not None:
for idx, map in enumerate(segmentation_maps):
if not isinstance(map, np.ndarray):
map = np.array(map)
# avoid using underflow conversion
map[map == 0] = 255
map = map - 1
map[map == 254] = 255
segmentation_maps[idx] = Image.fromarray(map.astype(np.uint8))
# transformations (resizing + normalization)
if self.do_resize and self.size is not None:
images = [self.resize(image=image, size=self.size, resample=self.resample) for image in images]
if segmentation_maps is not None:
segmentation_maps = [
self.resize(map, size=self.size, resample=Image.NEAREST) for map in segmentation_maps
]
if self.do_normalize:
images = [self.normalize(image=image, mean=self.image_mean, std=self.image_std) for image in images]
# return as BatchFeature
data = {"pixel_values": images}
if segmentation_maps is not None:
labels = []
for map in segmentation_maps:
if not isinstance(map, np.ndarray):
map = np.array(map)
labels.append(map.astype(np.int64))
# cast to np.int64
data["labels"] = labels
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
"""
Converts the output of [`SegformerForSemanticSegmentation`] into semantic segmentation maps. Only supports
PyTorch.
Args:
outputs ([`SegformerForSemanticSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
List of tuples corresponding to the requested final size (height, width) of each prediction. If left to
None, predictions will not be resized.
Returns:
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
"""
logits = outputs.logits
# Resize logits and compute semantic segmentation maps
if target_sizes is not None:
if len(logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
if is_torch_tensor(target_sizes):
target_sizes = target_sizes.numpy()
semantic_segmentation = []
for idx in range(len(logits)):
resized_logits = torch.nn.functional.interpolate(
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
semantic_segmentation = logits.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment