Unverified Commit f53fe35b authored by amyeroberts's avatar amyeroberts Committed by GitHub
Browse files

Fast image processor (#28847)



* Draft fast image processors

* Draft working fast version

* py3.8 compatible cache

* Enable loading fast image processors through auto

* Tidy up; rescale behaviour based on input type

* Enable tests for fast image processors

* Smarter rescaling

* Don't default to Fast

* Safer imports

* Add necessary Pillow requirement

* Woops

* Add AutoImageProcessor test

* Fix up

* Fix test for imagegpt

* Fix test

* Review comments

* Add warning for TF and JAX input types

* Rearrange

* Return transforms

* NumpyToTensor transformation

* Rebase - include changes from upstream in ImageProcessingMixin

* Safe typing

* Fix up

* convert mean/std to tensor to rescale

* Don't store transforms in state

* Fix up

* Update src/transformers/image_processing_utils_fast.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/auto/image_processing_auto.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Warn if fast image processor available

* Update src/transformers/models/vit/image_processing_vit_fast.py

* Transpose incoming numpy images to be in CHW format

* Update mapping names based on packages, auto set fast to None

* Fix up

* Fix

* Add AutoImageProcessor.from_pretrained(checkpoint, use_fast=True) test

* Update src/transformers/models/vit/image_processing_vit_fast.py
Co-authored-by: default avatarPavel Iakubovskii <qubvel@gmail.com>

* Add equivalence and speed tests

* Fix up

---------
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: default avatarPavel Iakubovskii <qubvel@gmail.com>
parent edc1dffd
......@@ -32,3 +32,8 @@ An image processor is in charge of preparing input features for vision models an
## BaseImageProcessor
[[autodoc]] image_processing_utils.BaseImageProcessor
## BaseImageProcessorFast
[[autodoc]] image_processing_utils_fast.BaseImageProcessorFast
......@@ -158,6 +158,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] ViTImageProcessor
- preprocess
## ViTImageProcessorFast
[[autodoc]] ViTImageProcessorFast
- preprocess
<frameworkcontent>
<pt>
......
......@@ -29,3 +29,4 @@ timm
albumentations >= 1.4.5
torchmetrics
pycocotools
Pillow>=10.0.1,<=15.0
......@@ -1104,7 +1104,8 @@ except OptionalDependencyNotAvailable:
name for name in dir(dummy_vision_objects) if not name.startswith("_")
]
else:
_import_structure["image_processing_utils"] = ["ImageProcessingMixin"]
_import_structure["image_processing_base"] = ["ImageProcessingMixin"]
_import_structure["image_processing_utils"] = ["BaseImageProcessor"]
_import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
_import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"])
_import_structure["models.bit"].extend(["BitImageProcessor"])
......@@ -1167,6 +1168,18 @@ else:
_import_structure["models.vivit"].append("VivitImageProcessor")
_import_structure["models.yolos"].extend(["YolosFeatureExtractor", "YolosImageProcessor"])
try:
if not is_torchvision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils import dummy_torchvision_objects
_import_structure["utils.dummy_torchvision_objects"] = [
name for name in dir(dummy_torchvision_objects) if not name.startswith("_")
]
else:
_import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
_import_structure["models.vit"].append("ViTImageProcessorFast")
# PyTorch-backed objects
try:
......@@ -5703,7 +5716,8 @@ if TYPE_CHECKING:
except OptionalDependencyNotAvailable:
from .utils.dummy_vision_objects import *
else:
from .image_processing_utils import ImageProcessingMixin
from .image_processing_base import ImageProcessingMixin
from .image_processing_utils import BaseImageProcessor
from .image_utils import ImageFeatureExtractionMixin
from .models.beit import BeitFeatureExtractor, BeitImageProcessor
from .models.bit import BitImageProcessor
......@@ -5793,6 +5807,15 @@ if TYPE_CHECKING:
from .models.vivit import VivitImageProcessor
from .models.yolos import YolosFeatureExtractor, YolosImageProcessor
try:
if not is_torchvision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils.dummy_torchvision_objects import *
else:
from .image_processing_utils_fast import BaseImageProcessorFast
from .models.vit import ViTImageProcessorFast
# Modeling
try:
if not is_torch_available():
......
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import os
import warnings
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import requests
from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
from .utils import (
IMAGE_PROCESSOR_NAME,
PushToHubMixin,
add_model_info_to_auto_map,
add_model_info_to_custom_pipelines,
cached_file,
copy_func,
download_url,
is_offline_mode,
is_remote_url,
is_vision_available,
logging,
)
if is_vision_available():
from PIL import Image
logger = logging.get_logger(__name__)
# TODO: Move BatchFeature to be imported by both image_processing_utils and image_processing_utils_fast
# We override the class string here, but logic is the same.
class BatchFeature(BaseBatchFeature):
    r"""
    Holds the output of the image processor specific `__call__` methods.

    This class is derived from a python dictionary and can be used as a dictionary. It adds no behaviour of its
    own: it subclasses the feature-extraction `BatchFeature` only so that the docstring can be specialised for
    image processors.

    Args:
        data (`dict`):
            Dictionary of lists/arrays/tensors returned by the `__call__` method ('pixel_values', etc.).
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers into PyTorch/TensorFlow/Numpy tensors
            at initialization.
    """
# TODO: (Amy) - factor out the common parts of this and the feature extractor
class ImageProcessingMixin(PushToHubMixin):
"""
This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
extractors.
"""
_auto_class = None
def __init__(self, **kwargs):
    """
    Store every remaining entry of `kwargs` as an instance attribute.

    Two keys are treated specially before the attributes are set:

    - `feature_extractor_type` is discarded. It was written while image processing still used
      `XXXFeatureExtractor` classes, so its value is misleading now that `XXXImageProcessor` is used.
    - `processor_class` is removed from `kwargs` and kept on the private attribute `_processor_class`.

    Raises:
        AttributeError: re-raised (after logging an error) when an attribute cannot be set.
    """
    kwargs.pop("feature_extractor_type", None)
    self._processor_class = kwargs.pop("processor_class", None)

    # Everything that is left has no default value and is copied onto the instance as-is.
    for attr_name, attr_value in kwargs.items():
        try:
            setattr(self, attr_name, attr_value)
        except AttributeError as err:
            logger.error(f"Can't set {attr_name} with value {attr_value} for {self}")
            raise err
def _set_processor_class(self, processor_class: str):
    """
    Record the name of the processor class on the private `_processor_class` attribute.

    Args:
        processor_class (`str`):
            Value stored on `self._processor_class`.
    """
    setattr(self, "_processor_class", processor_class)
@classmethod
def from_pretrained(
    cls,
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    local_files_only: bool = False,
    token: Optional[Union[str, bool]] = None,
    revision: str = "main",
    **kwargs,
):
    r"""
    Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.

    The named arguments are folded into `kwargs`, which is then handed to `get_image_processor_dict` to resolve
    the parameter dictionary; whatever that call returns as remaining kwargs is forwarded to `from_dict`.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained image_processor hosted inside a model repo on
              huggingface.co.
            - a path to a *directory* containing a image processor file saved using the
              [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
              `./my_model_directory/`.
            - a path or url to a saved image processor JSON *file*, e.g.,
              `./my_model_directory/preprocessor_config.json`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model image processor should be cached if the
            standard cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the image processor files and override the cached versions
            if they exist.
        resume_download:
            Deprecated and ignored. All downloads are now resumed by default when possible.
            Will be removed in v5 of Transformers.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or `bool`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
            the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use
            a git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.

            <Tip>

            To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

            </Tip>

        return_unused_kwargs (`bool`, *optional*, defaults to `False`):
            If `False`, then this function returns just the final image processor object. If `True`, then this
            functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
            consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
            `kwargs` which has not been used to update `image_processor` and is otherwise ignored.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you
            can specify the folder name here.
        kwargs (`Dict[str, Any]`, *optional*):
            The values in kwargs of any keys which are image processor attributes will be used to override the
            loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
            controlled by the `return_unused_kwargs` keyword parameter.

    Returns:
        A image processor of type [`~image_processing_utils.ImageProcessingMixin`].

    Raises:
        ValueError: if both `token` and the deprecated `use_auth_token` are provided.

    Examples:

    ```python
    # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
    # derived class: *CLIPImageProcessor*
    image_processor = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-base-patch32"
    )  # Download image_processing_config from huggingface.co and cache.
    image_processor = CLIPImageProcessor.from_pretrained(
        "./test/saved_model/"
    )  # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
    image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
    image_processor = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-base-patch32", do_normalize=False, foo=False
    )
    assert image_processor.do_normalize is False
    image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
        "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
    )
    assert image_processor.do_normalize is False
    assert unused_kwargs == {"foo": False}
    ```"""
    # `use_auth_token` is the deprecated spelling of `token`: warn, refuse ambiguous input, then fold it in.
    legacy_token = kwargs.pop("use_auth_token", None)
    if legacy_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = legacy_token

    # The explicit parameters travel to `get_image_processor_dict` through `kwargs`.
    kwargs.update(
        cache_dir=cache_dir,
        force_download=force_download,
        local_files_only=local_files_only,
        revision=revision,
    )
    # `token` is only forwarded when one was actually supplied.
    if token is not None:
        kwargs["token"] = token

    image_processor_dict, remaining_kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
    return cls.from_dict(image_processor_dict, **remaining_kwargs)
def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
    """
    Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
    [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.

    Args:
        save_directory (`str` or `os.PathLike`):
            Directory where the image processor JSON file will be saved (will be created if it does not exist).
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
            repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
            namespace).
        kwargs (`Dict[str, Any]`, *optional*):
            Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

    Returns:
        `List[str]`: A one-element list holding the path of the JSON file that was written.

    Raises:
        ValueError: if both `token` and the deprecated `use_auth_token` are provided.
        AssertionError: if `save_directory` points to an existing file.
    """
    use_auth_token = kwargs.pop("use_auth_token", None)

    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if kwargs.get("token", None) is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        kwargs["token"] = use_auth_token

    # Normalise once: the signature accepts `os.PathLike`, but the default `repo_id` below is derived with
    # `str.split`, which objects such as `pathlib.Path` do not provide.
    save_directory = os.fspath(save_directory)

    if os.path.isfile(save_directory):
        raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

    os.makedirs(save_directory, exist_ok=True)

    if push_to_hub:
        commit_message = kwargs.pop("commit_message", None)
        # Default repository name: last path component of the save directory.
        repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
        repo_id = self._create_repo(repo_id, **kwargs)
        # Snapshot taken before writing; it is handed to `_upload_modified_files` after the save.
        files_timestamps = self._get_files_timestamps(save_directory)

    # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
    # loaded from the Hub.
    if self._auto_class is not None:
        custom_object_save(self, save_directory, config=self)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)

    self.to_json_file(output_image_processor_file)
    logger.info(f"Image processor saved in {output_image_processor_file}")

    if push_to_hub:
        self._upload_modified_files(
            save_directory,
            repo_id,
            files_timestamps,
            commit_message=commit_message,
            token=kwargs.get("token"),
        )

    return [output_image_processor_file]
@classmethod
def get_image_processor_dict(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
    image processor of type [`~image_processing_utils.ImageProcessingMixin`] using `from_dict`.

    The loading options `cache_dir`, `force_download`, `resume_download`, `proxies`, `token`, `use_auth_token`,
    `local_files_only`, `revision`, `subfolder`, `_from_pipeline` and `_from_auto` are removed from `kwargs`;
    everything else is returned untouched as the second element of the result.

    Parameters:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    Returns:
        `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object.

    Raises:
        ValueError: if both `token` and the deprecated `use_auth_token` are provided.
        EnvironmentError: if the file cannot be resolved or does not contain valid JSON.
    """
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", None)
    proxies = kwargs.pop("proxies", None)
    token = kwargs.pop("token", None)
    use_auth_token = kwargs.pop("use_auth_token", None)
    local_files_only = kwargs.pop("local_files_only", False)
    revision = kwargs.pop("revision", None)
    subfolder = kwargs.pop("subfolder", "")

    from_pipeline = kwargs.pop("_from_pipeline", None)
    from_auto_class = kwargs.pop("_from_auto", False)

    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = use_auth_token

    user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline

    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    is_local = os.path.isdir(pretrained_model_name_or_path)
    if os.path.isfile(pretrained_model_name_or_path):
        # Direct path to a JSON file. `image_processor_file` is deliberately left unset here: it is only read
        # in the non-local logging branch below, which `is_local = True` rules out.
        resolved_image_processor_file = pretrained_model_name_or_path
        is_local = True
    elif is_remote_url(pretrained_model_name_or_path):
        image_processor_file = pretrained_model_name_or_path
        resolved_image_processor_file = download_url(pretrained_model_name_or_path)
    else:
        # Local directory or Hub model id: `cached_file` resolves the standard file name for both.
        image_processor_file = IMAGE_PROCESSOR_NAME
        try:
            # Load from local folder or from cache or download from model Hub and cache
            resolved_image_processor_file = cached_file(
                pretrained_model_name_or_path,
                image_processor_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
                user_agent=user_agent,
                revision=revision,
                subfolder=subfolder,
            )
        except EnvironmentError:
            # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
            # the original exception.
            raise
        except Exception as err:
            # For any other exception, we throw a generic error (keeping the original as its cause).
            raise EnvironmentError(
                f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                f" directory containing a {IMAGE_PROCESSOR_NAME} file"
            ) from err

    try:
        # Load image_processor dict
        with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        image_processor_dict = json.loads(text)

    except json.JSONDecodeError as err:
        raise EnvironmentError(
            f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
        ) from err

    if is_local:
        logger.info(f"loading configuration file {resolved_image_processor_file}")
    else:
        logger.info(
            f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
        )

    # For non-local sources, `auto_map` / `custom_pipelines` entries are rewritten with the originating
    # model id or URL.
    if not is_local:
        if "auto_map" in image_processor_dict:
            image_processor_dict["auto_map"] = add_model_info_to_auto_map(
                image_processor_dict["auto_map"], pretrained_model_name_or_path
            )
        if "custom_pipelines" in image_processor_dict:
            image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
                image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
            )
    return image_processor_dict, kwargs
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
    """
    Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters.

    Args:
        image_processor_dict (`Dict[str, Any]`):
            Dictionary that will be used to instantiate the image processor object. Such a dictionary can be
            retrieved from a pretrained checkpoint by leveraging the
            [`~image_processing_utils.ImageProcessingMixin.to_dict`] method. The dictionary passed in is not
            modified; a copy is used.
        kwargs (`Dict[str, Any]`):
            Additional parameters from which to initialize the image processor object. The special key
            `return_unused_kwargs` (defaults to `False`) selects the return shape.

    Returns:
        [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those
        parameters. When `return_unused_kwargs` is `True`, a tuple of that object and the kwargs that did not
        match any attribute of it.
    """
    init_params = image_processor_dict.copy()
    want_unused = kwargs.pop("return_unused_kwargs", False)

    # `size` / `crop_size` are dicts now but used to be an int or tuple in feature extractors. Passing an
    # override through the constructor lets the image processor convert it to the appropriate dict, instead of
    # having it overwritten by the attribute update further down.
    for size_key in ("size", "crop_size"):
        if size_key in kwargs and size_key in init_params:
            init_params[size_key] = kwargs.pop(size_key)

    image_processor = cls(**init_params)

    # Any remaining kwarg naming an existing attribute overrides it and is consumed; the rest stay in `kwargs`.
    for name in list(kwargs):
        if hasattr(image_processor, name):
            setattr(image_processor, name, kwargs.pop(name))

    logger.info(f"Image processor {image_processor}")
    if want_unused:
        return image_processor, kwargs
    return image_processor
def to_dict(self) -> Dict[str, Any]:
    """
    Serializes this instance to a Python dictionary.

    The instance attributes are deep-copied, so changing the result does not affect the instance.

    Returns:
        `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance, plus an
        `image_processor_type` entry holding the class name.
    """
    attributes = copy.deepcopy(self.__dict__)
    class_name = self.__class__.__name__
    attributes.update(image_processor_type=class_name)
    return attributes
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]):
    """
    Instantiates a image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a JSON
    file of parameters.

    Args:
        json_file (`str` or `os.PathLike`):
            Path to the JSON file containing the parameters.

    Returns:
        A image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object
        instantiated from that JSON file.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        raw_json = handle.read()
    # The decoded entries are passed to the constructor as keyword arguments.
    return cls(**json.loads(raw_json))
def to_json_string(self) -> str:
    """
    Serializes this instance to a JSON string.

    NumPy arrays returned by `to_dict` are converted to plain lists, and the private `_processor_class` entry is
    written out under the public name `processor_class` (it is dropped entirely when it is `None`).

    Returns:
        `str`: String containing all the attributes that make up this image processor instance in JSON format
        (two-space indent, sorted keys, trailing newline).
    """
    payload = self.to_dict()

    for name in list(payload):
        entry = payload[name]
        if isinstance(entry, np.ndarray):
            payload[name] = entry.tolist()

    # make sure private name "_processor_class" is correctly
    # saved as "processor_class"
    processor_class = payload.pop("_processor_class", None)
    if processor_class is not None:
        payload["processor_class"] = processor_class

    serialized = json.dumps(payload, indent=2, sort_keys=True)
    return serialized + "\n"
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    """
    Save this instance to a JSON file.

    The file is opened for writing as UTF-8 and receives the output of `to_json_string`.

    Args:
        json_file_path (`str` or `os.PathLike`):
            Path to the JSON file in which this image_processor instance's parameters will be saved.
    """
    with open(json_file_path, mode="w", encoding="utf-8") as out_file:
        serialized = self.to_json_string()
        out_file.write(serialized)
def __repr__(self):
    """Return the class name followed by a space and the JSON form from `to_json_string`."""
    class_name = self.__class__.__name__
    json_form = self.to_json_string()
    return f"{class_name} {json_form}"
@classmethod
def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
    """
    Register this class with a given auto class. This should only be used for custom image processors as the ones
    in the library are already mapped with `AutoImageProcessor`.

    <Tip warning={true}>

    This API is experimental and may have some slight breaking changes in the next releases.

    </Tip>

    Args:
        auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
            The auto class to register this new image processor with. A class is reduced to its `__name__`.

    Raises:
        ValueError: if `transformers.models.auto` has no attribute with that name.
    """
    auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

    # Imported here rather than at module import time.
    import transformers.models.auto as auto_module

    if hasattr(auto_module, auto_class_name):
        cls._auto_class = auto_class_name
        return

    raise ValueError(f"{auto_class_name} is not a valid auto class.")
def fetch_images(self, image_url_or_urls: Union[str, List[str]], timeout: Optional[float] = None):
    """
    Convert a single or a list of urls into the corresponding `PIL.Image` objects.

    If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
    returned.

    Args:
        image_url_or_urls (`str` or `List[str]`):
            URL, or (possibly nested) list of URLs, to download.
        timeout (`float`, *optional*):
            Passed to `requests.get` as its `timeout`. The default `None` keeps the previous behaviour of
            waiting without a limit.

    Raises:
        ValueError: if the input is neither a `str` nor a `list`.
        requests.HTTPError: raised by `raise_for_status` for an unsuccessful response.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
            " Safari/537.36"
        )
    }
    if isinstance(image_url_or_urls, list):
        # Only forward `timeout` when it was set, so overrides of `fetch_images` that predate the parameter
        # keep working when they are reached through this recursion.
        extra = {} if timeout is None else {"timeout": timeout}
        return [self.fetch_images(x, **extra) for x in image_url_or_urls]
    elif isinstance(image_url_or_urls, str):
        # The context manager closes the streamed response even when `raise_for_status` raises before the
        # body has been read.
        with requests.get(image_url_or_urls, stream=True, headers=headers, timeout=timeout) as response:
            response.raise_for_status()
            return Image.open(BytesIO(response.content))
    else:
        raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
# Replace `push_to_hub` on `ImageProcessingMixin` with the result of `copy_func` before its docstring is edited
# below -- presumably so the inherited original keeps its unformatted docstring; confirm against `copy_func`.
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
# Fill the `{object}`, `{object_class}` and `{object_files}` placeholders of the docstring with image-processor
# wording; skipped when there is no docstring to format.
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
        object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
    )
......@@ -13,37 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import os
import warnings
from io import BytesIO
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from typing import Dict, Iterable, Optional, Union
import numpy as np
import requests
from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature as BaseBatchFeature
from .image_processing_base import BatchFeature, ImageProcessingMixin
from .image_transforms import center_crop, normalize, rescale
from .image_utils import ChannelDimension
from .utils import (
IMAGE_PROCESSOR_NAME,
PushToHubMixin,
add_model_info_to_auto_map,
add_model_info_to_custom_pipelines,
cached_file,
copy_func,
download_url,
is_offline_mode,
is_remote_url,
is_vision_available,
logging,
)
from .utils import logging
if is_vision_available():
from PIL import Image
logger = logging.get_logger(__name__)
......@@ -54,505 +32,6 @@ INIT_SERVICE_KWARGS = [
]
# TODO: Move BatchFeature to be imported by both image_processing_utils and image_processing_utils
# We override the class string here, but logic is the same.
class BatchFeature(BaseBatchFeature):
r"""
Holds the output of the image processor specific `__call__` methods.
This class is derived from a python dictionary and can be used as a dictionary.
Args:
data (`dict`):
Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.).
tensor_type (`Union[None, str, TensorType]`, *optional*):
You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
initialization.
"""
# TODO: (Amy) - factor out the common parts of this and the feature extractor
class ImageProcessingMixin(PushToHubMixin):
"""
This is an image processor mixin used to provide saving/loading functionality for sequential and image feature
extractors.
"""
_auto_class = None
def __init__(self, **kwargs):
"""Set elements of `kwargs` as attributes."""
# This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
# `XXXImageProcessor`, this attribute and its value are misleading.
kwargs.pop("feature_extractor_type", None)
# Pop "processor_class" as it should be saved as private attribute
self._processor_class = kwargs.pop("processor_class", None)
# Additional attributes without default values
for key, value in kwargs.items():
try:
setattr(self, key, value)
except AttributeError as err:
logger.error(f"Can't set {key} with value {value} for {self}")
raise err
def _set_processor_class(self, processor_class: str):
"""Sets processor class as an attribute."""
self._processor_class = processor_class
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Union[str, os.PathLike],
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = "main",
**kwargs,
):
r"""
Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor.
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained image_processor hosted inside a model repo on
huggingface.co.
- a path to a *directory* containing a image processor file saved using the
[`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g.,
`./my_model_directory/`.
- a path or url to a saved image processor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
cache_dir (`str` or `os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained model image processor should be cached if the
standard cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force to (re-)download the image processor files and override the cached versions if
they exist.
resume_download:
Deprecated and ignored. All downloads are now resumed by default when possible.
Will be removed in v5 of Transformers.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
token (`str` or `bool`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
<Tip>
To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>".
</Tip>
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
If `False`, then this function returns just the final image processor object. If `True`, then this
functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of
`kwargs` which has not been used to update `image_processor` and is otherwise ignored.
subfolder (`str`, *optional*, defaults to `""`):
In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
specify the folder name here.
kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are image processor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is
controlled by the `return_unused_kwargs` keyword parameter.
Returns:
A image processor of type [`~image_processing_utils.ImageProcessingMixin`].
Examples:
```python
# We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a
# derived class: *CLIPImageProcessor*
image_processor = CLIPImageProcessor.from_pretrained(
"openai/clip-vit-base-patch32"
) # Download image_processing_config from huggingface.co and cache.
image_processor = CLIPImageProcessor.from_pretrained(
"./test/saved_model/"
) # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')*
image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json")
image_processor = CLIPImageProcessor.from_pretrained(
"openai/clip-vit-base-patch32", do_normalize=False, foo=False
)
assert image_processor.do_normalize is False
image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained(
"openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True
)
assert image_processor.do_normalize is False
assert unused_kwargs == {"foo": False}
```"""
kwargs["cache_dir"] = cache_dir
kwargs["force_download"] = force_download
kwargs["local_files_only"] = local_files_only
kwargs["revision"] = revision
use_auth_token = kwargs.pop("use_auth_token", None)
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
FutureWarning,
)
if token is not None:
raise ValueError(
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
)
token = use_auth_token
if token is not None:
kwargs["token"] = token
image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_dict(image_processor_dict, **kwargs)
def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
    """
    Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the
    [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method.

    Args:
        save_directory (`str` or `os.PathLike`):
            Directory where the image processor JSON file will be saved (will be created if it does not exist).
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
            repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
            namespace).
        kwargs (`Dict[str, Any]`, *optional*):
            Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

    Returns:
        `List[str]`: A one-element list holding the path of the JSON file that was written.

    Raises:
        ValueError: If both `token` and the deprecated `use_auth_token` are supplied.
        AssertionError: If `save_directory` points to an existing file.
    """
    use_auth_token = kwargs.pop("use_auth_token", None)

    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if kwargs.get("token", None) is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        kwargs["token"] = use_auth_token

    # The signature accepts `os.PathLike`, but the default `repo_id` below is derived with `str.split`.
    # Normalise once so path-like inputs do not fail with an AttributeError.
    save_directory = str(save_directory)

    if os.path.isfile(save_directory):
        raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

    os.makedirs(save_directory, exist_ok=True)

    if push_to_hub:
        commit_message = kwargs.pop("commit_message", None)
        # Default repository name is the last component of the target directory.
        repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
        repo_id = self._create_repo(repo_id, **kwargs)
        # Taken before writing so that `_upload_modified_files` can compare against it afterwards.
        files_timestamps = self._get_files_timestamps(save_directory)

    # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
    # loaded from the Hub.
    if self._auto_class is not None:
        custom_object_save(self, save_directory, config=self)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME)

    self.to_json_file(output_image_processor_file)
    logger.info(f"Image processor saved in {output_image_processor_file}")

    if push_to_hub:
        self._upload_modified_files(
            save_directory,
            repo_id,
            files_timestamps,
            commit_message=commit_message,
            token=kwargs.get("token"),
        )

    return [output_image_processor_file]
@classmethod
def get_image_processor_dict(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
    image processor of type [`~image_processing_utils.ImageProcessingMixin`] using `from_dict`.

    Parameters:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    Returns:
        `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. The
        second element holds the keyword arguments that were not consumed here.

    Raises:
        ValueError: If both `token` and the deprecated `use_auth_token` are supplied.
        EnvironmentError: If the file cannot be resolved or does not contain valid JSON.
    """
    # Loading / download options are consumed here; whatever is left in `kwargs` is handed back to the caller.
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", None)
    proxies = kwargs.pop("proxies", None)
    token = kwargs.pop("token", None)
    use_auth_token = kwargs.pop("use_auth_token", None)
    local_files_only = kwargs.pop("local_files_only", False)
    revision = kwargs.pop("revision", None)
    subfolder = kwargs.pop("subfolder", "")

    from_pipeline = kwargs.pop("_from_pipeline", None)
    from_auto_class = kwargs.pop("_from_auto", False)

    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError(
                "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
            )
        token = use_auth_token

    user_agent = {"file_type": "image processor", "from_auto_class": from_auto_class}
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline

    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    is_local = os.path.isdir(pretrained_model_name_or_path)
    if os.path.isfile(pretrained_model_name_or_path):
        # A direct path to the JSON file: nothing to resolve.
        resolved_image_processor_file = pretrained_model_name_or_path
        is_local = True
    elif is_remote_url(pretrained_model_name_or_path):
        image_processor_file = pretrained_model_name_or_path
        resolved_image_processor_file = download_url(pretrained_model_name_or_path)
    else:
        # Local directories are resolved through `cached_file` as well, using the predefined file name.
        image_processor_file = IMAGE_PROCESSOR_NAME
        try:
            # Load from local folder or from cache or download from model Hub and cache
            resolved_image_processor_file = cached_file(
                pretrained_model_name_or_path,
                image_processor_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
                user_agent=user_agent,
                revision=revision,
                subfolder=subfolder,
            )
        except EnvironmentError:
            # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
            # the original exception.
            raise
        except Exception as err:
            # For any other exception, we throw a generic error (keeping the original as the explicit cause).
            raise EnvironmentError(
                f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                f" directory containing a {IMAGE_PROCESSOR_NAME} file"
            ) from err

    try:
        # Load image_processor dict
        with open(resolved_image_processor_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        image_processor_dict = json.loads(text)
    except json.JSONDecodeError as err:
        raise EnvironmentError(
            f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file."
        ) from err

    if is_local:
        logger.info(f"loading configuration file {resolved_image_processor_file}")
    else:
        logger.info(
            f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}"
        )
        # For non-local sources, the `auto_map` / `custom_pipelines` entries are rewritten by the helpers below,
        # which receive the checkpoint identifier.
        if "auto_map" in image_processor_dict:
            image_processor_dict["auto_map"] = add_model_info_to_auto_map(
                image_processor_dict["auto_map"], pretrained_model_name_or_path
            )
        if "custom_pipelines" in image_processor_dict:
            image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines(
                image_processor_dict["custom_pipelines"], pretrained_model_name_or_path
            )

    return image_processor_dict, kwargs
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
    """
    Build an [`~image_processing_utils.ImageProcessingMixin`] instance from a plain dictionary of parameters.

    Args:
        image_processor_dict (`Dict[str, Any]`):
            Parameters forwarded to the class constructor. The dictionary is copied, so the caller's object is
            left untouched. Such a dictionary can be obtained with
            [`~image_processing_utils.ImageProcessingMixin.to_dict`].
        kwargs (`Dict[str, Any]`):
            Extra values. `size` / `crop_size` replace the dictionary entry of the same name before construction
            when that entry exists; any other key matching an attribute of the new instance is set on it
            afterwards. `return_unused_kwargs` controls the return shape.

    Returns:
        [`~image_processing_utils.ImageProcessingMixin`]: The instantiated image processor, or a tuple
        `(image_processor, unused_kwargs)` when `return_unused_kwargs=True`.
    """
    init_params = image_processor_dict.copy()
    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

    # `size` (and `crop_size`) used to be an int or tuple in feature extractors and is now a dict. Injecting the
    # override before construction lets the image processor convert it to the proper dict form itself, instead
    # of having the raw value set on the instance afterwards.
    for dict_key in ("size", "crop_size"):
        if dict_key in kwargs and dict_key in init_params:
            init_params[dict_key] = kwargs.pop(dict_key)

    image_processor = cls(**init_params)

    # Remaining kwargs that name an existing attribute override it and are consumed.
    for attr_name in list(kwargs):
        if hasattr(image_processor, attr_name):
            setattr(image_processor, attr_name, kwargs.pop(attr_name))

    logger.info(f"Image processor {image_processor}")
    return (image_processor, kwargs) if return_unused_kwargs else image_processor
def to_dict(self) -> Dict[str, Any]:
    """
    Return a deep copy of this instance's attributes as a dictionary.

    Returns:
        `Dict[str, Any]`: All instance attributes, plus an `image_processor_type` entry holding the class name.
    """
    serialized = copy.deepcopy(self.__dict__)
    serialized.update({"image_processor_type": self.__class__.__name__})
    return serialized
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]):
    """
    Instantiate an image processor of type [`~image_processing_utils.ImageProcessingMixin`] from a JSON file of
    parameters.

    Args:
        json_file (`str` or `os.PathLike`):
            Path to the JSON file containing the parameters.

    Returns:
        An image processor of type [`~image_processing_utils.ImageProcessingMixin`]: the object built by passing
        the parsed JSON content as keyword arguments to the class.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        parameters = json.load(handle)
    return cls(**parameters)
def to_json_string(self) -> str:
    """
    Serialize this instance to a JSON string.

    Returns:
        `str`: The attributes returned by `to_dict`, JSON-encoded with sorted keys, a two-space indent and a
        trailing newline.
    """
    payload = self.to_dict()

    # numpy arrays are not JSON serializable; swap them for nested lists in place.
    array_keys = [name for name, item in payload.items() if isinstance(item, np.ndarray)]
    for name in array_keys:
        payload[name] = payload[name].tolist()

    # make sure private name "_processor_class" is correctly
    # saved as "processor_class"
    processor_class = payload.pop("_processor_class", None)
    if processor_class is not None:
        payload["processor_class"] = processor_class

    encoded = json.dumps(payload, indent=2, sort_keys=True)
    return encoded + "\n"
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    """
    Write the JSON representation of this instance to a file.

    Args:
        json_file_path (`str` or `os.PathLike`):
            Destination path; the file is opened for writing as UTF-8 and receives the output of
            `to_json_string`.
    """
    with open(json_file_path, mode="w", encoding="utf-8") as handle:
        serialized = self.to_json_string()
        handle.write(serialized)
def __repr__(self):
    """Return the class name followed by a space and the JSON serialization of this instance."""
    class_name = self.__class__.__name__
    return "{} {}".format(class_name, self.to_json_string())
@classmethod
def register_for_auto_class(cls, auto_class="AutoImageProcessor"):
    """
    Register this class with a given auto class. This should only be used for custom image processors as the ones
    in the library are already mapped with `AutoImageProcessor`.

    <Tip warning={true}>

    This API is experimental and may have some slight breaking changes in the next releases.

    </Tip>

    Args:
        auto_class (`str` or `type`, *optional*, defaults to `"AutoImageProcessor"`):
            The auto class to register this new image processor with. A class is reduced to its `__name__`.

    Raises:
        ValueError: If `transformers.models.auto` has no attribute with that name.
    """
    auto_class_name = auto_class if isinstance(auto_class, str) else auto_class.__name__

    import transformers.models.auto as auto_module

    if not hasattr(auto_module, auto_class_name):
        raise ValueError(f"{auto_class_name} is not a valid auto class.")

    # Only the name is stored on the class, not the auto class object itself.
    cls._auto_class = auto_class_name
def fetch_images(
    self,
    image_url_or_urls: Union[str, List[str]],
    timeout: Union[float, Tuple[float, float], None] = None,
):
    """
    Convert a single or a list of urls into the corresponding `PIL.Image` objects.

    If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
    returned.

    Args:
        image_url_or_urls (`str` or `List[str]`):
            URL, or (possibly nested) list of URLs, to download.
        timeout (`float` or `Tuple[float, float]`, *optional*):
            Forwarded to `requests.get` for every download. The default `None` keeps the previous behaviour of
            waiting without a time limit; pass a value to avoid hanging on an unresponsive server.

    Raises:
        ValueError: If the input is neither a `str` nor a `list`.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
            " Safari/537.36"
        )
    }
    if isinstance(image_url_or_urls, list):
        return [self.fetch_images(x, timeout=timeout) for x in image_url_or_urls]
    elif isinstance(image_url_or_urls, str):
        response = requests.get(image_url_or_urls, stream=True, headers=headers, timeout=timeout)
        # Surface HTTP errors instead of trying to decode an error page as an image.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    else:
        raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
class BaseImageProcessor(ImageProcessingMixin):
def __init__(self, **kwargs):
super().__init__(**kwargs)
......@@ -801,10 +280,3 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) ->
best_fit = (height, width)
return best_fit
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
object="image processor", object_class="AutoImageProcessor", object_files="image processor file"
)
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
from dataclasses import dataclass
from .image_processing_utils import BaseImageProcessor
from .utils.import_utils import is_torchvision_available
if is_torchvision_available():
from torchvision.transforms import Compose
@dataclass(frozen=True)
class SizeDict:
    """
    Hashable dictionary to store image size information.

    Every field defaults to `None`. Values can be read as attributes or with `size["height"]`-style subscripting;
    only the declared fields are valid keys.
    """

    height: int = None
    width: int = None
    longest_edge: int = None
    shortest_edge: int = None
    max_height: int = None
    max_width: int = None

    def __getitem__(self, key):
        # Restrict lookup to the declared dataclass fields. A plain `hasattr` check would also accept
        # methods and dunder attributes (e.g. "__class__"), which are not size entries.
        if key in self.__dataclass_fields__:
            return getattr(self, key)
        raise KeyError(f"Key {key} not found in SizeDict.")
class BaseImageProcessorFast(BaseImageProcessor):
    """
    Base class for image processors that express their preprocessing as a composed transform.

    Subclasses implement `_build_transforms` and list the keyword arguments it accepts in `_transform_params`.
    """

    # Names of the keyword arguments accepted by `get_transforms`. `None` means none were declared.
    _transform_params = None

    def _build_transforms(self, **kwargs) -> "Compose":
        """
        Given the input settings e.g. do_resize, build the image transforms.
        """
        raise NotImplementedError

    def _validate_params(self, **kwargs) -> None:
        """
        Check that every keyword argument is a declared transform parameter.

        Raises:
            ValueError: On the first key that is not listed in `_transform_params`.
        """
        # Treat an undeclared (`None`) parameter list as empty so the caller gets the intended ValueError
        # rather than a TypeError from `k not in None`.
        allowed_params = self._transform_params or ()
        for k, v in kwargs.items():
            if k not in allowed_params:
                raise ValueError(f"Invalid transform parameter {k}={v}.")

    # NOTE(review): the cache is attached to the function, so its single slot is keyed on `self` together with
    # the keyword arguments and is shared by all instances. All keyword values must be hashable.
    @functools.lru_cache(maxsize=1)
    def get_transforms(self, **kwargs) -> "Compose":
        """Validate `kwargs` and return the transforms built from them, reusing the last result when possible."""
        self._validate_params(**kwargs)
        return self._build_transforms(**kwargs)
......@@ -31,6 +31,7 @@ from .utils.import_utils import (
is_flax_available,
is_tf_available,
is_torch_available,
is_torchvision_available,
is_vision_available,
requires_backends,
)
......@@ -50,6 +51,9 @@ if is_tf_available():
if is_flax_available():
import jax.numpy as jnp
if is_torchvision_available():
from torchvision.transforms import functional as F
def to_channel_dimension_format(
image: np.ndarray,
......@@ -374,6 +378,7 @@ def normalize(
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
num_channels = image.shape[channel_axis]
......@@ -802,3 +807,48 @@ def flip_channel_order(
if data_format is not None:
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
return image
def _cast_tensor_to_float(x):
    """Return `x` unchanged when it is already a floating point tensor, otherwise the result of `x.float()`."""
    return x if x.is_floating_point() else x.float()
class FusedRescaleNormalize:
    """
    Rescale and normalize the input image in one step.

    `mean` and `std` are multiplied by `1 / rescale_factor` at construction time, so a single normalization of
    the un-rescaled image is applied on call.
    """

    def __init__(self, mean, std, rescale_factor: float = 1.0, inplace: bool = False):
        inverse_factor = 1.0 / rescale_factor
        self.mean = torch.tensor(mean) * inverse_factor
        self.std = torch.tensor(std) * inverse_factor
        self.inplace = inplace

    def __call__(self, image: "torch.Tensor"):
        # Non floating point inputs are cast to float before normalization.
        float_image = _cast_tensor_to_float(image)
        return F.normalize(float_image, self.mean, self.std, inplace=self.inplace)
class Rescale:
    """
    Multiply the input image by a fixed factor and return the product (the input is not modified in place).
    """

    def __init__(self, rescale_factor: float = 1.0):
        self.rescale_factor = rescale_factor

    def __call__(self, image: "torch.Tensor"):
        return image * self.rescale_factor
class NumpyToTensor:
    """
    Convert a numpy array to a PyTorch tensor, moving the last of its three axes to the front.
    """

    def __call__(self, image: np.ndarray):
        # Same as in PyTorch, we assume incoming numpy images are in HWC format
        # c.f. https://github.com/pytorch/vision/blob/61d97f41bc209e1407dcfbd685d2ee2da9c1cdad/torchvision/transforms/functional.py#L154
        channels_first = image.transpose(2, 0, 1)
        tensor = torch.from_numpy(channels_first)
        return tensor.contiguous()
......@@ -25,9 +25,11 @@ from packaging import version
from .utils import (
ExplicitEnum,
is_jax_tensor,
is_numpy_array,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_torchvision_available,
is_vision_available,
logging,
requires_backends,
......@@ -52,6 +54,20 @@ if is_vision_available():
else:
PILImageResampling = PIL.Image
if is_torchvision_available():
    from torchvision.transforms import InterpolationMode

    # Maps each PIL resampling filter to the torchvision interpolation mode of the same name.
    pil_torch_interpolation_mapping = {
        PILImageResampling.NEAREST: InterpolationMode.NEAREST,
        PILImageResampling.BOX: InterpolationMode.BOX,
        PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
        PILImageResampling.HAMMING: InterpolationMode.HAMMING,
        PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
        PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
    }
if TYPE_CHECKING:
if is_torch_available():
import torch
......@@ -90,14 +106,30 @@ def is_pil_image(img):
return is_vision_available() and isinstance(img, PIL.Image.Image)
class ImageType(ExplicitEnum):
    """Kinds of image container distinguished by `get_image_type`."""

    PIL = "pillow"
    TORCH = "torch"
    NUMPY = "numpy"
    TENSORFLOW = "tensorflow"
    JAX = "jax"
def get_image_type(image):
    """
    Return the `ImageType` member matching the container type of `image`.

    The checks run in a fixed order (PIL, torch, numpy, TensorFlow, JAX) and the first match wins.

    Raises:
        ValueError: If none of the checks recognises `image`.
    """
    ordered_checks = (
        (is_pil_image, ImageType.PIL),
        (is_torch_tensor, ImageType.TORCH),
        (is_numpy_array, ImageType.NUMPY),
        (is_tf_tensor, ImageType.TENSORFLOW),
        (is_jax_tensor, ImageType.JAX),
    )
    for matches, image_type in ordered_checks:
        if matches(image):
            return image_type
    raise ValueError(f"Unrecognised image type {type(image)}")
def is_valid_image(img):
    """Return whether `img` is a PIL image, a numpy array, or a torch / TensorFlow / JAX tensor."""
    # Single return built on the shared type-check helpers; the earlier inline isinstance-based return was
    # superseded by this one and left the second statement unreachable.
    return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
def valid_images(imgs):
......
......@@ -19,13 +19,21 @@ import json
import os
import warnings
from collections import OrderedDict
from typing import Dict, Optional, Union
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
# Build the list of all image processors
from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...image_processing_utils import ImageProcessingMixin
from ...utils import CONFIG_NAME, IMAGE_PROCESSOR_NAME, get_file_from_repo, logging
from ...image_processing_utils import BaseImageProcessor, ImageProcessingMixin
from ...image_processing_utils_fast import BaseImageProcessorFast
from ...utils import (
CONFIG_NAME,
IMAGE_PROCESSOR_NAME,
get_file_from_repo,
is_torchvision_available,
is_vision_available,
logging,
)
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
CONFIG_MAPPING_NAMES,
......@@ -37,104 +45,125 @@ from .configuration_auto import (
logger = logging.get_logger(__name__)
if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    IMAGE_PROCESSOR_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    # model type -> (slow image processor class name[, fast image processor class name]).
    # Every value is a tuple; a one-element tuple means no fast class is listed for that model type.
    # NOTE(review): "paligemma" and "video_llava" only appeared in the legacy string-valued form of this list
    # and are carried over here as one-element tuples - confirm against the intended mapping.
    IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
        [
            ("align", ("EfficientNetImageProcessor",)),
            ("beit", ("BeitImageProcessor",)),
            ("bit", ("BitImageProcessor",)),
            ("blip", ("BlipImageProcessor",)),
            ("blip-2", ("BlipImageProcessor",)),
            ("bridgetower", ("BridgeTowerImageProcessor",)),
            ("chinese_clip", ("ChineseCLIPImageProcessor",)),
            ("clip", ("CLIPImageProcessor",)),
            ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("conditional_detr", ("ConditionalDetrImageProcessor",)),
            ("convnext", ("ConvNextImageProcessor",)),
            ("convnextv2", ("ConvNextImageProcessor",)),
            ("cvt", ("ConvNextImageProcessor",)),
            ("data2vec-vision", ("BeitImageProcessor",)),
            ("deformable_detr", ("DeformableDetrImageProcessor",)),
            ("deit", ("DeiTImageProcessor",)),
            ("depth_anything", ("DPTImageProcessor",)),
            ("deta", ("DetaImageProcessor",)),
            ("detr", ("DetrImageProcessor",)),
            ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("dinov2", ("BitImageProcessor",)),
            ("donut-swin", ("DonutImageProcessor",)),
            ("dpt", ("DPTImageProcessor",)),
            ("efficientformer", ("EfficientFormerImageProcessor",)),
            ("efficientnet", ("EfficientNetImageProcessor",)),
            ("flava", ("FlavaImageProcessor",)),
            ("focalnet", ("BitImageProcessor",)),
            ("fuyu", ("FuyuImageProcessor",)),
            ("git", ("CLIPImageProcessor",)),
            ("glpn", ("GLPNImageProcessor",)),
            ("grounding-dino", ("GroundingDinoImageProcessor",)),
            ("groupvit", ("CLIPImageProcessor",)),
            ("idefics", ("IdeficsImageProcessor",)),
            ("idefics2", ("Idefics2ImageProcessor",)),
            ("imagegpt", ("ImageGPTImageProcessor",)),
            ("instructblip", ("BlipImageProcessor",)),
            ("kosmos-2", ("CLIPImageProcessor",)),
            ("layoutlmv2", ("LayoutLMv2ImageProcessor",)),
            ("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
            ("levit", ("LevitImageProcessor",)),
            ("llava", ("CLIPImageProcessor",)),
            ("llava_next", ("LlavaNextImageProcessor",)),
            ("mask2former", ("Mask2FormerImageProcessor",)),
            ("maskformer", ("MaskFormerImageProcessor",)),
            ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("mobilenet_v1", ("MobileNetV1ImageProcessor",)),
            ("mobilenet_v2", ("MobileNetV2ImageProcessor",)),
            ("mobilevit", ("MobileViTImageProcessor",)),
            ("mobilevitv2", ("MobileViTImageProcessor",)),
            ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("nougat", ("NougatImageProcessor",)),
            ("oneformer", ("OneFormerImageProcessor",)),
            ("owlv2", ("Owlv2ImageProcessor",)),
            ("owlvit", ("OwlViTImageProcessor",)),
            ("paligemma", ("CLIPImageProcessor",)),
            ("perceiver", ("PerceiverImageProcessor",)),
            ("pix2struct", ("Pix2StructImageProcessor",)),
            ("poolformer", ("PoolFormerImageProcessor",)),
            ("pvt", ("PvtImageProcessor",)),
            ("pvt_v2", ("PvtImageProcessor",)),
            ("regnet", ("ConvNextImageProcessor",)),
            ("resnet", ("ConvNextImageProcessor",)),
            ("sam", ("SamImageProcessor",)),
            ("segformer", ("SegformerImageProcessor",)),
            ("seggpt", ("SegGptImageProcessor",)),
            ("siglip", ("SiglipImageProcessor",)),
            ("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("swin", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("swin2sr", ("Swin2SRImageProcessor",)),
            ("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("table-transformer", ("DetrImageProcessor",)),
            ("timesformer", ("VideoMAEImageProcessor",)),
            ("tvlt", ("TvltImageProcessor",)),
            ("tvp", ("TvpImageProcessor",)),
            ("udop", ("LayoutLMv3ImageProcessor",)),
            ("upernet", ("SegformerImageProcessor",)),
            ("van", ("ConvNextImageProcessor",)),
            ("video_llava", ("VideoLlavaImageProcessor",)),
            ("videomae", ("VideoMAEImageProcessor",)),
            ("vilt", ("ViltImageProcessor",)),
            ("vipllava", ("CLIPImageProcessor",)),
            ("vit", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vit_hybrid", ("ViTHybridImageProcessor",)),
            ("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")),
            ("vitmatte", ("VitMatteImageProcessor",)),
            ("xclip", ("CLIPImageProcessor",)),
            ("yolos", ("YolosImageProcessor",)),
        ]
    )
# Normalise every entry to a `(slow_class_name, fast_class_name)` pair, using `None` for a class that is not
# listed or whose backend is unavailable.
for model_type, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
    slow_image_processor_class, *fast_image_processor_class = image_processors

    # The slow image processor is dropped when the vision backend is unavailable.
    slow_image_processor_class = slow_image_processor_class if is_vision_available() else None

    # The fast image processor is kept only if one is defined and torchvision is available.
    if fast_image_processor_class and fast_image_processor_class[0] is not None and is_torchvision_available():
        fast_image_processor_class = fast_image_processor_class[0]
    else:
        fast_image_processor_class = None

    IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_image_processor_class, fast_image_processor_class)

IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES)
def image_processor_class_from_name(class_name: str):
if class_name == "BaseImageProcessorFast":
return BaseImageProcessorFast
for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items():
if class_name in extractors:
module_name = model_type_to_module_name(module_name)
......@@ -145,11 +174,12 @@ def image_processor_class_from_name(class_name: str):
except AttributeError:
continue
for _, extractor in IMAGE_PROCESSOR_MAPPING._extra_content.items():
for _, extractors in IMAGE_PROCESSOR_MAPPING._extra_content.items():
for extractor in extractors:
if getattr(extractor, "__name__", None) == class_name:
return extractor
# We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main
# We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
# init and we return the proper dummy to get an appropriate error message.
main_module = importlib.import_module("transformers")
if hasattr(main_module, class_name):
......@@ -258,6 +288,13 @@ def get_image_processor_config(
return json.load(reader)
def _warning_fast_image_processor_available(fast_class):
    """Log a warning that `fast_class` exists for this model while the slow image processor is being used."""
    message = (
        f"Fast image processor class {fast_class} is available for this model. "
        "Using slow image processor class. To use the fast image processor class set `use_fast=True`."
    )
    logger.warning(message)
class AutoImageProcessor:
r"""
This is a generic image processor class that will be instantiated as one of the image processor classes of the
......@@ -274,7 +311,7 @@ class AutoImageProcessor:
@classmethod
@replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES)
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
r"""
Instantiate one of the image processor classes of the library from a pretrained model vocabulary.
......@@ -314,6 +351,10 @@ class AutoImageProcessor:
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
use_fast (`bool`, *optional*, defaults to `False`):
Use a fast torchvision-based image processor if it is supported for a given model.
If a fast image processor is not available for a given model, a normal numpy-based image processor
is returned instead.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
If `False`, then this function returns just the final image processor object. If `True`, then this
functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary
......@@ -358,6 +399,7 @@ class AutoImageProcessor:
kwargs["token"] = use_auth_token
config = kwargs.pop("config", None)
use_fast = kwargs.pop("use_fast", False)
trust_remote_code = kwargs.pop("trust_remote_code", None)
kwargs["_from_auto"] = True
......@@ -387,6 +429,11 @@ class AutoImageProcessor:
image_processor_auto_map = config.auto_map["AutoImageProcessor"]
if image_processor_class is not None:
# Update class name to reflect the use_fast option. If class is not found, None is returned.
if use_fast and not image_processor_class.endswith("Fast"):
image_processor_class += "Fast"
elif not use_fast and image_processor_class.endswith("Fast"):
image_processor_class = image_processor_class[:-4]
image_processor_class = image_processor_class_from_name(image_processor_class)
has_remote_code = image_processor_auto_map is not None
......@@ -395,10 +442,19 @@ class AutoImageProcessor:
trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
)
if image_processor_auto_map is not None and not isinstance(image_processor_auto_map, tuple):
# In some configs, only the slow image processor class is stored
image_processor_auto_map = (image_processor_auto_map, None)
if has_remote_code and trust_remote_code:
image_processor_class = get_class_from_dynamic_module(
image_processor_auto_map, pretrained_model_name_or_path, **kwargs
)
if not use_fast and image_processor_auto_map[1] is not None:
_warning_fast_image_processor_available(image_processor_auto_map[1])
if use_fast and image_processor_auto_map[1] is not None:
class_ref = image_processor_auto_map[1]
else:
class_ref = image_processor_auto_map[0]
image_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
_ = kwargs.pop("code_revision", None)
if os.path.isdir(pretrained_model_name_or_path):
image_processor_class.register_for_auto_class()
......@@ -407,8 +463,22 @@ class AutoImageProcessor:
return image_processor_class.from_dict(config_dict, **kwargs)
# Last try: we use the IMAGE_PROCESSOR_MAPPING.
elif type(config) in IMAGE_PROCESSOR_MAPPING:
image_processor_class = IMAGE_PROCESSOR_MAPPING[type(config)]
return image_processor_class.from_dict(config_dict, **kwargs)
image_processor_tuple = IMAGE_PROCESSOR_MAPPING[type(config)]
image_processor_class_py, image_processor_class_fast = image_processor_tuple
if not use_fast and image_processor_class_fast is not None:
_warning_fast_image_processor_available(image_processor_class_fast)
if image_processor_class_fast and (use_fast or image_processor_class_py is None):
return image_processor_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
else:
if image_processor_class_py is not None:
return image_processor_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
else:
raise ValueError(
"This image processor cannot be instantiated. Please make sure you have `Pillow` installed."
)
raise ValueError(
f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a "
......@@ -417,7 +487,13 @@ class AutoImageProcessor:
)
@staticmethod
def register(
    config_class,
    image_processor_class=None,
    slow_image_processor_class=None,
    fast_image_processor_class=None,
    exist_ok=False,
):
    """
    Register a new image processor for this class.

    Args:
        config_class ([`PretrainedConfig`]):
            The configuration corresponding to the model to register.
        image_processor_class ([`ImageProcessingMixin`], *optional*):
            Deprecated. Treated as `slow_image_processor_class`; cannot be combined with it.
        slow_image_processor_class (*optional*):
            The slow image processor to register. Must not be a subclass of `BaseImageProcessorFast`.
        fast_image_processor_class (*optional*):
            The fast image processor to register. Must not be a subclass of `BaseImageProcessor`.
        exist_ok (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to `IMAGE_PROCESSOR_MAPPING.register`.

    Raises:
        ValueError: If both `image_processor_class` and `slow_image_processor_class` are given, if neither a slow
            nor a fast class is given, if a class is passed in the wrong slot, or if the fast class declares a
            `slow_image_processor_class` different from the slow class passed here.
    """
    if image_processor_class is not None:
        if slow_image_processor_class is not None:
            raise ValueError("Cannot specify both image_processor_class and slow_image_processor_class")
        warnings.warn(
            "The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead",
            FutureWarning,
        )
        # The deprecated argument is an alias for the slow class.
        slow_image_processor_class = image_processor_class

    if slow_image_processor_class is None and fast_image_processor_class is None:
        raise ValueError("You need to specify either slow_image_processor_class or fast_image_processor_class")
    if slow_image_processor_class is not None and issubclass(slow_image_processor_class, BaseImageProcessorFast):
        raise ValueError("You passed a fast image processor in as the `slow_image_processor_class`.")
    if fast_image_processor_class is not None and issubclass(fast_image_processor_class, BaseImageProcessor):
        raise ValueError("You passed a slow image processor in as the `fast_image_processor_class`.")

    # When both classes are given, the fast class must point back at the very same slow class.
    if (
        slow_image_processor_class is not None
        and fast_image_processor_class is not None
        and issubclass(fast_image_processor_class, BaseImageProcessorFast)
        and fast_image_processor_class.slow_image_processor_class != slow_image_processor_class
    ):
        raise ValueError(
            "The fast processor class you are passing has a `slow_image_processor_class` attribute that is not "
            "consistent with the slow processor class you passed (fast image processor has "
            f"{fast_image_processor_class.slow_image_processor_class} and you passed {slow_image_processor_class}). "
            "Fix one of those so they match!"
        )

    # Avoid resetting a set slow/fast image processor if we are passing just the other ones.
    if config_class in IMAGE_PROCESSOR_MAPPING._extra_content:
        existing_slow, existing_fast = IMAGE_PROCESSOR_MAPPING[config_class]
        if slow_image_processor_class is None:
            slow_image_processor_class = existing_slow
        if fast_image_processor_class is None:
            fast_image_processor_class = existing_fast

    # The mapping stores a `(slow, fast)` tuple per config class; either entry may be `None`.
    IMAGE_PROCESSOR_MAPPING.register(
        config_class, (slow_image_processor_class, fast_image_processor_class), exist_ok=exist_ok
    )
......@@ -19,6 +19,7 @@ from ...utils import (
is_flax_available,
is_tf_available,
is_torch_available,
is_torchvision_available,
is_vision_available,
)
......@@ -34,6 +35,15 @@ else:
_import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
_import_structure["image_processing_vit"] = ["ViTImageProcessor"]
try:
if not is_torchvision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_vit_fast"] = ["ViTImageProcessorFast"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
......@@ -83,6 +93,14 @@ if TYPE_CHECKING:
from .feature_extraction_vit import ViTFeatureExtractor
from .image_processing_vit import ViTImageProcessor
try:
if not is_torchvision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_vit_fast import ViTImageProcessorFast
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
......
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for ViT."""
import functools
from typing import Dict, List, Optional, Union
from ...image_processing_base import BatchFeature
from ...image_processing_utils import get_size_dict
from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict
from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
ImageType,
PILImageResampling,
get_image_type,
make_list_of_images,
pil_torch_interpolation_mapping,
)
from ...utils import TensorType, logging
from ...utils.import_utils import is_torch_available, is_torchvision_available
logger = logging.get_logger(__name__)
if is_torch_available():
import torch
if is_torchvision_available():
from torchvision.transforms import Compose, Normalize, PILToTensor, Resize
class ViTImageProcessorFast(BaseImageProcessorFast):
    r"""
    Constructs a ViT image processor.

    The preprocessing pipeline is assembled from `torchvision.transforms` objects (see `_build_transforms`) and
    `preprocess` returns the processed images stacked into a single PyTorch tensor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    model_input_names = ["pixel_values"]

    # Names of the settings that define the transform pipeline; they mirror the parameters of `_build_transforms`.
    # NOTE(review): presumably read by `BaseImageProcessorFast` to decide when the transforms must be rebuilt -
    # confirm against the base class.
    _transform_params = [
        "do_resize",
        "do_rescale",
        "do_normalize",
        "size",
        "resample",
        "rescale_factor",
        "image_mean",
        "image_std",
        "image_type",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 224, "width": 224}
        size = get_size_dict(size)
        self.do_resize = do_resize
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.size = size
        self.resample = resample
        self.rescale_factor = rescale_factor
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        # Starts empty; this class never writes to it afterwards.
        self._transform_settings = {}

    def _build_transforms(
        self,
        do_resize: bool,
        size: Dict[str, int],
        resample: PILImageResampling,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Union[float, List[float]],
        image_std: Union[float, List[float]],
        image_type: ImageType,
    ) -> "Compose":
        """
        Given the input settings build the image transforms using `torchvision.transforms.Compose`.

        The pipeline is, in order: an optional conversion to a torch tensor (PIL and numpy inputs only), an
        optional resize to `(size["height"], size["width"])`, then rescaling and/or normalization.
        """
        transforms = []

        # All PIL and numpy values need to be converted to a torch tensor
        # to keep cross compatibility with slow image processors
        if image_type == ImageType.PIL:
            transforms.append(PILToTensor())
        elif image_type == ImageType.NUMPY:
            transforms.append(NumpyToTensor())

        if do_resize:
            transforms.append(
                Resize((size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample])
            )

        # We can combine rescale and normalize into a single operation for speed
        if do_rescale and do_normalize:
            transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor))
        elif do_rescale:
            transforms.append(Rescale(rescale_factor=rescale_factor))
        elif do_normalize:
            transforms.append(Normalize(image_mean, image_std))

        return Compose(transforms)

    # NOTE: the cache key is built from every argument (including `self`), so all of them must be hashable; this
    # is why `preprocess` converts `size` to a `SizeDict` and list-valued mean/std to tuples before calling.
    @functools.lru_cache(maxsize=1)
    def _validate_input_arguments(
        self,
        return_tensors: Union[str, TensorType],
        do_resize: bool,
        size: Dict[str, int],
        resample: PILImageResampling,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Union[float, List[float]],
        image_std: Union[float, List[float]],
        data_format: Union[str, ChannelDimension],
        image_type: ImageType,
    ):
        """
        Check that the requested preprocessing settings are supported and complete.

        Raises:
            ValueError: If `return_tensors` is not `"pt"`, if `data_format` is not channels-first, or if a step is
                enabled while one of the values it needs is `None`.
        """
        if return_tensors != "pt":
            raise ValueError("Only returning PyTorch tensors is currently supported.")

        if data_format != ChannelDimension.FIRST:
            raise ValueError("Only channel first data format is currently supported.")

        if do_resize and None in (size, resample):
            raise ValueError("Size and resample must be specified if do_resize is True.")

        if do_rescale and rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")

        if do_normalize and None in (image_mean, image_std):
            raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: Optional[PILImageResampling] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = "pt",
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
                resizing.
            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
                an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*, defaults to `"pt"`):
                The type of tensors to return. Only "pt" is supported
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. The following formats are currently supported:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
                Accepted for signature compatibility; this method does not currently read it.

        Returns:
            `BatchFeature`: Holds `"pixel_values"`, the transformed images stacked along a new leading batch
            dimension (one entry per input image).

        Raises:
            ValueError: If the type of the first image is not PIL, torch or numpy, or if the settings are rejected
                by `_validate_input_arguments`.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        size = size if size is not None else self.size

        # Make hashable for cache
        size = SizeDict(**size)
        image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
        image_std = tuple(image_std) if isinstance(image_std, list) else image_std

        images = make_list_of_images(images)
        # The type of the first image selects the tensor-conversion step for the whole batch.
        image_type = get_image_type(images[0])
        if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
            raise ValueError(f"Unsupported input image type {image_type}")

        self._validate_input_arguments(
            do_resize=do_resize,
            size=size,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            return_tensors=return_tensors,
            data_format=data_format,
            image_type=image_type,
        )

        transforms = self.get_transforms(
            do_resize=do_resize,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            size=size,
            resample=resample,
            rescale_factor=rescale_factor,
            image_mean=image_mean,
            image_std=image_std,
            image_type=image_type,
        )
        transformed_images = [transforms(image) for image in images]

        # `torch.stack` adds the batch dimension. `torch.vstack` would instead concatenate the images along
        # their first (channel) axis and return (batch * channels, height, width).
        data = {"pixel_values": torch.stack(transformed_images, dim=0)}
        return BatchFeature(data, tensor_type=return_tensors)
# This file is autogenerated by the command `make fix-copies`, do not edit.
from ..utils import DummyObject, requires_backends
# Dummy placeholder for `BaseImageProcessorFast`; it declares "torchvision" as its only backend.
class BaseImageProcessorFast(metaclass=DummyObject):
_backends = ["torchvision"]
def __init__(self, *args, **kwargs):
# Delegates to `requires_backends`; presumably that helper raises when torchvision is missing (confirm in `..utils`).
requires_backends(self, ["torchvision"])
# Dummy placeholder for `ViTImageProcessorFast`; it declares "torchvision" as its only backend.
class ViTImageProcessorFast(metaclass=DummyObject):
_backends = ["torchvision"]
def __init__(self, *args, **kwargs):
# Delegates to `requires_backends`; presumably that helper raises when torchvision is missing (confirm in `..utils`).
requires_backends(self, ["torchvision"])
......@@ -9,6 +9,13 @@ class ImageProcessingMixin(metaclass=DummyObject):
requires_backends(self, ["vision"])
# Dummy placeholder for `BaseImageProcessor`; it declares "vision" as its only backend.
class BaseImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
# Delegates to `requires_backends`; presumably that helper raises when the "vision" backend is missing (confirm in `..utils`).
requires_backends(self, ["vision"])
class ImageFeatureExtractionMixin(metaclass=DummyObject):
_backends = ["vision"]
......
......@@ -27,8 +27,10 @@ from transformers import (
AutoImageProcessor,
CLIPConfig,
CLIPImageProcessor,
ViTImageProcessor,
ViTImageProcessorFast,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_torchvision, require_vision
sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))
......@@ -133,6 +135,23 @@ class AutoImageProcessorTest(unittest.TestCase):
):
_ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")
@require_vision
@require_torchvision
def test_use_fast_selection(self):
    """Check which ViT image processor class `AutoImageProcessor` returns for each `use_fast` setting."""
    checkpoint = "hf-internal-testing/tiny-random-vit"

    # Each entry: (extra `from_pretrained` kwargs, class the loaded processor must be an instance of).
    scenarios = (
        # Slow image processor is selected by default
        ({}, ViTImageProcessor),
        # Fast image processor is selected when use_fast=True
        ({"use_fast": True}, ViTImageProcessorFast),
        # Slow image processor is selected when use_fast=False
        ({"use_fast": False}, ViTImageProcessor),
    )
    for load_kwargs, expected_class in scenarios:
        loaded_processor = AutoImageProcessor.from_pretrained(checkpoint, **load_kwargs)
        self.assertIsInstance(loaded_processor, expected_class)
def test_from_pretrained_dynamic_image_processor(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):
......
......@@ -121,6 +121,7 @@ class BeitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = BeitImageProcessor if is_vision_available() else None
def setUp(self):
# Run the parent classes' setUp first, then attach the Beit tester (built around this test case) for the tests to use.
super().setUp()
self.image_processor_tester = BeitImageProcessingTester(self)
@property
......
......@@ -90,6 +90,7 @@ class BlipImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = BlipImageProcessor if is_vision_available() else None
def setUp(self):
# Run the parent classes' setUp first, then attach the Blip tester (built around this test case) for the tests to use.
super().setUp()
self.image_processor_tester = BlipImageProcessingTester(self)
@property
......@@ -112,6 +113,7 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.Tes
image_processing_class = BlipImageProcessor if is_vision_available() else None
def setUp(self):
# Run the parent classes' setUp first, then attach a Blip tester configured with `num_channels=4`.
super().setUp()
self.image_processor_tester = BlipImageProcessingTester(self, num_channels=4)
# The expected channel count of the encoded image is 3 even though the tester is configured with 4 channels.
self.expected_encoded_image_num_channels = 3
......
......@@ -136,6 +136,7 @@ class BridgeTowerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
image_processing_class = BridgeTowerImageProcessor if is_vision_available() else None
def setUp(self):
# Run the parent classes' setUp first, then attach the BridgeTower tester (built around this test case) for the tests to use.
super().setUp()
self.image_processor_tester = BridgeTowerImageProcessingTester(self)
@property
......
......@@ -98,6 +98,7 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
def setUp(self):
# Run the parent classes' setUp first, then attach a ChineseCLIP tester with `do_center_crop=True`.
super().setUp()
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, do_center_crop=True)
@property
......@@ -135,6 +136,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt
image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
def setUp(self):
# Run the parent classes' setUp first, then attach a ChineseCLIP tester with `num_channels=4` and `do_center_crop=True`.
super().setUp()
self.image_processor_tester = ChineseCLIPImageProcessingTester(self, num_channels=4, do_center_crop=True)
# The expected channel count of the encoded image is 3 even though the tester is configured with 4 channels.
self.expected_encoded_image_num_channels = 3
......
......@@ -94,6 +94,7 @@ class CLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = CLIPImageProcessor if is_vision_available() else None
def setUp(self):
# Run the parent classes' setUp first, then attach the CLIP tester (built around this test case) for the tests to use.
super().setUp()
self.image_processor_tester = CLIPImageProcessingTester(self)
@property
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment