"test/vscode:/vscode.git/clone" did not exist on "ffde465d68961f8f24fe47a81968283d82dc943c"
Unverified commit 4eb918e6 authored by amyeroberts, committed by GitHub

AutoImageProcessor (#20111)

* AutoImageProcessor skeleton

* Update references

* Add mapping in init

* Add model image processors to __init__ for importing

* Add AutoImageProcessor tests

* Fix up

* Image Processor documentation

* Remove pdb

* Update docs/source/en/model_doc/mobilevit.mdx

* Update docs

* Don't add whitespace on json files

* Remove fixtures

* Move checking model config down

* Fix up

* Add check for image processor

* Remove FeatureExtractorMixin in docstrings

* Rename model_tmpfile to config_tmpfile

* Don't make None if not in image processor map
parent c08a1e26
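For orientation before the diff: this commit adds an AutoImageProcessor auto class that resolves the correct image processor from a checkpoint's preprocessor_config.json (falling back to the legacy feature_extractor_type key, as the tests further down exercise). A minimal usage sketch, using the same checkpoint as the shortcut test below:

from transformers import AutoImageProcessor

# Reads preprocessor_config.json and instantiates the matching class;
# for this checkpoint that is CLIPImageProcessor.
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
print(type(image_processor).__name__)  # CLIPImageProcessor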
......@@ -29,6 +29,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_levit"] = ["LevitFeatureExtractor"]
_import_structure["image_processing_levit"] = ["LevitImageProcessor"]
try:
if not is_torch_available():
......@@ -55,6 +56,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_levit import LevitFeatureExtractor
from .image_processing_levit import LevitImageProcessor
try:
if not is_torch_available():
......
......@@ -37,6 +37,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_mobilevit"] = ["MobileViTFeatureExtractor"]
_import_structure["image_processing_mobilevit"] = ["MobileViTImageProcessor"]
try:
if not is_torch_available():
......@@ -76,6 +77,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_mobilevit import MobileViTFeatureExtractor
from .image_processing_mobilevit import MobileViTImageProcessor
try:
if not is_torch_available():
......@@ -91,14 +93,6 @@ if TYPE_CHECKING:
MobileViTPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_mobilevit import MobileViTFeatureExtractor
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
......
......@@ -38,6 +38,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_perceiver"] = ["PerceiverFeatureExtractor"]
_import_structure["image_processing_perceiver"] = ["PerceiverImageProcessor"]
try:
if not is_torch_available():
......@@ -71,6 +72,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_perceiver import PerceiverFeatureExtractor
from .image_processing_perceiver import PerceiverImageProcessor
try:
if not is_torch_available():
......
......@@ -30,6 +30,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_poolformer"] = ["PoolFormerFeatureExtractor"]
_import_structure["image_processing_poolformer"] = ["PoolFormerImageProcessor"]
try:
if not is_torch_available():
......@@ -55,6 +56,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_poolformer import PoolFormerFeatureExtractor
from .image_processing_poolformer import PoolFormerImageProcessor
try:
if not is_torch_available():
......
......@@ -37,6 +37,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_segformer"] = ["SegformerFeatureExtractor"]
_import_structure["image_processing_segformer"] = ["SegformerImageProcessor"]
try:
if not is_torch_available():
......@@ -80,6 +81,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_segformer import SegformerFeatureExtractor
from .image_processing_segformer import SegformerImageProcessor
try:
if not is_torch_available():
......
......@@ -45,6 +45,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_videomae"] = ["VideoMAEFeatureExtractor"]
_import_structure["image_processing_videomae"] = ["VideoMAEImageProcessor"]
if TYPE_CHECKING:
from .configuration_videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig
......@@ -70,6 +71,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_videomae import VideoMAEFeatureExtractor
from .image_processing_videomae import VideoMAEImageProcessor
else:
import sys
......
......@@ -30,6 +30,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_vilt"] = ["ViltFeatureExtractor"]
_import_structure["image_processing_vilt"] = ["ViltImageProcessor"]
_import_structure["processing_vilt"] = ["ViltProcessor"]
try:
......@@ -61,6 +62,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_vilt import ViltFeatureExtractor
from .image_processing_vilt import ViltImageProcessor
from .processing_vilt import ViltProcessor
try:
......
......@@ -36,6 +36,7 @@ except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
_import_structure["image_processing_vit"] = ["ViTImageProcessor"]
try:
if not is_torch_available():
......@@ -85,6 +86,7 @@ if TYPE_CHECKING:
pass
else:
from .feature_extraction_vit import ViTFeatureExtractor
from .image_processing_vit import ViTImageProcessor
try:
if not is_torch_available():
......
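All of the per-model __init__.py hunks above follow the same shape: the new image_processing_<model> module is added to _import_structure behind an is_vision_available() guard, and imported directly only under TYPE_CHECKING so type checkers see the symbols. A schematic sketch of that layout for Levit (abbreviated; the exact _LazyModule arguments at the bottom are recalled from the library's convention and may differ in detail):

from typing import TYPE_CHECKING

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_vision_available

_import_structure = {"configuration_levit": ["LevitConfig"]}  # abbreviated

try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["feature_extraction_levit"] = ["LevitFeatureExtractor"]
    _import_structure["image_processing_levit"] = ["LevitImageProcessor"]  # added by this commit

if TYPE_CHECKING:
    from .configuration_levit import LevitConfig  # abbreviated

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .feature_extraction_levit import LevitFeatureExtractor
        from .image_processing_levit import LevitImageProcessor  # added by this commit
else:
    import sys

    # Replace this module with a lazy proxy so heavy submodules are imported only on first use.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)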
......@@ -3,7 +3,7 @@
from ..utils import DummyObject, requires_backends
class ImageProcessorMixin(metaclass=DummyObject):
class ImageProcessingMixin(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
......@@ -36,6 +36,13 @@ class BeitFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class BeitImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class CLIPFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -43,6 +50,13 @@ class CLIPFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class CLIPImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ConditionalDetrFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -57,6 +71,13 @@ class ConvNextFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class ConvNextImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class DeformableDetrFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -71,6 +92,13 @@ class DeiTFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class DeiTImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class DetrFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -92,6 +120,13 @@ class DPTFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class DPTImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class FlavaFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -99,6 +134,13 @@ class FlavaFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class FlavaImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class FlavaProcessor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -113,6 +155,13 @@ class GLPNFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class GLPNImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ImageGPTFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -120,6 +169,13 @@ class ImageGPTFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class ImageGPTImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class LayoutLMv2FeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -127,6 +183,13 @@ class LayoutLMv2FeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class LayoutLMv2ImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class LayoutLMv3FeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -134,6 +197,13 @@ class LayoutLMv3FeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class LayoutLMv3ImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class LevitFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -141,6 +211,13 @@ class LevitFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class LevitImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class MaskFormerFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -155,6 +232,13 @@ class MobileViTFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class MobileViTImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class OwlViTFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -169,6 +253,13 @@ class PerceiverFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class PerceiverImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class PoolFormerFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -176,6 +267,13 @@ class PoolFormerFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class PoolFormerImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class SegformerFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -183,6 +281,13 @@ class SegformerFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class SegformerImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class VideoMAEFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -190,6 +295,13 @@ class VideoMAEFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class VideoMAEImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ViltFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -197,6 +309,13 @@ class ViltFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class ViltImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class ViltProcessor(metaclass=DummyObject):
_backends = ["vision"]
......@@ -211,6 +330,13 @@ class ViTFeatureExtractor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class ViTImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class YolosFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
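The dummy_vision_objects.py additions above give every new image processor a vision-gated placeholder: importing the name from transformers always succeeds, but using it without the vision extras raises a descriptive error. A self-contained sketch of the mechanism (the real DummyObject and requires_backends live in transformers.utils and additionally check whether the backend is actually importable; this simplified version always raises):

def requires_backends(obj, backends):
    # Simplified: the real helper raises only when the listed backends are missing.
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    raise ImportError(
        f"{name} requires the following backends: {', '.join(backends)}. "
        "Install them, e.g. `pip install transformers[vision]`."
    )


class DummyObject(type):
    """Metaclass for placeholder classes: any non-underscore attribute access or
    instantiation funnels into requires_backends, which raises a helpful ImportError."""

    def __getattribute__(cls, key):
        if key.startswith("_"):
            return super().__getattribute__(key)
        requires_backends(cls, cls._backends)


class LevitImageProcessor(metaclass=DummyObject):
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])


# LevitImageProcessor() or LevitImageProcessor.from_pretrained(...) now raise an ImportError
# pointing at the missing vision dependency, instead of failing at import time.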
# coding=utf-8
# Copyright 2021 the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import sys
import tempfile
import unittest
from pathlib import Path
from transformers import (
    CONFIG_MAPPING,
    IMAGE_PROCESSOR_MAPPING,
    AutoConfig,
    AutoImageProcessor,
    CLIPConfig,
    CLIPImageProcessor,
)
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER

sys.path.append(str(Path(__file__).parent.parent.parent.parent / "utils"))

from test_module.custom_configuration import CustomConfig  # noqa E402
from test_module.custom_image_processing import CustomImageProcessor  # noqa E402


class AutoImageProcessorTest(unittest.TestCase):
    def test_image_processor_from_model_shortcut(self):
        config = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.assertIsInstance(config, CLIPImageProcessor)

    def test_image_processor_from_local_directory_from_key(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
            config_tmpfile = Path(tmpdirname) / "config.json"
            json.dump(
                {"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
                open(processor_tmpfile, "w"),
            )
            json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))

            config = AutoImageProcessor.from_pretrained(tmpdirname)
            self.assertIsInstance(config, CLIPImageProcessor)

    def test_image_processor_from_local_directory_from_feature_extractor_key(self):
        # Ensure we can load the image processor from the feature extractor config
        with tempfile.TemporaryDirectory() as tmpdirname:
            processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
            config_tmpfile = Path(tmpdirname) / "config.json"
            json.dump(
                {"feature_extractor_type": "CLIPFeatureExtractor", "processor_class": "CLIPProcessor"},
                open(processor_tmpfile, "w"),
            )
            json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))

            config = AutoImageProcessor.from_pretrained(tmpdirname)
            self.assertIsInstance(config, CLIPImageProcessor)

    def test_image_processor_from_local_directory_from_config(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            model_config = CLIPConfig()

            # Create a dummy config file with image_processor_type
            processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
            config_tmpfile = Path(tmpdirname) / "config.json"
            json.dump(
                {"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
                open(processor_tmpfile, "w"),
            )
            json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))

            # remove image_processor_type to make sure config.json alone is enough to load image processor locally
            config_dict = AutoImageProcessor.from_pretrained(tmpdirname).to_dict()
            config_dict.pop("image_processor_type")
            config = CLIPImageProcessor(**config_dict)

            # save in new folder
            model_config.save_pretrained(tmpdirname)
            config.save_pretrained(tmpdirname)

            config = AutoImageProcessor.from_pretrained(tmpdirname)

            # make sure private variable is not incorrectly saved
            dict_as_saved = json.loads(config.to_json_string())
            self.assertTrue("_processor_class" not in dict_as_saved)

            self.assertIsInstance(config, CLIPImageProcessor)

    def test_image_processor_from_local_file(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
            json.dump(
                {"image_processor_type": "CLIPImageProcessor", "processor_class": "CLIPProcessor"},
                open(processor_tmpfile, "w"),
            )

            config = AutoImageProcessor.from_pretrained(processor_tmpfile)
            self.assertIsInstance(config, CLIPImageProcessor)

    def test_repo_not_found(self):
        with self.assertRaisesRegex(
            EnvironmentError, "clip-base is not a local folder and is not a valid model identifier"
        ):
            _ = AutoImageProcessor.from_pretrained("clip-base")

    def test_revision_not_found(self):
        with self.assertRaisesRegex(
            EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
        ):
            _ = AutoImageProcessor.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")

    def test_image_processor_not_found(self):
        with self.assertRaisesRegex(
            EnvironmentError,
            "hf-internal-testing/config-no-model does not appear to have a file named preprocessor_config.json.",
        ):
            _ = AutoImageProcessor.from_pretrained("hf-internal-testing/config-no-model")

    def test_from_pretrained_dynamic_image_processor(self):
        model = AutoImageProcessor.from_pretrained(
            "hf-internal-testing/test_dynamic_image_processor", trust_remote_code=True
        )
        self.assertEqual(model.__class__.__name__, "NewImageProcessor")

    def test_new_image_processor_registration(self):
        try:
            AutoConfig.register("custom", CustomConfig)
            AutoImageProcessor.register(CustomConfig, CustomImageProcessor)
            # Trying to register something existing in the Transformers library will raise an error
            with self.assertRaises(ValueError):
                AutoImageProcessor.register(CLIPConfig, CLIPImageProcessor)

            with tempfile.TemporaryDirectory() as tmpdirname:
                processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json"
                config_tmpfile = Path(tmpdirname) / "config.json"
                json.dump(
                    {"feature_extractor_type": "CLIPFeatureExtractor", "processor_class": "CLIPProcessor"},
                    open(processor_tmpfile, "w"),
                )
                json.dump({"model_type": "clip"}, open(config_tmpfile, "w"))

                image_processor = CustomImageProcessor.from_pretrained(tmpdirname)

            # Now that the config is registered, it can be used as any other config with the auto-API
            with tempfile.TemporaryDirectory() as tmp_dir:
                image_processor.save_pretrained(tmp_dir)
                new_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
                self.assertIsInstance(new_image_processor, CustomImageProcessor)

        finally:
            if "custom" in CONFIG_MAPPING._extra_content:
                del CONFIG_MAPPING._extra_content["custom"]
            if CustomConfig in IMAGE_PROCESSOR_MAPPING._extra_content:
                del IMAGE_PROCESSOR_MAPPING._extra_content[CustomConfig]
from transformers import CLIPImageProcessor


class CustomImageProcessor(CLIPImageProcessor):
    pass
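The three lines above are the new test_module custom_image_processing helper that the registration test imports (the path follows from the sys.path.append(... / "utils") line in the test file). Pulling the pieces together, the registration flow that test_new_image_processor_registration exercises amounts to this sketch:

from transformers import AutoConfig, AutoImageProcessor

from test_module.custom_configuration import CustomConfig
from test_module.custom_image_processing import CustomImageProcessor

# Tie the custom model type to its config class, then the config class to its image processor.
AutoConfig.register("custom", CustomConfig)
AutoImageProcessor.register(CustomConfig, CustomImageProcessor)

# From here on, AutoImageProcessor.from_pretrained(...) can resolve and return
# CustomImageProcessor for checkpoints saved with it, which is what the test asserts.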