Add feature extraction mapping for automatic metadata update (#28944)

* add feature extraction mapping * added prefix * ruff check * minor fix * Update modeling_auto.py * fix typo * remove prefix to make variable public/importable * Update src/transformers/models/auto/modeling_auto.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * fixes * addressed comments * nit * fix-copies * remove from tests * this should fix * Update tests/models/convnextv2/test_modeling_convnextv2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * nits --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

Add feature extraction mapping for automatic metadata update (#28944)
* add feature extraction mapping * added prefix * ruff check * minor fix * Update modeling_auto.py * fix typo * remove prefix to make variable public/importable * Update src/transformers/models/auto/modeling_auto.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * fixes * addressed comments * nit * fix-copies * remove from tests * this should fix * Update tests/models/convnextv2/test_modeling_convnextv2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * nits --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
7c4995f9 · Merve Noyan · GitHub · 2a7746c4 · 7c4995f9 · 7c4995f9
Unverified Commit 7c4995f9 authored Feb 26, 2024 by Merve Noyan Committed by GitHub Feb 26, 2024
9 changed files
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1460,6 +1460,7 @@ else:
            "MODEL_FOR_DEPTH_ESTIMATION_MAPPING",
            "MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
            "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
+            "MODEL_FOR_IMAGE_MAPPING",
            "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
            "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
            "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
@@ -6203,6 +6204,7 @@ if TYPE_CHECKING:
            MODEL_FOR_DEPTH_ESTIMATION_MAPPING,
            MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
            MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+            MODEL_FOR_IMAGE_MAPPING,
            MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
            MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
            MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,

--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -49,6 +49,7 @@ else:
        "MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
        "MODEL_FOR_DEPTH_ESTIMATION_MAPPING",
        "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
+        "MODEL_FOR_IMAGE_MAPPING",
        "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
        "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
        "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
@@ -233,6 +234,7 @@ if TYPE_CHECKING:
            MODEL_FOR_DEPTH_ESTIMATION_MAPPING,
            MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
            MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+            MODEL_FOR_IMAGE_MAPPING,
            MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
            MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
            MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,

--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -29,7 +29,6 @@ from .configuration_auto import CONFIG_MAPPING_NAMES

 logger = logging.get_logger(__name__)

-
 MODEL_MAPPING_NAMES = OrderedDict(
    [
        # Base model mapping
@@ -478,6 +477,58 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
    ]
 )

+MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict(
+    [
+        # Model for Image mapping
+        ("beit", "BeitModel"),
+        ("bit", "BitModel"),
+        ("conditional_detr", "ConditionalDetrModel"),
+        ("convnext", "ConvNextModel"),
+        ("convnextv2", "ConvNextV2Model"),
+        ("data2vec-vision", "Data2VecVisionModel"),
+        ("deformable_detr", "DeformableDetrModel"),
+        ("deit", "DeiTModel"),
+        ("deta", "DetaModel"),
+        ("detr", "DetrModel"),
+        ("dinat", "DinatModel"),
+        ("dinov2", "Dinov2Model"),
+        ("dpt", "DPTModel"),
+        ("efficientformer", "EfficientFormerModel"),
+        ("efficientnet", "EfficientNetModel"),
+        ("focalnet", "FocalNetModel"),
+        ("glpn", "GLPNModel"),
+        ("imagegpt", "ImageGPTModel"),
+        ("levit", "LevitModel"),
+        ("mobilenet_v1", "MobileNetV1Model"),
+        ("mobilenet_v2", "MobileNetV2Model"),
+        ("mobilevit", "MobileViTModel"),
+        ("mobilevitv2", "MobileViTV2Model"),
+        ("nat", "NatModel"),
+        ("poolformer", "PoolFormerModel"),
+        ("pvt", "PvtModel"),
+        ("regnet", "RegNetModel"),
+        ("resnet", "ResNetModel"),
+        ("segformer", "SegformerModel"),
+        ("siglip_vision_model", "SiglipVisionModel"),
+        ("swiftformer", "SwiftFormerModel"),
+        ("swin", "SwinModel"),
+        ("swin2sr", "Swin2SRModel"),
+        ("swinv2", "Swinv2Model"),
+        ("table-transformer", "TableTransformerModel"),
+        ("timesformer", "TimesformerModel"),
+        ("timm_backbone", "TimmBackbone"),
+        ("van", "VanModel"),
+        ("videomae", "VideoMAEModel"),
+        ("vit", "ViTModel"),
+        ("vit_hybrid", "ViTHybridModel"),
+        ("vit_mae", "ViTMAEModel"),
+        ("vit_msn", "ViTMSNModel"),
+        ("vitdet", "VitDetModel"),
+        ("vivit", "VivitModel"),
+        ("yolos", "YolosModel"),
+    ]
+)
+
 MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict(
    [
        ("deit", "DeiTForMaskedImageModeling"),
@@ -1243,6 +1294,7 @@ MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
 )
 MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES)
+MODEL_FOR_IMAGE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_MAPPING_NAMES)
 MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
    CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
 )

--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -63,7 +63,10 @@ from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, i
 from .integrations.tpu import tpu_spmd_dataloader
 from .modelcard import TrainingSummary
 from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
-from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES
+from .models.auto.modeling_auto import (
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
+    MODEL_MAPPING_NAMES,
+)
 from .optimization import Adafactor, get_scheduler
 from .pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
 from .tokenization_utils_base import PreTrainedTokenizerBase

--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -598,6 +598,9 @@ MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None
 MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None


+MODEL_FOR_IMAGE_MAPPING = None
+
+
 MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None



--- a/src/transformers/utils/fx.py
+++ b/src/transformers/utils/fx.py
@@ -39,6 +39,7 @@ from ..models.auto.modeling_auto import (
    MODEL_FOR_CTC_MAPPING_NAMES,
    MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES,
    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
+    MODEL_FOR_IMAGE_MAPPING_NAMES,
    MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES,
    MODEL_FOR_MASKED_LM_MAPPING_NAMES,
    MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES,
@@ -95,6 +96,7 @@ def _generate_supported_model_class_names(
        "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
        "semantic-segmentation": MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
        "backbone": MODEL_FOR_BACKBONE_MAPPING_NAMES,
+        "image-feature-extraction": MODEL_FOR_IMAGE_MAPPING_NAMES,
    }

    if supported_tasks is None:

--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -700,7 +700,10 @@ class ModelTesterMixin:
        for model_class in self.all_model_classes:
            if (
                model_class.__name__
-                in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)]
+                in [
+                    *get_values(MODEL_MAPPING_NAMES),
+                    *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES),
+                ]
                or not model_class.supports_gradient_checkpointing
            ):
                continue

--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -732,6 +732,8 @@ def check_all_auto_object_names_being_defined():
                        # module, if it's a private model defined in this file.
                        if name.endswith("MODEL_MAPPING_NAMES") and is_a_private_model(class_name):
                            continue
+                        if name.endswith("MODEL_FOR_IMAGE_MAPPING_NAMES") and is_a_private_model(class_name):
+                            continue
                        failures.append(
                            f"`{class_name}` appears in the mapping `{name}` but it is not defined in the library."
                        )

--- a/utils/update_metadata.py
+++ b/utils/update_metadata.py
@@ -62,6 +62,7 @@ _re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGenerati
 PIPELINE_TAGS_AND_AUTO_MODELS = [
    ("pretraining", "MODEL_FOR_PRETRAINING_MAPPING_NAMES", "AutoModelForPreTraining"),
    ("feature-extraction", "MODEL_MAPPING_NAMES", "AutoModel"),
+    ("image-feature-extraction", "MODEL_FOR_IMAGE_MAPPING_NAMES", "AutoModel"),
    ("audio-classification", "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES", "AutoModelForAudioClassification"),
    ("text-generation", "MODEL_FOR_CAUSAL_LM_MAPPING_NAMES", "AutoModelForCausalLM"),
    ("automatic-speech-recognition", "MODEL_FOR_CTC_MAPPING_NAMES", "AutoModelForCTC"),