Unverified Commit 7f998612 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add Universal Segmentation class + mapping (#20766)



* Add mapping

* Add mapping to pipeline

* Apply suggestions

* Fix feature extractor tests

* Use ForInstance, add model to universal mapping

* More fixes

* Remove model from deprecated objects
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent e65445b4
...@@ -254,6 +254,10 @@ The following auto classes are available for the following computer vision tasks ...@@ -254,6 +254,10 @@ The following auto classes are available for the following computer vision tasks
[[autodoc]] AutoModelForInstanceSegmentation [[autodoc]] AutoModelForInstanceSegmentation
### AutoModelForUniversalSegmentation
[[autodoc]] AutoModelForUniversalSegmentation
### AutoModelForZeroShotObjectDetection ### AutoModelForZeroShotObjectDetection
[[autodoc]] AutoModelForZeroShotObjectDetection [[autodoc]] AutoModelForZeroShotObjectDetection
......
...@@ -943,6 +943,7 @@ else: ...@@ -943,6 +943,7 @@ else:
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
"MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING",
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
...@@ -974,6 +975,7 @@ else: ...@@ -974,6 +975,7 @@ else:
"AutoModelForSpeechSeq2Seq", "AutoModelForSpeechSeq2Seq",
"AutoModelForTableQuestionAnswering", "AutoModelForTableQuestionAnswering",
"AutoModelForTokenClassification", "AutoModelForTokenClassification",
"AutoModelForUniversalSegmentation",
"AutoModelForVideoClassification", "AutoModelForVideoClassification",
"AutoModelForVision2Seq", "AutoModelForVision2Seq",
"AutoModelForVisualQuestionAnswering", "AutoModelForVisualQuestionAnswering",
...@@ -4113,6 +4115,7 @@ if TYPE_CHECKING: ...@@ -4113,6 +4115,7 @@ if TYPE_CHECKING:
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING,
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
...@@ -4144,6 +4147,7 @@ if TYPE_CHECKING: ...@@ -4144,6 +4147,7 @@ if TYPE_CHECKING:
AutoModelForSpeechSeq2Seq, AutoModelForSpeechSeq2Seq,
AutoModelForTableQuestionAnswering, AutoModelForTableQuestionAnswering,
AutoModelForTokenClassification, AutoModelForTokenClassification,
AutoModelForUniversalSegmentation,
AutoModelForVideoClassification, AutoModelForVideoClassification,
AutoModelForVision2Seq, AutoModelForVision2Seq,
AutoModelForVisualQuestionAnswering, AutoModelForVisualQuestionAnswering,
......
...@@ -67,6 +67,7 @@ else: ...@@ -67,6 +67,7 @@ else:
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
"MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING",
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
...@@ -97,6 +98,7 @@ else: ...@@ -97,6 +98,7 @@ else:
"AutoModelForSpeechSeq2Seq", "AutoModelForSpeechSeq2Seq",
"AutoModelForTableQuestionAnswering", "AutoModelForTableQuestionAnswering",
"AutoModelForTokenClassification", "AutoModelForTokenClassification",
"AutoModelForUniversalSegmentation",
"AutoModelForVideoClassification", "AutoModelForVideoClassification",
"AutoModelForVision2Seq", "AutoModelForVision2Seq",
"AutoModelForVisualQuestionAnswering", "AutoModelForVisualQuestionAnswering",
...@@ -222,6 +224,7 @@ if TYPE_CHECKING: ...@@ -222,6 +224,7 @@ if TYPE_CHECKING:
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING,
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
...@@ -253,6 +256,7 @@ if TYPE_CHECKING: ...@@ -253,6 +256,7 @@ if TYPE_CHECKING:
AutoModelForSpeechSeq2Seq, AutoModelForSpeechSeq2Seq,
AutoModelForTableQuestionAnswering, AutoModelForTableQuestionAnswering,
AutoModelForTokenClassification, AutoModelForTokenClassification,
AutoModelForUniversalSegmentation,
AutoModelForVideoClassification, AutoModelForVideoClassification,
AutoModelForVision2Seq, AutoModelForVision2Seq,
AutoModelForVisualQuestionAnswering, AutoModelForVisualQuestionAnswering,
......
...@@ -434,6 +434,15 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict( ...@@ -434,6 +434,15 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict( MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict(
[ [
# Model for Instance Segmentation mapping # Model for Instance Segmentation mapping
# MaskFormerForInstanceSegmentation can be removed from this mapping in v5
("maskformer", "MaskFormerForInstanceSegmentation"),
]
)
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict(
[
# Model for Universal Segmentation mapping
("detr", "DetrForSegmentation"),
("maskformer", "MaskFormerForInstanceSegmentation"), ("maskformer", "MaskFormerForInstanceSegmentation"),
] ]
) )
...@@ -892,6 +901,9 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping( ...@@ -892,6 +901,9 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping(
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping( MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES
) )
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES
)
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping( MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
) )
...@@ -1083,6 +1095,15 @@ AutoModelForSemanticSegmentation = auto_class_update( ...@@ -1083,6 +1095,15 @@ AutoModelForSemanticSegmentation = auto_class_update(
) )
class AutoModelForUniversalSegmentation(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING
AutoModelForUniversalSegmentation = auto_class_update(
AutoModelForUniversalSegmentation, head_doc="universal image segmentation"
)
class AutoModelForInstanceSegmentation(_BaseAutoModelClass): class AutoModelForInstanceSegmentation(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING _model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING
......
...@@ -16,6 +16,7 @@ if is_torch_available(): ...@@ -16,6 +16,7 @@ if is_torch_available():
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
) )
...@@ -75,6 +76,7 @@ class ImageSegmentationPipeline(Pipeline): ...@@ -75,6 +76,7 @@ class ImageSegmentationPipeline(Pipeline):
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()
+ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items()
+ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() + MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items()
+ MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING.items()
) )
) )
......
...@@ -446,6 +446,9 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None ...@@ -446,6 +446,9 @@ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = None
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None
...@@ -639,6 +642,13 @@ class AutoModelForTokenClassification(metaclass=DummyObject): ...@@ -639,6 +642,13 @@ class AutoModelForTokenClassification(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
class AutoModelForUniversalSegmentation(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class AutoModelForVideoClassification(metaclass=DummyObject): class AutoModelForVideoClassification(metaclass=DummyObject):
_backends = ["torch"] _backends = ["torch"]
......
...@@ -239,7 +239,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ ...@@ -239,7 +239,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"VisualBertForMultipleChoice", "VisualBertForMultipleChoice",
"TFWav2Vec2ForCTC", "TFWav2Vec2ForCTC",
"TFHubertForCTC", "TFHubertForCTC",
"MaskFormerForInstanceSegmentation",
"XCLIPVisionModel", "XCLIPVisionModel",
"XCLIPTextModel", "XCLIPTextModel",
] ]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment