"docs/vscode:/vscode.git/clone" did not exist on "ae320fa53f74cc4dfa0e4fc3c95b6129a86b0512"
Unverified commit 983451a1, authored by Alara Dirik, committed by GitHub

Improve and fix ImageSegmentationPipeline (#19367)

- Fixes the image segmentation pipeline test failures caused by changes to the postprocessing methods of supported models
- Updates the ImageSegmentationPipeline tests
- Improves the docs and adds a `task` argument to optionally perform semantic, instance, or panoptic segmentation (see the usage sketch below)
parent de4d71ea
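
For orientation, a minimal usage sketch of the changed API follows. It is not part of the commit: the checkpoint is the panoptic-capable model the slow tests below rely on, and the keyword arguments follow the updated docstring.

    from transformers import pipeline

    # Panoptic-capable checkpoint, the same one used by the slow tests below.
    segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")

    outputs = segmenter(
        "http://images.cocodataset.org/val2017/000000039769.jpg",
        task="panoptic",                  # new argument: "semantic" (default), "instance" or "panoptic"
        threshold=0.9,                    # probability threshold to filter out predicted masks
        overlap_mask_area_threshold=0.5,  # replaces the old `mask_threshold` kwarg
    )

    # Each prediction is a dict: {"score": float or None, "label": str, "mask": PIL.Image}
    for prediction in outputs:
        print(prediction["label"], prediction["score"])

With task="semantic" the score field is None, since the semantic branch of postprocess does not compute a per-mask confidence.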
@@ -12,9 +12,6 @@ if is_vision_available():
     from ..image_utils import load_image

 if is_torch_available():
-    import torch
-    from torch import nn
-
     from ..models.auto.modeling_auto import (
         MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
         MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
@@ -59,13 +56,15 @@ class ImageSegmentationPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
         postprocess_kwargs = {}
+        if "task" in kwargs:
+            postprocess_kwargs["task"] = kwargs["task"]
         if "threshold" in kwargs:
             postprocess_kwargs["threshold"] = kwargs["threshold"]
-        if "mask_threshold" in kwargs:
-            postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
+        if "overlap_mask_area_threshold" in kwargs:
+            postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
         return {}, {}, postprocess_kwargs

-    def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
+    def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]:
         """
         Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
@@ -79,30 +78,34 @@ class ImageSegmentationPipeline(Pipeline):
                 The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                 same format: all as HTTP(S) links, all as local paths, or all as PIL images.
+            task (`str`, defaults to `semantic`):
+                Segmentation task to be performed, choose [`semantic`, `instance` and `panoptic`] depending on model
+                capabilities.
             threshold (`float`, *optional*, defaults to 0.9):
-                The probability necessary to make a prediction.
-            mask_threshold (`float`, *optional*, defaults to 0.5):
-                Threshold to use when turning the predicted masks into binary values.
+                Probability threshold to filter out predicted masks.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
+                Mask overlap threshold to eliminate small, disconnected segments.

         Return:
             A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
             list of dictionaries, if the input is a list of several images, will return a list of list of dictionaries
             corresponding to each image.

-            The dictionaries contain the following keys:
+            The dictionaries contain the mask, label and score (where applicable) of each detected object and contain
+            the following keys:

             - **label** (`str`) -- The class label identified by the model.
-            - **mask** (`PIL.Image`) -- Pil Image with size (heigth, width) of the original image. Pixel values in the
-              image are in the range 0-255. 0 means the pixel is *not* part of the *label*, 255 means it definitely is.
+            - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape (width, height) of
+              the original image. Returns a mask filled with zeros if no object is found.
             - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the
               "object" described by the label and the mask.
         """
-        return super().__call__(*args, **kwargs)
+        return super().__call__(images, **kwargs)

     def preprocess(self, image):
         image = load_image(image)
-        target_size = torch.IntTensor([[image.height, image.width]])
+        target_size = [(image.height, image.width)]
         inputs = self.feature_extractor(images=[image], return_tensors="pt")
         inputs["target_size"] = target_size
         return inputs
@@ -113,66 +116,65 @@ class ImageSegmentationPipeline(Pipeline):
         model_outputs["target_size"] = target_size
         return model_outputs

-    def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
-        if hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
-            outputs = self.feature_extractor.post_process_panoptic_segmentation(
-                model_outputs, object_mask_threshold=threshold
-            )[0]
-            annotation = []
-            segmentation = outputs["segmentation"]
-            for segment in outputs["segments"]:
-                mask = (segmentation == segment["id"]) * 255
-                mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
-                label = self.model.config.id2label[segment["label_id"]]
-                annotation.append({"mask": mask, "label": label, "score": None})
-        elif hasattr(self.feature_extractor, "post_process_segmentation"):
-            # Panoptic
-            raw_annotations = self.feature_extractor.post_process_segmentation(
-                model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
-            )
-            raw_annotation = raw_annotations[0]
-            raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
-            raw_annotation["scores"] = raw_annotation["scores"].tolist()
-            raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
-            raw_annotation["masks"] = [
-                Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"]
-            ]
-            # {"scores": [...], ...} --> [{"score":x, ...}, ...]
-            keys = ["score", "label", "mask"]
-            annotation = [
-                dict(zip(keys, vals))
-                for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
-            ]
-        else:
-            # Default logits
-            logits = model_outputs.logits
-            logits = logits.softmax(dim=1)
-            if len(logits.shape) != 4:
-                raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}")
-            batch_size, num_labels, height, width = logits.shape
-            expected_num_labels = len(self.model.config.id2label)
-            if num_labels != expected_num_labels:
-                raise ValueError(
-                    f"Logits don't have expected dimensions, expected [1, {num_labels}, H, W], got {logits.shape}"
-                )
-            size = model_outputs["target_size"].squeeze(0).tolist()
-            logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False)
-            classes = logits_reshaped.argmax(dim=1)[0]
-            annotation = []
-            for label_id in range(num_labels):
-                label = self.model.config.id2label[label_id]
-                mask = classes == label_id
-                mask_sum = mask.sum()
-                # Remove empty masks.
-                if mask_sum == 0:
-                    continue
-                mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
-                # Semantic segmentation does not output a global score for the mask
-                # so we don't attempt to compute one.
-                # XXX: We could send a mask with values between 0 and 255 instead
-                # of a pure mask to enable users to get the probabilities that
-                # are really outputted by the logits.
-                annotation.append({"score": None, "label": label, "mask": mask})
+    def postprocess(self, model_outputs, task="semantic", threshold=0.9, overlap_mask_area_threshold=0.5):
+        if task == "instance" and hasattr(self.feature_extractor, "post_process_instance_segmentation"):
+            outputs = self.feature_extractor.post_process_panoptic_segmentation(
+                model_outputs,
+                threshold=threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                target_sizes=model_outputs["target_size"],
+            )[0]
+
+            annotation = []
+            segmentation = outputs["segmentation"]
+
+            if len(outputs["segments_info"]) == 0:
+                mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
+                annotation.append({"mask": mask, "label": None, "score": 0.0})
+            else:
+                for segment in outputs["segments_info"]:
+                    mask = (segmentation == segment["id"]) * 255
+                    mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
+                    label = self.model.config.id2label[segment["label_id"]]
+                    score = segment["score"]
+                    annotation.append({"mask": mask, "label": label, "score": score})
+
+        elif task == "panoptic" and hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
+            outputs = self.feature_extractor.post_process_panoptic_segmentation(
+                model_outputs,
+                threshold=threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                target_sizes=model_outputs["target_size"],
+            )[0]
+
+            annotation = []
+            segmentation = outputs["segmentation"]
+
+            if len(outputs["segments_info"]) == 0:
+                mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
+                annotation.append({"mask": mask, "label": None, "score": 0.0})
+            else:
+                for segment in outputs["segments_info"]:
+                    mask = (segmentation == segment["id"]) * 255
+                    mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
+                    label = self.model.config.id2label[segment["label_id"]]
+                    score = segment["score"]
+                    annotation.append({"score": score, "label": label, "mask": mask})
+
+        elif task == "semantic" and hasattr(self.feature_extractor, "post_process_semantic_segmentation"):
+            outputs = self.feature_extractor.post_process_semantic_segmentation(
+                model_outputs, target_sizes=model_outputs["target_size"]
+            )[0]
+
+            annotation = []
+            segmentation = outputs.numpy()
+            labels = np.unique(segmentation)
+
+            for label in labels:
+                mask = (segmentation == label) * 255
+                mask = Image.fromarray(mask, mode="L")
+                label = self.model.config.id2label[label]
+                annotation.append({"score": None, "label": label, "mask": mask})
+        else:
+            raise ValueError(f"task {task} is not supported for model {self.model}")
         return annotation
@@ -74,9 +74,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     }

     def get_test_pipeline(self, model, tokenizer, feature_extractor):
-        # Fix me Alara
-        if model.__class__.__name__ in ["DetrForSegmentation", "MaskFormerForInstanceSegmentation"]:
-            return None, None
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
         return image_segmenter, [
             "./tests/fixtures/tests_samples/COCO/000000039769.png",
@@ -150,7 +147,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         pass

     @require_torch
-    @unittest.skip("Fix me Alara!")
+    @unittest.skip("No weights found for hf-internal-testing/tiny-detr-mobilenetsv3-panoptic")
     def test_small_model_pt(self):
         model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic"
@@ -158,9 +155,15 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
         image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)

-        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            task="panoptic",
+            threshold=0.0,
+            overlap_mask_area_threshold=0.0,
+        )
+
+        # Shortening by hashing
         for o in outputs:
-            # shortening by hashing
             o["mask"] = hashimage(o["mask"])

         self.assertEqual(
@@ -235,12 +238,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
                 {
                     "score": None,
                     "label": "LABEL_0",
-                    "mask": "6225140faf502d272af076222776d7e4",
+                    "mask": "775518a7ed09eea888752176c6ba8f38",
                 },
                 {
                     "score": None,
                     "label": "LABEL_1",
-                    "mask": "8297c9f8eb43ddd3f32a6dae21e015a1",
+                    "mask": "a12da23a46848128af68c63aa8ba7a02",
                 },
             ],
         )
@@ -249,22 +252,28 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     @slow
     def test_integration_torch_image_segmentation(self):
         model_id = "facebook/detr-resnet-50-panoptic"

         image_segmenter = pipeline("image-segmentation", model=model_id)

-        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            task="panoptic",
+            threshold=0,
+            overlap_mask_area_threshold=0.0,
+        )
+
+        # Shortening by hashing
         for o in outputs:
             o["mask"] = hashimage(o["mask"])

         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
-                {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
-                {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
-                {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
-                {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
-                {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
+                {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
             ],
         )
@@ -273,8 +282,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
                 "http://images.cocodataset.org/val2017/000000039769.jpg",
             ],
+            task="panoptic",
             threshold=0.0,
+            overlap_mask_area_threshold=0.0,
         )
+
+        # Shortening by hashing
         for output in outputs:
             for o in output:
                 o["mask"] = hashimage(o["mask"])
@@ -283,20 +296,20 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
             nested_simplify(outputs, decimals=4),
             [
                 [
-                    {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
-                    {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
-                    {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
-                    {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                    {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
-                    {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                    {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
+                    {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                    {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                    {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                    {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                    {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
                 ],
                 [
-                    {"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
-                    {"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
-                    {"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
-                    {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                    {"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
-                    {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                    {"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
+                    {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                    {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                    {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                    {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                    {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
                 ],
             ],
         )
@@ -304,12 +317,27 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     @require_torch
     @slow
     def test_threshold(self):
-        threshold = 0.999
         model_id = "facebook/detr-resnet-50-panoptic"

        image_segmenter = pipeline("image-segmentation", model=model_id)

-        outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.999
+        )
+
+        # Shortening by hashing
+        for o in outputs:
+            o["mask"] = hashimage(o["mask"])
+
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9995, "label": "remote", "mask": "d02404f5789f075e3b3174adbc3fd5b8"},
+                {"score": 0.9994, "label": "cat", "mask": "eaa115b40c96d3a6f4fe498963a7e470"},
+            ],
+        )
+
+        outputs = image_segmenter(
+            "http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.5
+        )

         for o in outputs:
             o["mask"] = hashimage(o["mask"])
@@ -317,8 +345,11 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
-                {"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
-                {"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
+                {"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
+                {"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
+                {"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
+                {"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
+                {"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
             ],
         )
@@ -335,20 +366,21 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
         file = image[0]["file"]
-        outputs = image_segmenter(file, threshold=threshold)
+        outputs = image_segmenter(file, task="panoptic", threshold=threshold)
+
+        # Shortening by hashing
         for o in outputs:
             o["mask"] = hashimage(o["mask"])

         self.assertEqual(
             nested_simplify(outputs, decimals=4),
             [
-                {"mask": "20d1b9480d1dc1501dbdcfdff483e370", "label": "wall", "score": None},
-                {"mask": "0f902fbc66a0ff711ea455b0e4943adf", "label": "house", "score": None},
-                {"mask": "4537bdc07d47d84b3f8634b7ada37bd4", "label": "grass", "score": None},
-                {"mask": "b7ac77dfae44a904b479a0926a2acaf7", "label": "tree", "score": None},
-                {"mask": "e9bedd56bd40650fb263ce03eb621079", "label": "plant", "score": None},
-                {"mask": "37a609f8c9c1b8db91fbff269f428b20", "label": "road, route", "score": None},
-                {"mask": "0d8cdfd63bae8bf6e4344d460a2fa711", "label": "sky", "score": None},
+                {"score": 0.9974, "label": "wall", "mask": "a547b7c062917f4f3e36501827ad3cd6"},
+                {"score": 0.949, "label": "house", "mask": "0da9b7b38feac47bd2528a63e5ea7b19"},
+                {"score": 0.9995, "label": "grass", "mask": "1d07ea0a263dcf38ca8ae1a15fdceda1"},
+                {"score": 0.9976, "label": "tree", "mask": "6cdc97c7daf1dc596fa181f461ddd2ba"},
+                {"score": 0.8239, "label": "plant", "mask": "1ab4ce378f6ceff57d428055cfbd742f"},
+                {"score": 0.9942, "label": "road, route", "mask": "39c5d17be53b2d1b0f46aad8ebb15813"},
+                {"score": 1.0, "label": "sky", "mask": "a3756324a692981510c39b1a59510a36"},
             ],
         )