Unverified commit 983451a1, authored by Alara Dirik, committed by GitHub

Improve and fix ImageSegmentationPipeline (#19367)

- Fixes the image segmentation pipeline test failures caused by changes to the postprocessing methods of supported models
- Updates the ImageSegmentationPipeline tests
- Improves docs, adds a 'task' argument to optionally perform semantic, instance or panoptic segmentation (see the usage sketch below)
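A minimal usage sketch of the new `task` argument; the checkpoint, image URL and printed fields below are illustrative, not part of this commit:

```python
from transformers import pipeline

# Example checkpoint (assumption): any model whose feature extractor implements the
# matching post_process_*_segmentation method should behave the same way.
segmenter = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")
outputs = segmenter(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    task="panoptic",                  # or "semantic" / "instance", model permitting
    threshold=0.9,                    # drop low-confidence segments
    overlap_mask_area_threshold=0.5,  # drop small, mostly-overlapped fragments
)
for o in outputs:
    print(o["label"], o["score"], o["mask"].size)
```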
parent de4d71ea
@@ -12,9 +12,6 @@ if is_vision_available():
from ..image_utils import load_image
if is_torch_available():
import torch
from torch import nn
from ..models.auto.modeling_auto import (
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
@@ -59,13 +56,15 @@ class ImageSegmentationPipeline(Pipeline):
def _sanitize_parameters(self, **kwargs):
postprocess_kwargs = {}
if "task" in kwargs:
postprocess_kwargs["task"] = kwargs["task"]
if "threshold" in kwargs:
postprocess_kwargs["threshold"] = kwargs["threshold"]
if "mask_threshold" in kwargs:
postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
if "overlap_mask_area_threshold" in kwargs:
postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
return {}, {}, postprocess_kwargs
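For orientation, the three dicts returned here are routed to `preprocess`, `_forward` and `postprocess` respectively, so every recognized kwarg above ends up as a `postprocess` argument. A sketch of that routing, assuming a hypothetical pipeline instance named `segmenter`:

```python
# Assumed illustration of kwarg routing; `segmenter` is a hypothetical instance.
preprocess_params, forward_params, postprocess_params = segmenter._sanitize_parameters(
    task="panoptic", threshold=0.8
)
assert preprocess_params == {} and forward_params == {}
assert postprocess_params == {"task": "panoptic", "threshold": 0.8}
```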
def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
def __call__(self, images, **kwargs) -> Union[Predictions, List[Prediction]]:
"""
Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
@@ -79,30 +78,34 @@
The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
same format: all as HTTP(S) links, all as local paths, or all as PIL images.
task (`str`, defaults to `semantic`):
    Segmentation task to be performed; choose one of `semantic`, `instance` or `panoptic`, depending on
    the model's capabilities.
threshold (`float`, *optional*, defaults to 0.9):
    The probability necessary to make a prediction.
    Probability threshold to filter out predicted masks.
mask_threshold (`float`, *optional*, defaults to 0.5):
    Threshold to use when turning the predicted masks into binary values.
overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
    Mask overlap threshold to eliminate small, disconnected segments.
Return:
A dictionary or a list of dictionaries containing the result. If the input is a single image, returns a
list of dictionaries; if the input is a list of several images, returns a list of lists of dictionaries
corresponding to each image.
The dictionaries contain the following keys:
The dictionaries contain the mask, label and score (where applicable) of each detected object and contain
the following keys:
- **label** (`str`) -- The class label identified by the model.
- **mask** (`PIL.Image`) -- PIL Image with size (height, width) of the original image. Pixel values in the
image are in the range 0-255. 0 means the pixel is *not* part of the *label*, 255 means it definitely is.
- **mask** (`PIL.Image`) -- A binary mask of the detected object, as a PIL Image with the (width, height)
of the original image. Returns a mask filled with zeros if no object is found.
- **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence for
the "object" described by the label and the mask.
"""
return super().__call__(*args, **kwargs)
return super().__call__(images, **kwargs)
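A sketch of the single-image versus batch contract described in the docstring; `segmenter` and the file names are placeholders:

```python
single = segmenter("one_image.jpg")             # list of dicts, one per detected segment
batch = segmenter(["first.jpg", "second.jpg"])  # list of lists of dicts, one list per image
assert isinstance(single[0], dict)
assert isinstance(batch[0], list)
```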
def preprocess(self, image):
image = load_image(image)
target_size = torch.IntTensor([[image.height, image.width]])
target_size = [(image.height, image.width)]
inputs = self.feature_extractor(images=[image], return_tensors="pt")
inputs["target_size"] = target_size
return inputs
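Note the `target_size` change above: the updated post-processing methods expect a plain list of `(height, width)` tuples rather than a `torch.IntTensor`. A small sketch, with illustrative dimensions:

```python
from PIL import Image

image = Image.new("RGB", (640, 480))         # width=640, height=480
target_size = [(image.height, image.width)]  # [(480, 640)], one tuple per image
```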
@@ -113,66 +116,65 @@ class ImageSegmentationPipeline(Pipeline):
model_outputs["target_size"] = target_size
return model_outputs
def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
    if hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
        outputs = self.feature_extractor.post_process_panoptic_segmentation(
            model_outputs, object_mask_threshold=threshold
        )[0]
        annotation = []
        segmentation = outputs["segmentation"]
        for segment in outputs["segments"]:
            mask = (segmentation == segment["id"]) * 255
            mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
            label = self.model.config.id2label[segment["label_id"]]
            annotation.append({"mask": mask, "label": label, "score": None})
    elif hasattr(self.feature_extractor, "post_process_segmentation"):
        # Panoptic
        raw_annotations = self.feature_extractor.post_process_segmentation(
            model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
        )
        raw_annotation = raw_annotations[0]
        raw_annotation["masks"] *= 255  # [0,1] -> [0,255] black and white pixels
        raw_annotation["scores"] = raw_annotation["scores"].tolist()
        raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
        raw_annotation["masks"] = [
            Image.fromarray(mask.numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"]
        ]
        # {"scores": [...], ...} --> [{"score":x, ...}, ...]
        keys = ["score", "label", "mask"]
        annotation = [
            dict(zip(keys, vals))
            for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
        ]
    else:
        # Default logits
        logits = model_outputs.logits
        logits = logits.softmax(dim=1)
        if len(logits.shape) != 4:
            raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}")
        batch_size, num_labels, height, width = logits.shape
        expected_num_labels = len(self.model.config.id2label)
        if num_labels != expected_num_labels:
            raise ValueError(
                f"Logits don't have expected dimensions, expected [1, {expected_num_labels}, H, W], got {logits.shape}"
            )
        size = model_outputs["target_size"].squeeze(0).tolist()
        logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False)
        classes = logits_reshaped.argmax(dim=1)[0]
        annotation = []
        for label_id in range(num_labels):
            label = self.model.config.id2label[label_id]
            mask = classes == label_id
            mask_sum = mask.sum()
            # Remove empty masks.
            if mask_sum == 0:
                continue
            mask = Image.fromarray((mask * 255).numpy().astype(np.uint8), mode="L")
            # Semantic segmentation does not output a global score for the mask
            # so we don't attempt to compute one.
            # XXX: We could send a mask with values between 0 and 255 instead
            # of a pure mask to enable users to get the probabilities that
            # are really outputted by the logits.
            annotation.append({"score": None, "label": label, "mask": mask})
    return annotation

def postprocess(self, model_outputs, task="semantic", threshold=0.9, overlap_mask_area_threshold=0.5):
    if task == "instance" and hasattr(self.feature_extractor, "post_process_instance_segmentation"):
        outputs = self.feature_extractor.post_process_instance_segmentation(
            model_outputs,
            threshold=threshold,
            overlap_mask_area_threshold=overlap_mask_area_threshold,
            target_sizes=model_outputs["target_size"],
        )[0]
        annotation = []
        segmentation = outputs["segmentation"]
        if len(outputs["segments_info"]) == 0:
            mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
            annotation.append({"mask": mask, "label": None, "score": 0.0})
        else:
            for segment in outputs["segments_info"]:
                mask = (segmentation == segment["id"]) * 255
                mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
                label = self.model.config.id2label[segment["label_id"]]
                score = segment["score"]
                annotation.append({"mask": mask, "label": label, "score": score})
    elif task == "panoptic" and hasattr(self.feature_extractor, "post_process_panoptic_segmentation"):
        outputs = self.feature_extractor.post_process_panoptic_segmentation(
            model_outputs,
            threshold=threshold,
            overlap_mask_area_threshold=overlap_mask_area_threshold,
            target_sizes=model_outputs["target_size"],
        )[0]
        annotation = []
        segmentation = outputs["segmentation"]
        if len(outputs["segments_info"]) == 0:
            mask = Image.fromarray(np.zeros(segmentation.shape).astype(np.uint8), mode="L")
            annotation.append({"mask": mask, "label": None, "score": 0.0})
        else:
            for segment in outputs["segments_info"]:
                mask = (segmentation == segment["id"]) * 255
                mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
                label = self.model.config.id2label[segment["label_id"]]
                score = segment["score"]
                annotation.append({"score": score, "label": label, "mask": mask})
    elif task == "semantic" and hasattr(self.feature_extractor, "post_process_semantic_segmentation"):
        outputs = self.feature_extractor.post_process_semantic_segmentation(
            model_outputs, target_sizes=model_outputs["target_size"]
        )[0]
        annotation = []
        segmentation = outputs.numpy()
        labels = np.unique(segmentation)
        for label in labels:
            mask = (segmentation == label) * 255
            mask = Image.fromarray(mask.astype(np.uint8), mode="L")
            label = self.model.config.id2label[label]
            annotation.append({"score": None, "label": label, "mask": mask})
    else:
        raise ValueError(f"task {task} is not supported for model {self.model}")
    return annotation
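And the semantic path for comparison, which returns one mask per label present and `score=None`; the checkpoint and file name below are examples, not part of this diff:

```python
from transformers import pipeline

# Assumed example: a checkpoint whose feature extractor implements
# post_process_semantic_segmentation.
sem_segmenter = pipeline("image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512")
sem_outputs = sem_segmenter("scene.jpg", task="semantic")
labels = [o["label"] for o in sem_outputs]  # one entry per label found in the image
```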
@@ -74,9 +74,6 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
}
def get_test_pipeline(self, model, tokenizer, feature_extractor):
# Fix me Alara
if model.__class__.__name__ in ["DetrForSegmentation", "MaskFormerForInstanceSegmentation"]:
return None, None
image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
return image_segmenter, [
"./tests/fixtures/tests_samples/COCO/000000039769.png",
@@ -150,7 +147,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
pass
@require_torch
@unittest.skip("Fix me Alara!")
@unittest.skip("No weights found for hf-internal-testing/tiny-detr-mobilenetsv3-panoptic")
def test_small_model_pt(self):
model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic"
@@ -158,9 +155,15 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
outputs = image_segmenter(
"http://images.cocodataset.org/val2017/000000039769.jpg",
task="panoptic",
threshold=0.0,
overlap_mask_area_threshold=0.0,
)
# Shortening by hashing
for o in outputs:
# shortening by hashing
o["mask"] = hashimage(o["mask"])
self.assertEqual(
@@ -235,12 +238,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
{
"score": None,
"label": "LABEL_0",
"mask": "6225140faf502d272af076222776d7e4",
"mask": "775518a7ed09eea888752176c6ba8f38",
},
{
"score": None,
"label": "LABEL_1",
"mask": "8297c9f8eb43ddd3f32a6dae21e015a1",
"mask": "a12da23a46848128af68c63aa8ba7a02",
},
],
)
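The tests above compare masks by hash rather than pixel-by-pixel. A sketch of what the `hashimage` helper might look like; the actual helper lives in the test utilities and may differ:

```python
import hashlib

def hashimage(image):
    # Reduce a PIL mask to a short, stable fingerprint for test assertions.
    return hashlib.md5(image.tobytes()).hexdigest()
```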
@@ -249,22 +252,28 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
@slow
def test_integration_torch_image_segmentation(self):
model_id = "facebook/detr-resnet-50-panoptic"
image_segmenter = pipeline("image-segmentation", model=model_id)
outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
outputs = image_segmenter(
"http://images.cocodataset.org/val2017/000000039769.jpg",
task="panoptic",
threshold=0,
overlap_mask_area_threshold=0.0,
)
# Shortening by hashing
for o in outputs:
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
[
{"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
{"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
{"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
{"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
{"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
{"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
{"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
{"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
{"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
{"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
{"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
{"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
],
)
@@ -273,8 +282,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
"http://images.cocodataset.org/val2017/000000039769.jpg",
"http://images.cocodataset.org/val2017/000000039769.jpg",
],
task="panoptic",
threshold=0.0,
overlap_mask_area_threshold=0.0,
)
# Shortening by hashing
for output in outputs:
for o in output:
o["mask"] = hashimage(o["mask"])
@@ -283,20 +296,20 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
nested_simplify(outputs, decimals=4),
[
[
{"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
{"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
{"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
{"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
{"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
{"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
{"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
{"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
{"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
{"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
{"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
{"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
],
[
{"score": 0.9094, "label": "blanket", "mask": "6500201749480f87154fd967783b2b97"},
{"score": 0.9941, "label": "cat", "mask": "f3a7f80220788acc0245ebc084df6afc"},
{"score": 0.9987, "label": "remote", "mask": "7703408f54da1d0ebda47841da875e48"},
{"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
{"score": 0.9722, "label": "couch", "mask": "226d6dcb98bebc3fbc208abdc0c83196"},
{"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
{"score": 0.9094, "label": "blanket", "mask": "dcff19a97abd8bd555e21186ae7c066a"},
{"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
{"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
{"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
{"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
{"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
],
],
)
@@ -304,12 +317,27 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
@require_torch
@slow
def test_threshold(self):
threshold = 0.999
model_id = "facebook/detr-resnet-50-panoptic"
image_segmenter = pipeline("image-segmentation", model=model_id)
outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
outputs = image_segmenter(
"http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.999
)
# Shortening by hashing
for o in outputs:
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
[
{"score": 0.9995, "label": "remote", "mask": "d02404f5789f075e3b3174adbc3fd5b8"},
{"score": 0.9994, "label": "cat", "mask": "eaa115b40c96d3a6f4fe498963a7e470"},
],
)
outputs = image_segmenter(
"http://images.cocodataset.org/val2017/000000039769.jpg", task="panoptic", threshold=0.5
)
for o in outputs:
o["mask"] = hashimage(o["mask"])
@@ -317,8 +345,11 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
self.assertEqual(
nested_simplify(outputs, decimals=4),
[
{"score": 0.9995, "label": "remote", "mask": "bd726918f10fed3efaef0091e11f923b"},
{"score": 0.9994, "label": "cat", "mask": "fa5d8d5c329546ba5339f3095641ef56"},
{"score": 0.9941, "label": "cat", "mask": "9c0af87bd00f9d3a4e0c8888e34e70e2"},
{"score": 0.9987, "label": "remote", "mask": "c7870600d6c02a1f6d96470fc7220e8e"},
{"score": 0.9995, "label": "remote", "mask": "ef899a25fd44ec056c653f0ca2954fdd"},
{"score": 0.9722, "label": "couch", "mask": "37b8446ac578a17108aa2b7fccc33114"},
{"score": 0.9994, "label": "cat", "mask": "6a09d3655efd8a388ab4511e4cbbb797"},
],
)
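The two runs above show the `threshold` semantics: raising it filters out lower-confidence segments before masks are built. A compact sketch of the expected monotonicity, reusing the test's segmenter and URL:

```python
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
high = image_segmenter(url, task="panoptic", threshold=0.999)  # two segments above
low = image_segmenter(url, task="panoptic", threshold=0.5)     # five segments above
assert len(high) <= len(low)  # a higher threshold can only remove segments
```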
@@ -335,20 +366,21 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test")
file = image[0]["file"]
outputs = image_segmenter(file, threshold=threshold)
outputs = image_segmenter(file, task="panoptic", threshold=threshold)
# Shortening by hashing
for o in outputs:
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
[
{"mask": "20d1b9480d1dc1501dbdcfdff483e370", "label": "wall", "score": None},
{"mask": "0f902fbc66a0ff711ea455b0e4943adf", "label": "house", "score": None},
{"mask": "4537bdc07d47d84b3f8634b7ada37bd4", "label": "grass", "score": None},
{"mask": "b7ac77dfae44a904b479a0926a2acaf7", "label": "tree", "score": None},
{"mask": "e9bedd56bd40650fb263ce03eb621079", "label": "plant", "score": None},
{"mask": "37a609f8c9c1b8db91fbff269f428b20", "label": "road, route", "score": None},
{"mask": "0d8cdfd63bae8bf6e4344d460a2fa711", "label": "sky", "score": None},
{"score": 0.9974, "label": "wall", "mask": "a547b7c062917f4f3e36501827ad3cd6"},
{"score": 0.949, "label": "house", "mask": "0da9b7b38feac47bd2528a63e5ea7b19"},
{"score": 0.9995, "label": "grass", "mask": "1d07ea0a263dcf38ca8ae1a15fdceda1"},
{"score": 0.9976, "label": "tree", "mask": "6cdc97c7daf1dc596fa181f461ddd2ba"},
{"score": 0.8239, "label": "plant", "mask": "1ab4ce378f6ceff57d428055cfbd742f"},
{"score": 0.9942, "label": "road, route", "mask": "39c5d17be53b2d1b0f46aad8ebb15813"},
{"score": 1.0, "label": "sky", "mask": "a3756324a692981510c39b1a59510a36"},
],
)