Unverified Commit 9e71d464 authored by Nicolas Patry, committed by GitHub

Enable `image-segmentation` on `AutoModelForSemanticSegmentation` (#15647)

* Enabling Beit and SegFormer in `image-segmentation`.

* Fixing the score.

* Fix import?

* Add missing type hint.

* Multiple test fixes:

- Add `raw_image` support. It should be the default IMHO, since in the Python
  world it doesn't make any sense to base64-encode the image (sorry
  @mishig, I didn't catch that in my review). I really think we should
  consider breaking BC here.
- Add support for the Segformer tiny test (needed
  `SegformerModelTester.get_config` to enable the tiny config,
  @NielsRogge).
- Add a check that `batch_size` works correctly on that pipeline.
  This uncovered that it doesn't for Detr, which IMO is OK since images
  after the `feature_extractor` don't all have the same size. A comment
  in the code explains why.

* Type hint as a string.

* Make fixup + update black.

* torch+vision protections.

* Don't use torchvision, use F.interpolate instead (no new dep).

* Last fixes for Segformer.

* Update test to reflect new image (which was broken)

* Update tests.

* Major BC modification:

- Removed the base64-compressed PNG string; encoding is a job for users,
  `transformers` stays in Python land. The mask is now returned as a
  `PIL.Image` (see the usage sketch below).
- Removed the `score` for semantic segmentation. It hardly has a meaning
  on its own in this context.
- Don't include the grayscale logits map for now (which could give users
  a sense of confidence). Might be done later.
- Don't include the surface area of the mask (which users could use for
  sorting, to filter out small masks). It's already computable from the
  mask, and it's easier to add later than to add now and break BC later
  if we need to.

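A minimal usage sketch of the new output contract, assuming any semantic segmentation checkpoint (the checkpoint name below is illustrative):

```python
from transformers import pipeline

segmenter = pipeline("image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512")
outputs = segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
for o in outputs:
    # `mask` is now a `PIL.Image.Image`; `score` is `None` for semantic segmentation.
    print(o["label"], o["score"], o["mask"].size)
```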
* `make fixup`.

* Small changes.

* Rebase + doc fixup.
parent 1b239797
@@ -680,6 +680,7 @@ if is_torch_available():
"MODEL_FOR_OBJECT_DETECTION_MAPPING",
"MODEL_FOR_PRETRAINING_MAPPING",
"MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
@@ -2850,6 +2851,7 @@ if TYPE_CHECKING:
MODEL_FOR_OBJECT_DETECTION_MAPPING,
MODEL_FOR_PRETRAINING_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
@@ -103,6 +103,7 @@ if is_torch_available():
AutoModelForMaskedLM,
AutoModelForObjectDetection,
AutoModelForQuestionAnswering,
AutoModelForSemanticSegmentation,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoModelForSpeechSeq2Seq,
@@ -264,7 +265,7 @@ SUPPORTED_TASKS = {
"image-segmentation": {
"impl": ImageSegmentationPipeline,
"tf": (),
"pt": (AutoModelForImageSegmentation,) if is_torch_available() else (),
"pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
"default": {"model": {"pt": "facebook/detr-resnet-50-panoptic"}},
"type": "image",
},
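With both auto-classes registered on the task, `pipeline()` resolves semantic segmentation checkpoints in addition to the panoptic default. A hedged sketch (the Segformer checkpoint name is illustrative):

```python
from transformers import pipeline

# The task default: a panoptic Detr checkpoint, via AutoModelForImageSegmentation.
panoptic = pipeline("image-segmentation")
# A semantic checkpoint now resolves too, via AutoModelForSemanticSegmentation.
semantic = pipeline("image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512")
```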
import base64
import io
from typing import Any, Dict, List, Union
import numpy as np
@@ -16,8 +14,13 @@ if is_vision_available():
if is_torch_available():
import torch
from torch import nn
from ..models.auto.modeling_auto import (
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
)
from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
logger = logging.get_logger(__name__)
@@ -46,7 +49,9 @@ class ImageSegmentationPipeline(Pipeline):
raise ValueError(f"The {self.__class__} is only available in PyTorch.")
requires_backends(self, "vision")
self.check_model_type(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING)
self.check_model_type(
dict(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items())
)
def _sanitize_parameters(self, **kwargs):
postprocess_kwargs = {}
@@ -77,16 +82,16 @@
Return:
A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a
dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
each image.
list of dictionaries, if the input is a list of several images, will return a list of lists of dictionaries
corresponding to each image.
The dictionaries contain the following keys:
- **label** (`str`) -- The class label identified by the model.
- **score** (`float`) -- The score attributed by the model for that label.
- **mask** (`str`) -- base64 string of a grayscale (single-channel) PNG image that contains mask
information. The PNG image has size (height, width) of the original image. Pixel values in the image are
either 0 or 255 (i.e. mask is absent VS mask is present).
- **mask** (`PIL.Image`) -- A PIL Image of size (height, width) matching the original image. Pixel values in
the image are in the range 0-255: 0 means the pixel is *not* part of the *label*, 255 means it definitely is.
- **score** (`float`, *optional*) -- The confidence the model attributes to the "object" described by the
label and the mask, when the model is able to estimate one (`None` otherwise).
"""
return super().__call__(*args, **kwargs)
@@ -104,40 +109,55 @@ class ImageSegmentationPipeline(Pipeline):
model_outputs["target_size"] = target_size
return model_outputs
def postprocess(self, model_outputs, threshold=0.9, mask_threshold=0.5):
raw_annotations = self.feature_extractor.post_process_segmentation(
model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=0.5
)
raw_annotation = raw_annotations[0]
raw_annotation["masks"] *= 255 # [0,1] -> [0,255] black and white pixels
raw_annotation["scores"] = raw_annotation["scores"].tolist()
raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
raw_annotation["masks"] = [self._get_mask_str(mask) for mask in raw_annotation["masks"].cpu().numpy()]
# {"scores": [...], ...} --> [{"score":x, ...}, ...]
keys = ["score", "label", "mask"]
annotation = [
dict(zip(keys, vals))
for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
]
def postprocess(self, model_outputs, raw_image=False, threshold=0.9, mask_threshold=0.5):
if hasattr(self.feature_extractor, "post_process_segmentation"):
# Panoptic
raw_annotations = self.feature_extractor.post_process_segmentation(
model_outputs, model_outputs["target_size"], threshold=threshold, mask_threshold=mask_threshold
)
raw_annotation = raw_annotations[0]
raw_annotation["masks"] *= 255 # [0,1] -> [0,255] black and white pixels
raw_annotation["scores"] = raw_annotation["scores"].tolist()
raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in raw_annotation["labels"]]
raw_annotation["masks"] = [
Image.fromarray(mask.cpu().numpy().astype(np.uint8), mode="L") for mask in raw_annotation["masks"]
]
# {"scores": [...], ...} --> [{"score":x, ...}, ...]
keys = ["score", "label", "mask"]
annotation = [
dict(zip(keys, vals))
for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["masks"])
]
else:
# Default logits
logits = model_outputs.logits
logits = logits.softmax(dim=1)
if len(logits.shape) != 4:
raise ValueError(f"Logits don't have expected dimensions, expected [1, N, H, W], got {logits.shape}")
batch_size, num_labels, height, width = logits.shape
expected_num_labels = len(self.model.config.id2label)
if num_labels != expected_num_labels:
raise ValueError(
f"Logits don't have expected dimensions, expected [1, {num_labels}, H, W], got {logits.shape}"
)
size = model_outputs["target_size"].squeeze(0).tolist()
logits_reshaped = nn.functional.interpolate(logits, size=size, mode="bilinear", align_corners=False)
classes = logits_reshaped.argmax(dim=1)[0]
annotation = []
for label_id in range(num_labels):
label = self.model.config.id2label[label_id]
mask = classes == label_id
mask_sum = mask.sum()
# Remove empty masks.
if mask_sum == 0:
continue
mask = Image.fromarray((mask * 255).cpu().numpy().astype(np.uint8), mode="L")
# Semantic segmentation does not output a global score for the mask
# so we don't attempt to compute one.
# XXX: We could send a mask with values between 0 and 255 instead
# of a binary mask, to let users recover the probabilities that
# the logits actually output.
annotation.append({"score": None, "label": label, "mask": mask})
return annotation
def _get_mask_str(self, mask: np.array) -> str:
"""
Turns mask numpy array into mask base64 str.
Args:
mask (`np.array`): Numpy array (with shape (height, width) of the original image) containing mask
information. Values in the array are either 0 or 255 (i.e. mask is absent VS mask is present).
Returns:
A base64 string of a single-channel PNG image that contains mask information.
"""
img = Image.fromarray(mask.astype(np.int8), mode="L")
with io.BytesIO() as out:
img.save(out, format="PNG")
png_string = out.getvalue()
return base64.b64encode(png_string).decode("utf-8")
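For readers skimming the diff, the semantic branch of `postprocess` boils down to the following standalone sketch (random logits stand in for a real model; labels and shapes are illustrative):

```python
import numpy as np
import torch
from PIL import Image
from torch import nn

id2label = {0: "background", 1: "cat", 2: "remote"}
logits = torch.randn(1, len(id2label), 32, 32)  # [batch, num_labels, h, w]
target_size = (480, 640)  # (height, width) of the original image

probs = logits.softmax(dim=1)
# No torchvision needed: F.interpolate upsamples the maps back to the image size.
upsampled = nn.functional.interpolate(probs, size=target_size, mode="bilinear", align_corners=False)
classes = upsampled.argmax(dim=1)[0]  # [height, width] map of label ids

annotation = []
for label_id, label in id2label.items():
    mask = classes == label_id
    if mask.sum() == 0:  # drop labels absent from the image
        continue
    pil_mask = Image.fromarray((mask * 255).cpu().numpy().astype(np.uint8), mode="L")
    annotation.append({"score": None, "label": label, "mask": pil_mask})
```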
@@ -383,6 +383,9 @@ MODEL_FOR_PRETRAINING_MAPPING = None
MODEL_FOR_QUESTION_ANSWERING_MAPPING = None
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = None
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
@@ -101,7 +101,11 @@ class SegformerModelTester:
if self.use_labels:
labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)
config = SegformerConfig(
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return SegformerConfig(
image_size=self.image_size,
num_channels=self.num_channels,
num_encoder_blocks=self.num_encoder_blocks,
@@ -114,8 +118,6 @@ class SegformerModelTester:
initializer_range=self.initializer_range,
)
return config, pixel_values, labels
def create_and_check_model(self, config, pixel_values, labels):
model = SegformerModel(config=config)
model.to(torch_device)
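The `get_config` split is what lets the pipeline tests build a tiny Segformer instead of downloading a checkpoint; roughly like this sketch (hyperparameters illustrative, not the tester's exact values):

```python
from transformers import SegformerConfig, SegformerForSemanticSegmentation

config = SegformerConfig(
    num_channels=3,
    num_encoder_blocks=4,
    depths=[1, 1, 1, 1],
    hidden_sizes=[8, 8, 16, 16],
    num_attention_heads=[1, 1, 1, 1],
    num_labels=3,
)
model = SegformerForSemanticSegmentation(config)  # randomly initialized, tiny, fast
```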
@@ -126,14 +126,14 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_
class ANY:
def __init__(self, _type):
self._type = _type
def __init__(self, *_types):
self._types = _types
def __eq__(self, other):
return isinstance(other, self._type)
return isinstance(other, self._types)
def __repr__(self):
return f"ANY({self._type.__name__})"
return f"ANY({', '.join(_type.__name__ for _type in self._types)})"
class PipelineTestCaseMeta(type):
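Why `ANY` grew multiple types, and why the assertions below keep it on the left-hand side; a self-contained sketch:

```python
from PIL import Image

class ANY:
    def __init__(self, *_types):
        self._types = _types

    def __eq__(self, other):
        # Matches any instance of the given types.
        return isinstance(other, self._types)

assert ANY(float, type(None)) == None  # scores may be None for semantic segmentation
assert ANY(float, type(None)) == 0.5

img = Image.new("L", (2, 2))
# PIL.Image defines its own __eq__, which returns False against ANY instead of
# NotImplemented, so the reflected comparison never runs; keep ANY on the left.
assert ANY(Image.Image) == img
assert not (img == ANY(Image.Image))
```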
@@ -15,10 +15,14 @@
import hashlib
import unittest
import datasets
from transformers import (
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
AutoFeatureExtractor,
AutoModelForImageSegmentation,
DetrForSegmentation,
ImageSegmentationPipeline,
is_vision_available,
pipeline,
@@ -46,12 +50,23 @@ else:
pass
def hashimage(image: Image.Image) -> str:
m = hashlib.md5(image.tobytes())
return m.hexdigest()
@require_vision
@require_timm
@require_torch
@is_pipeline_test
class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING
model_mapping = {
k: v
for k, v in (
list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []
)
+ (list(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items()) if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else [])
}
def get_test_pipeline(self, model, tokenizer, feature_extractor):
image_segmenter = ImageSegmentationPipeline(model=model, feature_extractor=feature_extractor)
@@ -62,34 +77,59 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
def run_pipeline_test(self, image_segmenter, examples):
outputs = image_segmenter("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
self.assertEqual(outputs, [{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12)
import datasets
self.assertIsInstance(outputs, list)
n = len(outputs)
self.assertGreater(n, 1)
# XXX: PIL.Image implements __eq__, which bypasses ANY, so we invert the
# comparison (expected == actual) to make it work
self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs)
dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", "image", split="test")
# RGBA
outputs = image_segmenter(dataset[0]["file"])
m = len(outputs)
self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
# LA
outputs = image_segmenter(dataset[1]["file"])
m = len(outputs)
self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
# L
outputs = image_segmenter(dataset[2]["file"])
m = len(outputs)
self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs)
if isinstance(image_segmenter.model, DetrForSegmentation):
# We need to test batch_size with images of the same size.
# Detr doesn't normalize the size of the images, meaning we can get
# 800x800 or 800x1200, so we cannot batch naively.
# We simply bail out on this.
batch_size = 1
else:
batch_size = 2
# 5 times the same image so the output shape is predictable
batch = [
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
"http://images.cocodataset.org/val2017/000000039769.jpg",
# RGBA
dataset[0]["file"],
# LA
dataset[1]["file"],
# L
dataset[2]["file"],
"./tests/fixtures/tests_samples/COCO/000000039769.png",
"./tests/fixtures/tests_samples/COCO/000000039769.png",
"./tests/fixtures/tests_samples/COCO/000000039769.png",
"./tests/fixtures/tests_samples/COCO/000000039769.png",
"./tests/fixtures/tests_samples/COCO/000000039769.png",
]
outputs = image_segmenter(batch, threshold=0.0)
outputs = image_segmenter(batch, threshold=0.0, batch_size=batch_size)
self.assertEqual(len(batch), len(outputs))
self.assertEqual({"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}, outputs[0][0])
self.assertEqual(len(outputs[0]), n)
self.assertEqual(
outputs,
[
[{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
[{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
[{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
[{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
[{"score": ANY(float), "label": ANY(str), "mask": ANY(str)}] * 12,
[{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
[{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
[{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
[{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
[{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n,
],
outputs,
f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}",
)
@require_tf
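The Detr caveat tested above, in isolation: feature-extracted images of different resolutions cannot be stacked into a single batch tensor, so `batch_size=1` is the only safe choice there. A minimal sketch:

```python
import torch

a = torch.zeros(3, 800, 800)   # one image after the feature extractor
b = torch.zeros(3, 800, 1200)  # another, at a different resolution
try:
    torch.stack([a, b])  # naive batching
except RuntimeError as err:
    print("cannot batch:", err)  # stack expects each tensor to be equal size
```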
@@ -108,7 +148,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.0)
for o in outputs:
# shortening by hashing
o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
@@ -116,12 +156,12 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
{
"score": 0.004,
"label": "LABEL_0",
"mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
"mask": "34eecd16bbfb0f476083ef947d81bf66",
},
{
"score": 0.004,
"label": "LABEL_0",
"mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
"mask": "34eecd16bbfb0f476083ef947d81bf66",
},
],
)
@@ -135,7 +175,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
)
for output in outputs:
for o in output:
o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
@@ -144,29 +184,54 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
{
"score": 0.004,
"label": "LABEL_0",
"mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
"mask": "34eecd16bbfb0f476083ef947d81bf66",
},
{
"score": 0.004,
"label": "LABEL_0",
"mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
"mask": "34eecd16bbfb0f476083ef947d81bf66",
},
],
[
{
"score": 0.004,
"label": "LABEL_0",
"mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
"mask": "34eecd16bbfb0f476083ef947d81bf66",
},
{
"score": 0.004,
"label": "LABEL_0",
"mask": "4276f7db4ca2983b2666f7e0c102d8186aed20be",
"mask": "34eecd16bbfb0f476083ef947d81bf66",
},
],
],
)
@require_torch
def test_small_model_pt_semantic(self):
model_id = "hf-internal-testing/tiny-random-beit-pipeline"
image_segmenter = pipeline(model=model_id)
outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
for o in outputs:
# shortening by hashing
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
[
{
"score": None,
"label": "LABEL_0",
"mask": "01245d8ad25d03f09493ca97965788ae",
},
{
"score": None,
"label": "LABEL_1",
"mask": "f741516de8d5196a2c830739b9ac1c8c",
},
],
)
@require_torch
@slow
def test_integration_torch_image_segmentation(self):
@@ -176,7 +241,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg")
for o in outputs:
o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
@@ -234,7 +299,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=threshold)
for o in outputs:
o["mask"] = hashlib.sha1(o["mask"].encode("UTF-8")).hexdigest()
o["mask"] = hashimage(o["mask"])
self.assertEqual(
nested_simplify(outputs, decimals=4),
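For reference, the mask fingerprinting used throughout these tests just hashes the raw pixel bytes, equivalent to this sketch of the `hashimage` helper:

```python
import hashlib

from PIL import Image

def hashimage(image: Image.Image) -> str:
    # Short, deterministic digest of the mask's raw pixel bytes.
    return hashlib.md5(image.tobytes()).hexdigest()

# E.g. fingerprinting a mask like those the pipeline returns:
mask = Image.new("L", (4, 4), color=255)
print(hashimage(mask))
```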