Unverified commit 0b294c23, authored by NielsRogge, committed by GitHub

[Conditional, Deformable DETR] Add postprocessing methods (#19709)



* Add postprocessing methods

* Update docs

* Add fix

* Add test

* Add test for deformable detr postprocessing

* Add post processing methods for segmentation

* Update code examples

* Add post_process to make the pipeline work

* Apply updates
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent 2e35bac4
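
For orientation before the diff itself: a minimal usage sketch of the object-detection postprocessing API this commit introduces. This is an editorial example, not part of the commit; the checkpoint name is only illustrative, and any Conditional or Deformable DETR detection checkpoint should work the same way.

import requests
import torch
from PIL import Image
from transformers import AutoFeatureExtractor, AutoModelForObjectDetection

checkpoint = "microsoft/conditional-detr-resnet-50"  # illustrative checkpoint name
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
model = AutoModelForObjectDetection.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post_process_object_detection rescales boxes to the original image size and applies the score
# threshold itself, so no manual `if score > 0.5` filtering is needed anymore
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 2) for c in box.tolist()])
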
@@ -37,9 +37,10 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng).
 [[autodoc]] ConditionalDetrFeatureExtractor
     - __call__
     - pad_and_create_pixel_mask
-    - post_process
-    - post_process_segmentation
-    - post_process_panoptic
+    - post_process_object_detection
+    - post_process_instance_segmentation
+    - post_process_semantic_segmentation
+    - post_process_panoptic_segmentation
 
 ## ConditionalDetrModel
@@ -38,9 +38,7 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr).
 [[autodoc]] DeformableDetrFeatureExtractor
     - __call__
     - pad_and_create_pixel_mask
-    - post_process
-    - post_process_segmentation
-    - post_process_panoptic
+    - post_process_object_detection
 
 ## DeformableDetrConfig
@@ -14,11 +14,9 @@
 # limitations under the License.
 """Feature extractor class for Conditional DETR."""
 
-import io
 import pathlib
 import warnings
-from collections import defaultdict
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import numpy as np
 from PIL import Image
@@ -104,21 +102,157 @@ def rgb_to_id(color):
     return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
 
-# Copied from transformers.models.detr.feature_extraction_detr.id_to_rgb
-def id_to_rgb(id_map):
-    if isinstance(id_map, np.ndarray):
-        id_map_copy = id_map.copy()
-        rgb_shape = tuple(list(id_map.shape) + [3])
-        rgb_map = np.zeros(rgb_shape, dtype=np.uint8)
-        for i in range(3):
-            rgb_map[..., i] = id_map_copy % 256
-            id_map_copy //= 256
-        return rgb_map
-    color = []
-    for _ in range(3):
-        color.append(id_map % 256)
-        id_map //= 256
-    return color
+# Copied from transformers.models.detr.feature_extraction_detr.binary_mask_to_rle
+def binary_mask_to_rle(mask):
+    """
+    Converts given binary mask of shape (height, width) to the run-length encoding (RLE) format.
+
+    Args:
+        mask (`torch.Tensor` or `numpy.array`):
+            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
+            segment_id or class_id.
+
+    Returns:
+        `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
+        format.
+    """
+    if is_torch_tensor(mask):
+        mask = mask.numpy()
+
+    pixels = mask.flatten()
+    pixels = np.concatenate([[0], pixels, [0]])
+    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
+    runs[1::2] -= runs[::2]
+    return [x for x in runs]
+
+
+# Copied from transformers.models.detr.feature_extraction_detr.convert_segmentation_to_rle
+def convert_segmentation_to_rle(segmentation):
+    """
+    Converts given segmentation map of shape (height, width) to the run-length encoding (RLE) format.
+
+    Args:
+        segmentation (`torch.Tensor` or `numpy.array`):
+            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
+
+    Returns:
+        `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
+    """
+    segment_ids = torch.unique(segmentation)
+
+    run_length_encodings = []
+    for idx in segment_ids:
+        mask = torch.where(segmentation == idx, 1, 0)
+        rle = binary_mask_to_rle(mask)
+        run_length_encodings.append(rle)
+
+    return run_length_encodings
+
+
+# Copied from transformers.models.detr.feature_extraction_detr.remove_low_and_no_objects
+def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
+    """
+    Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
+    `labels`.
+
+    Args:
+        masks (`torch.Tensor`):
+            A tensor of shape `(num_queries, height, width)`.
+        scores (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        labels (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        object_mask_threshold (`float`):
+            A number between 0 and 1 used to binarize the masks.
+
+    Raises:
+        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
+
+    Returns:
+        `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
+        < `object_mask_threshold`.
+    """
+    if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
+        raise ValueError("mask, scores and labels must have the same shape!")
+
+    to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
+
+    return masks[to_keep], scores[to_keep], labels[to_keep]
+
+
+# Copied from transformers.models.detr.feature_extraction_detr.check_segment_validity
+def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
+    # Get the mask associated with the k class
+    mask_k = mask_labels == k
+    mask_k_area = mask_k.sum()
+
+    # Compute the area of all the stuff in query k
+    original_area = (mask_probs[k] >= mask_threshold).sum()
+    mask_exists = mask_k_area > 0 and original_area > 0
+
+    # Eliminate disconnected tiny segments
+    if mask_exists:
+        area_ratio = mask_k_area / original_area
+        if not area_ratio.item() > overlap_mask_area_threshold:
+            mask_exists = False
+
+    return mask_exists, mask_k
+
+
+# Copied from transformers.models.detr.feature_extraction_detr.compute_segments
+def compute_segments(
+    mask_probs,
+    pred_scores,
+    pred_labels,
+    mask_threshold: float = 0.5,
+    overlap_mask_area_threshold: float = 0.8,
+    label_ids_to_fuse: Optional[Set[int]] = None,
+    target_size: Tuple[int, int] = None,
+):
+    height = mask_probs.shape[1] if target_size is None else target_size[0]
+    width = mask_probs.shape[2] if target_size is None else target_size[1]
+
+    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
+    segments: List[Dict] = []
+
+    if target_size is not None:
+        mask_probs = nn.functional.interpolate(
+            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
+        )[0]
+
+    current_segment_id = 0
+
+    # Weigh each mask by its prediction score
+    mask_probs *= pred_scores.view(-1, 1, 1)
+    mask_labels = mask_probs.argmax(0)  # [height, width]
+
+    # Keep track of instances of each class
+    stuff_memory_list: Dict[str, int] = {}
+    for k in range(pred_labels.shape[0]):
+        pred_class = pred_labels[k].item()
+        should_fuse = pred_class in label_ids_to_fuse
+
+        # Check if mask exists and large enough to be a segment
+        mask_exists, mask_k = check_segment_validity(
+            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
+        )
+
+        if mask_exists:
+            if pred_class in stuff_memory_list:
+                current_segment_id = stuff_memory_list[pred_class]
+            else:
+                current_segment_id += 1
+
+            # Add current object segment to final segmentation map
+            segmentation[mask_k] = current_segment_id
+            segment_score = round(pred_scores[k].item(), 6)
+            segments.append(
+                {
+                    "id": current_segment_id,
+                    "label_id": pred_class,
+                    "was_fused": should_fuse,
+                    "score": segment_score,
+                }
+            )
+            if should_fuse:
+                stuff_memory_list[pred_class] = current_segment_id
+
+    return segmentation, segments
+
+
 class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
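
A quick editorial illustration (not part of the commit) of what the RLE helpers above produce. The snippet inlines the same four lines that `binary_mask_to_rle` applies to a NumPy mask, so the output reads as pairs of (run start in the zero-padded, flattened mask, run length) for each run of ones.

import numpy as np

# Toy 2x4 binary mask; flatten() is row-major, so runs are read row by row
mask = np.array([[0, 1, 1, 0], [0, 0, 1, 1]])

pixels = mask.flatten()
pixels = np.concatenate([[0], pixels, [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]
print([int(x) for x in runs])  # [2, 2, 7, 2] -> a run of two 1s starting at position 2, another starting at position 7
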
@@ -128,7 +262,6 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
     should refer to this superclass for more information regarding those methods.
 
-
     Args:
         format (`str`, *optional*, defaults to `"coco_detection"`):
             Data format of the annotations. One of "coco_detection" or "coco_panoptic".
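
For reference, the two annotation formats named in the docstring above are chosen at construction time. A minimal, illustrative instantiation (editorial, not part of the commit):

from transformers import ConditionalDetrFeatureExtractor

# "coco_detection" expects bounding-box annotations, "coco_panoptic" expects panoptic segmentation annotations
detection_extractor = ConditionalDetrFeatureExtractor(format="coco_detection")
panoptic_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic")
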
@@ -691,25 +824,27 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         return encoded_inputs
 
+    # POSTPROCESSING METHODS
+    # inspired by https://github.com/Atten4Vis/conditionalDETR/blob/master/models/conditional_detr.py#L258
     def post_process(self, outputs, target_sizes):
         """
-        Args:
         Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only
         supports PyTorch.
+
+        Args:
             outputs ([`ConditionalDetrObjectDetectionOutput`]):
                 Raw outputs of the model.
             target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                 Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
                 image size (before any data augmentation). For visualization, this should be the image size after data
                 augment, but before padding.
+
         Returns:
             `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
             in the batch as predicted by the model.
         """
+        warnings.warn(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection`",
+            FutureWarning,
+        )
+
         out_logits, out_bbox = outputs.logits, outputs.pred_boxes
 
         if len(out_logits) != len(target_sizes):
@@ -734,240 +869,283 @@ class ConditionalDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         return results
 
-    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->ConditionalDetr
-    def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
-        """
-        Converts the output of [`ConditionalDetrForSegmentation`] into image segmentation predictions. Only supports
-        PyTorch.
-
-        Parameters:
-            outputs ([`ConditionalDetrSegmentationOutput`]):
-                Raw outputs of the model.
-            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
-                Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
-            threshold (`float`, *optional*, defaults to 0.9):
-                Threshold to use to filter out queries.
-            mask_threshold (`float`, *optional*, defaults to 0.5):
-                Threshold to use when turning the predicted masks into binary values.
-
-        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
-            in the batch as predicted by the model.
-        """
-        warnings.warn(
-            "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
-            " `post_process_semantic_segmentation`.",
-            FutureWarning,
-        )
-        out_logits, raw_masks = outputs.logits, outputs.pred_masks
-        preds = []
-
-        def to_tuple(tup):
-            if isinstance(tup, tuple):
-                return tup
-            return tuple(tup.cpu().tolist())
-
-        for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
-            # we filter empty queries and detection below threshold
-            scores, labels = cur_logits.softmax(-1).max(-1)
-            keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold)
-            cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
-            cur_scores = cur_scores[keep]
-            cur_classes = cur_classes[keep]
-            cur_masks = cur_masks[keep]
-            cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
-            cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
-
-            predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks}
-            preds.append(predictions)
-        return preds
-
-    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->ConditionalDetr
-    def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
-        """
-        Converts the output of [`ConditionalDetrForSegmentation`] into actual instance segmentation predictions. Only
-        supports PyTorch.
-
-        Args:
-            results (`List[Dict]`):
-                Results list obtained by [`~ConditionalDetrFeatureExtractor.post_process`], to which "masks" results
-                will be added.
-            outputs ([`ConditionalDetrSegmentationOutput`]):
-                Raw outputs of the model.
-            orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
-                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
-                image size (before any data augmentation).
-            max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
-                Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
-                original image size (before any data augmentation).
-            threshold (`float`, *optional*, defaults to 0.5):
-                Threshold to use when turning the predicted masks into binary values.
-
-        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an
-            image in the batch as predicted by the model.
-        """
-        warnings.warn(
-            "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
-            " `post_process_instance_segmentation`.",
-            FutureWarning,
-        )
-
-        if len(orig_target_sizes) != len(max_target_sizes):
-            raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
-        max_h, max_w = max_target_sizes.max(0)[0].tolist()
-        outputs_masks = outputs.pred_masks.squeeze(2)
-        outputs_masks = nn.functional.interpolate(
-            outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
-        )
-        outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
-
-        for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
-            img_h, img_w = t[0], t[1]
-            results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
-            results[i]["masks"] = nn.functional.interpolate(
-                results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
-            ).byte()
-
-        return results
-
-    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->ConditionalDetr
-    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
-        """
-        Converts the output of [`ConditionalDetrForSegmentation`] into actual panoptic predictions. Only supports
-        PyTorch.
-
-        Parameters:
-            outputs ([`ConditionalDetrSegmentationOutput`]):
-                Raw outputs of the model.
-            processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
-                Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
-                augmentation but before batching.
-            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*):
-                Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to
-                None, it will default to the `processed_sizes`.
-            is_thing_map (`torch.Tensor` of shape `(batch_size, 2)`, *optional*):
-                Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
-                If not set, defaults to the `is_thing_map` of COCO panoptic.
-            threshold (`float`, *optional*, defaults to 0.85):
-                Threshold to use to filter out queries.
-
-        Returns:
-            `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
-            an image in the batch as predicted by the model.
-        """
-        warnings.warn(
-            "`post_process_panoptic is deprecated and will be removed in v5 of Transformers, please use"
-            " `post_process_panoptic_segmentation`.",
-            FutureWarning,
-        )
-        if target_sizes is None:
-            target_sizes = processed_sizes
-        if len(processed_sizes) != len(target_sizes):
-            raise ValueError("Make sure to pass in as many processed_sizes as target_sizes")
-
-        if is_thing_map is None:
-            # default to is_thing_map of COCO panoptic
-            is_thing_map = {i: i <= 90 for i in range(201)}
-
-        out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes
-        if not len(out_logits) == len(raw_masks) == len(target_sizes):
-            raise ValueError(
-                "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks"
-            )
-        preds = []
-
-        def to_tuple(tup):
-            if isinstance(tup, tuple):
-                return tup
-            return tuple(tup.cpu().tolist())
-
-        for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
-            out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
-        ):
-            # we filter empty queries and detection below threshold
-            scores, labels = cur_logits.softmax(-1).max(-1)
-            keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold)
-            cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
-            cur_scores = cur_scores[keep]
-            cur_classes = cur_classes[keep]
-            cur_masks = cur_masks[keep]
-            cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
-            cur_boxes = center_to_corners_format(cur_boxes[keep])
-
-            h, w = cur_masks.shape[-2:]
-            if len(cur_boxes) != len(cur_classes):
-                raise ValueError("Not as many boxes as there are classes")
-
-            # It may be that we have several predicted masks for the same stuff class.
-            # In the following, we track the list of masks ids for each stuff class (they are merged later on)
-            cur_masks = cur_masks.flatten(1)
-            stuff_equiv_classes = defaultdict(lambda: [])
-            for k, label in enumerate(cur_classes):
-                if not is_thing_map[label.item()]:
-                    stuff_equiv_classes[label.item()].append(k)
-
-            def get_ids_area(masks, scores, dedup=False):
-                # This helper function creates the final panoptic segmentation image
-                # It also returns the area of the masks that appears on the image
-
-                m_id = masks.transpose(0, 1).softmax(-1)
-
-                if m_id.shape[-1] == 0:
-                    # We didn't detect any mask :(
-                    m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
-                else:
-                    m_id = m_id.argmax(-1).view(h, w)
-
-                if dedup:
-                    # Merge the masks corresponding to the same stuff class
-                    for equiv in stuff_equiv_classes.values():
-                        if len(equiv) > 1:
-                            for eq_id in equiv:
-                                m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
-
-                final_h, final_w = to_tuple(target_size)
-
-                seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy()))
-                seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)
-
-                np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
-                np_seg_img = np_seg_img.view(final_h, final_w, 3)
-                np_seg_img = np_seg_img.numpy()
-
-                m_id = torch.from_numpy(rgb_to_id(np_seg_img))
-
-                area = []
-                for i in range(len(scores)):
-                    area.append(m_id.eq(i).sum().item())
-                return area, seg_img
-
-            area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
-            if cur_classes.numel() > 0:
-                # We know filter empty masks as long as we find some
-                while True:
-                    filtered_small = torch.as_tensor(
-                        [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device
-                    )
-                    if filtered_small.any().item():
-                        cur_scores = cur_scores[~filtered_small]
-                        cur_classes = cur_classes[~filtered_small]
-                        cur_masks = cur_masks[~filtered_small]
-                        area, seg_img = get_ids_area(cur_masks, cur_scores)
-                    else:
-                        break
-            else:
-                cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device)
-
-            segments_info = []
-            for i, a in enumerate(area):
-                cat = cur_classes[i].item()
-                segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a})
-            del cur_classes
-
-            with io.BytesIO() as out:
-                seg_img.save(out, format="PNG")
-                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
-            preds.append(predictions)
-        return preds
+    # Copied from transformers.models.deformable_detr.feature_extraction_deformable_detr.DeformableDetrFeatureExtractor.post_process_object_detection with DeformableDetr->ConditionalDetr
+    def post_process_object_detection(
+        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
+    ):
+        """
+        Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only
+        supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*, defaults to `None`):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if target_sizes is not None:
+            if len(out_logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+        prob = out_logits.sigmoid()
+        topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
+        scores = topk_values
+        topk_boxes = topk_indexes // out_logits.shape[2]
+        labels = topk_indexes % out_logits.shape[2]
+        boxes = center_to_corners_format(out_bbox)
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        if isinstance(target_sizes, List):
+            img_h = torch.Tensor([i[0] for i in target_sizes])
+            img_w = torch.Tensor([i[1] for i in target_sizes])
+        else:
+            img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = []
+        for s, l, b in zip(scores, labels, boxes):
+            score = s[s > threshold]
+            label = l[s > threshold]
+            box = b[s > threshold]
+            results.append({"scores": score, "labels": label, "boxes": box})
+
+        return results
+
+    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_semantic_segmentation with Detr->ConditionalDetr
+    def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
+        """
+        Converts the output of [`ConditionalDetrForSegmentation`] into semantic segmentation maps. Only supports
+        PyTorch.
+
+        Args:
+            outputs ([`ConditionalDetrForSegmentation`]):
+                Raw outputs of the model.
+            target_sizes (`List[Tuple[int, int]]`, *optional*, defaults to `None`):
+                A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
+                batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `List[torch.Tensor]`:
+                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
+                corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
+                `torch.Tensor` correspond to a semantic class id.
+        """
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        # Remove the null class `[..., :-1]`
+        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
+        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Semantic segmentation logits of shape (batch_size, num_classes, height, width)
+        segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
+        batch_size = class_queries_logits.shape[0]
+
+        # Resize logits and compute semantic segmentation maps
+        if target_sizes is not None:
+            if batch_size != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+            semantic_segmentation = []
+            for idx in range(batch_size):
+                resized_logits = nn.functional.interpolate(
+                    segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
+                )
+                semantic_map = resized_logits[0].argmax(dim=0)
+                semantic_segmentation.append(semantic_map)
+        else:
+            semantic_segmentation = segmentation.argmax(dim=1)
+            semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
+
+        return semantic_segmentation
+
+    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance_segmentation with Detr->ConditionalDetr
+    def post_process_instance_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        target_sizes: Optional[List[Tuple[int, int]]] = None,
+        return_coco_annotation: Optional[bool] = False,
+    ) -> List[Dict]:
+        """
+        Converts the output of [`ConditionalDetrForSegmentation`] into instance segmentation predictions. Only
+        supports PyTorch.
+
+        Args:
+            outputs ([`ConditionalDetrForSegmentation`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            target_sizes (`List[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`Tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction. If left to None, predictions will not be resized.
+            return_coco_annotation (`bool`, *optional*):
+                Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE)
+                format.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
+              `List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
+              `True`. Set to `None` if no mask if found above `threshold`.
+            - **segments_info** -- A dictionary that contains additional information on each segment.
+                - **id** -- An integer representing the `segment_id`.
+                - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
+                - **score** -- Prediction score of segment with `segment_id`.
+        """
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        mask_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Predicted label and score of each query (batch_size, num_queries)
+        pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
+
+        # Loop over items in batch size
+        results: List[Dict[str, TensorType]] = []
+
+        for i in range(batch_size):
+            mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
+                mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs_item.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            # Get segmentation map and segment information of batch item
+            target_size = target_sizes[i] if target_sizes is not None else None
+            segmentation, segments = compute_segments(
+                mask_probs_item,
+                pred_scores_item,
+                pred_labels_item,
+                mask_threshold,
+                overlap_mask_area_threshold,
+                target_size,
+            )
+
+            # Return segmentation map in run-length encoding (RLE) format
+            if return_coco_annotation:
+                segmentation = convert_segmentation_to_rle(segmentation)
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
+
+    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic_segmentation with Detr->ConditionalDetr
+    def post_process_panoptic_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        label_ids_to_fuse: Optional[Set[int]] = None,
+        target_sizes: Optional[List[Tuple[int, int]]] = None,
+    ) -> List[Dict]:
+        """
+        Converts the output of [`ConditionalDetrForSegmentation`] into image panoptic segmentation predictions. Only
+        supports PyTorch.
+
+        Args:
+            outputs ([`ConditionalDetrForSegmentation`]):
+                The outputs from [`ConditionalDetrForSegmentation`].
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            label_ids_to_fuse (`Set[int]`, *optional*):
+                The labels in this state will have all their instances be fused together. For instance we could say
+                there can only be one sky in an image, but several persons, so the label ID for sky would be in that
+                set, but not the one for person.
+            target_sizes (`List[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`Tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction in batch. If left to None, predictions will not be
+                resized.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or
+              `None` if no mask if found above `threshold`. If `target_sizes` is specified, segmentation is resized to
+              the corresponding `target_sizes` entry.
+            - **segments_info** -- A dictionary that contains additional information on each segment.
+                - **id** -- an integer representing the `segment_id`.
+                - **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.
+                - **was_fused** -- a boolean, `True` if `label_id` was in `label_ids_to_fuse`, `False` otherwise.
+                  Multiple instances of the same class / label were fused and assigned a single `segment_id`.
+                - **score** -- Prediction score of segment with `segment_id`.
+        """
+
+        if label_ids_to_fuse is None:
+            warnings.warn("`label_ids_to_fuse` unset. No instance will be fused.")
+            label_ids_to_fuse = set()
+
+        class_queries_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        masks_queries_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        batch_size = class_queries_logits.shape[0]
+        num_labels = class_queries_logits.shape[-1] - 1
+
+        mask_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Predicted label and score of each query (batch_size, num_queries)
+        pred_scores, pred_labels = nn.functional.softmax(class_queries_logits, dim=-1).max(-1)
+
+        # Loop over items in batch size
+        results: List[Dict[str, TensorType]] = []
+
+        for i in range(batch_size):
+            mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(
+                mask_probs[i], pred_scores[i], pred_labels[i], threshold, num_labels
+            )
+
+            # No mask found
+            if mask_probs_item.shape[0] <= 0:
+                height, width = target_sizes[i] if target_sizes is not None else mask_probs_item.shape[1:]
+                segmentation = torch.zeros((height, width)) - 1
+                results.append({"segmentation": segmentation, "segments_info": []})
+                continue
+
+            # Get segmentation map and segment information of batch item
+            target_size = target_sizes[i] if target_sizes is not None else None
+            segmentation, segments = compute_segments(
+                mask_probs_item,
+                pred_scores_item,
+                pred_labels_item,
+                mask_threshold,
+                overlap_mask_area_threshold,
+                label_ids_to_fuse,
+                target_size,
+            )
+
+            results.append({"segmentation": segmentation, "segments_info": segments})
+        return results
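
To sketch the shape of what the new segmentation postprocessing returns, here is an editorial example (not part of the commit). `SimpleNamespace` merely stands in for the model's segmentation output object, which exposes `logits` and `pred_masks` as used above; with random tensors most queries fall below the threshold, but the returned structure is the same.

import torch
from types import SimpleNamespace
from transformers import ConditionalDetrFeatureExtractor

batch_size, num_queries, num_classes, height, width = 1, 5, 10, 32, 32
dummy_outputs = SimpleNamespace(
    logits=torch.randn(batch_size, num_queries, num_classes + 1),
    pred_masks=torch.randn(batch_size, num_queries, height, width),
)

feature_extractor = ConditionalDetrFeatureExtractor()
result = feature_extractor.post_process_panoptic_segmentation(
    dummy_outputs, threshold=0.5, target_sizes=[(64, 64)]
)[0]
print(result["segmentation"].shape)  # torch.Size([64, 64]); pixels hold segment ids, or -1 where nothing was kept
print(result["segments_info"])       # list of {"id", "label_id", "was_fused", "score"} dicts (may be empty here)
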
@@ -1699,15 +1699,15 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel):
         >>> # convert outputs (bounding boxes and class logits) to COCO API
         >>> target_sizes = torch.tensor([image.size[::-1]])
-        >>> results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]
+        >>> results = feature_extractor.post_process_object_detection(
+        ...     outputs, threshold=0.5, target_sizes=target_sizes
+        ... )[0]
         >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
         ...     box = [round(i, 2) for i in box.tolist()]
-        ...     # let's only keep detections with score > 0.5
-        ...     if score > 0.5:
-        ...         print(
-        ...             f"Detected {model.config.id2label[label.item()]} with confidence "
-        ...             f"{round(score.item(), 3)} at location {box}"
-        ...         )
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
         Detected remote with confidence 0.833 at location [38.31, 72.1, 177.63, 118.45]
         Detected cat with confidence 0.831 at location [9.2, 51.38, 321.13, 469.0]
         Detected cat with confidence 0.804 at location [340.3, 16.85, 642.93, 370.95]
@@ -1897,17 +1897,13 @@ class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel):
         >>> # forward pass
         >>> outputs = model(**inputs)
 
-        >>> # use the `post_process_panoptic` method of `ConditionalDetrFeatureExtractor` to convert to COCO format
-        >>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0)
-        >>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0]
-
-        >>> # the segmentation is stored in a special-format png
-        >>> panoptic_seg = Image.open(io.BytesIO(result["png_string"]))
-        >>> panoptic_seg = numpy.array(panoptic_seg, dtype=numpy.uint8)
-        >>> # retrieve the ids corresponding to each mask
-        >>> panoptic_seg_id = rgb_to_id(panoptic_seg)
-        >>> panoptic_seg_id.shape
-        (800, 1066)
+        >>> # Use the `post_process_panoptic_segmentation` method of `ConditionalDetrFeatureExtractor` to retrieve post-processed panoptic segmentation maps
+        >>> # Segmentation results are returned as a list of dictionaries
+        >>> result = feature_extractor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])
+
+        >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
+        >>> panoptic_seg = result[0]["segmentation"]
+        >>> # Get prediction score and segment_id to class_id mapping of each segment
+        >>> panoptic_segments_info = result[0]["segments_info"]
         ```"""
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -14,11 +14,9 @@
 # limitations under the License.
 """Feature extractor class for Deformable DETR."""
 
-import io
 import pathlib
 import warnings
-from collections import defaultdict
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from PIL import Image
@@ -707,6 +705,12 @@ class DeformableDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
             `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
             in the batch as predicted by the model.
         """
+        warnings.warn(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection`.",
+            FutureWarning,
+        )
+
         out_logits, out_bbox = outputs.logits, outputs.pred_boxes
 
         if len(out_logits) != len(target_sizes):
@@ -731,240 +735,56 @@ class DeformableDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         return results
 
-    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->DeformableDetr
-    def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
-        ...  # body identical to the deprecated ConditionalDetr method of the same name removed above, with DeformableDetr in place of ConditionalDetr
-
-    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->DeformableDetr
-    def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
-        ...  # body identical to the deprecated ConditionalDetr method of the same name removed above, with DeformableDetr in place of ConditionalDetr
-
-    # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->DeformableDetr
-    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
-        ...  # body identical to the deprecated ConditionalDetr method of the same name removed above, with DeformableDetr in place of ConditionalDetr
+    def post_process_object_detection(
+        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
+    ):
+        """
+        Converts the output of [`DeformableDetrForObjectDetection`] into the format expected by the COCO api. Only
+        supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*, defaults to `None`):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if target_sizes is not None:
+            if len(out_logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+        prob = out_logits.sigmoid()
+        topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
+        scores = topk_values
+        topk_boxes = topk_indexes // out_logits.shape[2]
+        labels = topk_indexes % out_logits.shape[2]
+        boxes = center_to_corners_format(out_bbox)
+        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        if isinstance(target_sizes, List):
+            img_h = torch.Tensor([i[0] for i in target_sizes])
+            img_w = torch.Tensor([i[1] for i in target_sizes])
+        else:
+            img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = []
+        for s, l, b in zip(scores, labels, boxes):
+            score = s[s > threshold]
+            label = l[s > threshold]
+            box = b[s > threshold]
+            results.append({"scores": score, "labels": label, "boxes": box})
+
+        return results
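
A small editorial check of the flattened top-k trick used in `post_process_object_detection` above: indices taken over the flattened `(num_queries * num_classes)` scores decompose back into a query index and a class index with integer division and modulo.

import torch

num_queries, num_classes = 4, 3
logits = torch.randn(1, num_queries, num_classes)

prob = logits.sigmoid()
topk_values, topk_indexes = torch.topk(prob.view(1, -1), k=5, dim=1)
query_indices = topk_indexes // num_classes  # which query each of the top detections comes from
class_indices = topk_indexes % num_classes   # which class that detection is
print(query_indices.tolist(), class_indices.tolist())
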
@@ -236,8 +236,8 @@ class DeformableDetrObjectDetectionOutput(ModelOutput):
     pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
         Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
         values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
-        possible padding). You can use [`~AutoFeatureExtractor.post_process`] to retrieve the unnormalized bounding
-        boxes.
+        possible padding). You can use [`~AutoFeatureExtractor.post_process_object_detection`] to retrieve the
+        unnormalized bounding boxes.
     auxiliary_outputs (`list[Dict]`, *optional*):
         Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
         and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
@@ -1878,15 +1878,15 @@ class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
         >>> # convert outputs (bounding boxes and class logits) to COCO API
         >>> target_sizes = torch.tensor([image.size[::-1]])
-        >>> results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]
+        >>> results = feature_extractor.post_process_object_detection(
+        ...     outputs, threshold=0.5, target_sizes=target_sizes
+        ... )[0]
         >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
         ...     box = [round(i, 2) for i in box.tolist()]
-        ...     # let's only keep detections with score > 0.5
-        ...     if score > 0.5:
-        ...         print(
-        ...             f"Detected {model.config.id2label[label.item()]} with confidence "
-        ...             f"{round(score.item(), 3)} at location {box}"
-        ...         )
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
         Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78]
         Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25]
         Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25]
@@ -878,7 +878,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         """
         Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
 
-        Parameters:
+        Args:
             outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
             target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
@@ -974,7 +974,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         """
         Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch.
 
-        Parameters:
+        Args:
             outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
             processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
@@ -1165,13 +1165,15 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
         """
-        Args:
         Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
+
+        Args:
             outputs ([`DetrForSegmentation`]):
                 Raw outputs of the model.
             target_sizes (`List[Tuple[int, int]]`, *optional*, defaults to `None`):
                 A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
                 batch. If left to None, predictions will not be resized.
+
         Returns:
             `List[torch.Tensor]`:
                 A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
@@ -1219,8 +1221,9 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         return_coco_annotation: Optional[bool] = False,
     ) -> List[Dict]:
         """
-        Args:
         Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch.
+
+        Args:
             outputs ([`DetrForSegmentation`]):
                 Raw outputs of the model.
             threshold (`float`, *optional*, defaults to 0.5):
...@@ -1236,6 +1239,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): ...@@ -1236,6 +1239,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
return_coco_annotation (`bool`, *optional*): return_coco_annotation (`bool`, *optional*):
Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE) Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE)
format. format.
Returns: Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or - **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
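Continuing the sketch above (same `feature_extractor`, `image`, and `outputs`), the instance-segmentation variant returns per-image dictionaries rather than bare tensors; the threshold value below is arbitrary:

instance_results = feature_extractor.post_process_instance_segmentation(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)[0]
print(instance_results["segmentation"].shape)  # (height, width) map of segment ids
print(instance_results["segments_info"][:2])   # per-segment metadata (id, label id, score)

# With return_coco_annotation=True, masks are returned as COCO run-length encodings instead
rle_results = feature_extractor.post_process_instance_segmentation(
    outputs, threshold=0.5, return_coco_annotation=True
)[0]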
...@@ -1301,9 +1305,10 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): ...@@ -1301,9 +1305,10 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
target_sizes: Optional[List[Tuple[int, int]]] = None, target_sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Dict]: ) -> List[Dict]:
""" """
Args:
Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports
PyTorch. PyTorch.
Args:
outputs ([`DetrForSegmentation`]): outputs ([`DetrForSegmentation`]):
The outputs from [`DetrForSegmentation`]. The outputs from [`DetrForSegmentation`].
threshold (`float`, *optional*, defaults to 0.5): threshold (`float`, *optional*, defaults to 0.5):
...@@ -1321,6 +1326,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): ...@@ -1321,6 +1326,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
final size (height, width) of each prediction in batch. If left to None, predictions will not be final size (height, width) of each prediction in batch. If left to None, predictions will not be
resized. resized.
Returns: Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys: `List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or - **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or
......
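For the panoptic variant, again reusing `outputs` and `model` from the first sketch: `label_ids_to_fuse` names the "stuff" classes whose instances get merged, and the value used here is a placeholder assumption.

panoptic_results = feature_extractor.post_process_panoptic_segmentation(
    outputs, threshold=0.5, label_ids_to_fuse={0}, target_sizes=[image.size[::-1]]
)[0]
panoptic_map = panoptic_results["segmentation"]  # (height, width) tensor of segment ids
for segment in panoptic_results["segments_info"]:
    print(segment["id"], model.config.id2label[segment["label_id"]], segment["score"])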
...@@ -699,53 +699,6 @@ class YolosFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin) ...@@ -699,53 +699,6 @@ class YolosFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin)
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
return results return results
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation
def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
"""
Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
Parameters:
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
threshold (`float`, *optional*, defaults to 0.9):
Threshold to use to filter out queries.
mask_threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
in the batch as predicted by the model.
"""
warnings.warn(
"`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_semantic_segmentation`.",
FutureWarning,
)
out_logits, raw_masks = outputs.logits, outputs.pred_masks
preds = []
def to_tuple(tup):
if isinstance(tup, tuple):
return tup
return tuple(tup.cpu().tolist())
for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
# we filter empty queries and detection below threshold
scores, labels = cur_logits.softmax(-1).max(-1)
keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold)
cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
cur_scores = cur_scores[keep]
cur_classes = cur_classes[keep]
cur_masks = cur_masks[keep]
cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks}
preds.append(predictions)
return preds
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_object_detection with Detr->Yolos # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_object_detection with Detr->Yolos
def post_process_object_detection( def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
......
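With the deprecated `post_process_segmentation` removed from `YolosFeatureExtractor` above, object-detection outputs are converted with `post_process_object_detection`. A rough sketch, reusing `image` from the earlier example; the checkpoint name is an assumption:

import torch
from transformers import YolosFeatureExtractor, YolosForObjectDetection

feature_extractor = YolosFeatureExtractor.from_pretrained("hustvl/yolos-small")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small")

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

detections = feature_extractor.post_process_object_detection(
    outputs, threshold=0.9, target_sizes=[image.size[::-1]]
)[0]
for score, label, box in zip(detections["scores"], detections["labels"], detections["boxes"]):
    # boxes come back as absolute (x_min, y_min, x_max, y_max) coordinates in the original image
    print(round(score.item(), 3), model.config.id2label[label.item()], box.tolist())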
...@@ -303,7 +303,6 @@ class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, uni ...@@ -303,7 +303,6 @@ class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, uni
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
# encode them # encode them
# TODO replace by .from_pretrained microsoft/conditional-detr-resnet-50-panoptic
feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic") feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic")
encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
......
...@@ -492,6 +492,7 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase): ...@@ -492,6 +492,7 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
with torch.no_grad(): with torch.no_grad():
outputs = model(pixel_values, pixel_mask) outputs = model(pixel_values, pixel_mask)
# verify logits + box predictions
expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels)) expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels))
self.assertEqual(outputs.logits.shape, expected_shape_logits) self.assertEqual(outputs.logits.shape, expected_shape_logits)
expected_slice_logits = torch.tensor( expected_slice_logits = torch.tensor(
...@@ -505,3 +506,16 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase): ...@@ -505,3 +506,16 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
[[0.7733, 0.6576, 0.4496], [0.5171, 0.1184, 0.9094], [0.8846, 0.5647, 0.2486]] [[0.7733, 0.6576, 0.4496], [0.5171, 0.1184, 0.9094], [0.8846, 0.5647, 0.2486]]
).to(torch_device) ).to(torch_device)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
# verify postprocessing
results = feature_extractor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.8330, 0.8313, 0.8039, 0.6829, 0.5355])
expected_labels = [75, 17, 17, 75, 63]
expected_slice_boxes = torch.tensor([38.3089, 72.1022, 177.6293, 118.4512])
self.assertEqual(len(results["scores"]), 5)
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
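The new assertions above rely on a `feature_extractor`, `model`, and `image` set up earlier in the test, outside this hunk. Roughly, that setup looks like the following sketch; the checkpoint name and fixture path are assumptions, not shown in the diff:

import torch
from PIL import Image
from transformers import ConditionalDetrFeatureExtractor, ConditionalDetrForObjectDetection

feature_extractor = ConditionalDetrFeatureExtractor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")  # assumed test fixture
encoding = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)

results = feature_extractor.post_process_object_detection(
    outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
print(results["scores"], results["labels"], results["boxes"][0])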
...@@ -565,6 +565,19 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase): ...@@ -565,6 +565,19 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4)) self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
# verify postprocessing
results = feature_extractor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382])
expected_labels = [17, 17, 75, 75, 63]
expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841])
self.assertEqual(len(results["scores"]), 5)
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
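The Deformable DETR test mirrors the Conditional DETR one above. One detail worth illustrating, per the `post_process_object_detection` signature, is that `target_sizes` can also be a `(batch_size, 2)` tensor of (height, width) pairs, which is convenient for batched inference; the checkpoint name and the two-image batch are assumptions.

import torch
from transformers import DeformableDetrFeatureExtractor, DeformableDetrForObjectDetection

feature_extractor = DeformableDetrFeatureExtractor.from_pretrained("SenseTime/deformable-detr")
model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")

images = [image, image]  # placeholder batch reusing the test image from the sketch above
encoding = feature_extractor(images=images, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)

target_sizes = torch.tensor([img.size[::-1] for img in images])  # (batch_size, 2), (height, width)
results = feature_extractor.post_process_object_detection(
    outputs, threshold=0.3, target_sizes=target_sizes
)
print(len(results), results[0]["boxes"].shape)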
def test_inference_object_detection_head_with_box_refine_two_stage(self): def test_inference_object_detection_head_with_box_refine_two_stage(self):
model = DeformableDetrForObjectDetection.from_pretrained( model = DeformableDetrForObjectDetection.from_pretrained(
"SenseTime/deformable-detr-with-box-refine-two-stage" "SenseTime/deformable-detr-with-box-refine-two-stage"
......