Unverified Commit 0b294c23 authored by NielsRogge, committed by GitHub

[Conditional, Deformable DETR] Add postprocessing methods (#19709)



* Add postprocessing methods

* Update docs

* Add fix

* Add test

* Add test for deformable detr postprocessing

* Add post processing methods for segmentation

* Update code examples

* Add post_process to make the pipeline work

* Apply updates
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent 2e35bac4
......@@ -37,9 +37,10 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The o
[[autodoc]] ConditionalDetrFeatureExtractor
- __call__
- pad_and_create_pixel_mask
- post_process
- post_process_segmentation
- post_process_panoptic
- post_process_object_detection
- post_process_instance_segmentation
- post_process_semantic_segmentation
- post_process_panoptic_segmentation
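The post-processing methods listed above replace the deprecated `post_process*` variants. A minimal usage sketch for the object-detection case; the checkpoint name and image URL below are illustrative and not part of this diff:

```python
import requests
import torch
from PIL import Image

from transformers import AutoFeatureExtractor, ConditionalDetrForObjectDetection

checkpoint = "microsoft/conditional-detr-resnet-50"  # illustrative checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
model = ConditionalDetrForObjectDetection.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = feature_extractor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# rescale boxes to the original (height, width) and keep detections scoring above the threshold
results = feature_extractor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 2) for c in box.tolist()])
```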
## ConditionalDetrModel
......
......@@ -38,9 +38,7 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
[[autodoc]] DeformableDetrFeatureExtractor
- __call__
- pad_and_create_pixel_mask
- post_process
- post_process_segmentation
- post_process_panoptic
- post_process_object_detection
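For Deformable DETR only `post_process_object_detection` is exposed above. A self-contained sketch of its contract using fake model outputs; the tensor shapes and the `SimpleNamespace` stand-in are assumptions made for illustration only:

```python
from types import SimpleNamespace

import torch

from transformers import DeformableDetrFeatureExtractor

feature_extractor = DeformableDetrFeatureExtractor()

# Fake raw outputs with the shapes the model produces: per-query class logits and
# normalized (center_x, center_y, width, height) boxes; real code would take them from the model.
outputs = SimpleNamespace(
    logits=torch.randn(1, 300, 91),
    pred_boxes=torch.rand(1, 300, 4),
)

# target_sizes holds the original (height, width) per image; boxes are rescaled to it
results = feature_extractor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=torch.tensor([[480, 640]])
)
print(results[0].keys())  # dict_keys(['scores', 'labels', 'boxes'])
```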
## DeformableDetrConfig
......
......@@ -1699,15 +1699,15 @@ class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel):
>>> # convert outputs (bounding boxes and class logits) to COCO API
>>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]
>>> results = feature_extractor.post_process_object_detection(
... outputs, threshold=0.5, target_sizes=target_sizes
... )[0]
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
... box = [round(i, 2) for i in box.tolist()]
... # let's only keep detections with score > 0.5
... if score > 0.5:
... print(
... f"Detected {model.config.id2label[label.item()]} with confidence "
... f"{round(score.item(), 3)} at location {box}"
... )
... print(
... f"Detected {model.config.id2label[label.item()]} with confidence "
... f"{round(score.item(), 3)} at location {box}"
... )
Detected remote with confidence 0.833 at location [38.31, 72.1, 177.63, 118.45]
Detected cat with confidence 0.831 at location [9.2, 51.38, 321.13, 469.0]
Detected cat with confidence 0.804 at location [340.3, 16.85, 642.93, 370.95]
......@@ -1897,17 +1897,13 @@ class ConditionalDetrForSegmentation(ConditionalDetrPreTrainedModel):
>>> # forward pass
>>> outputs = model(**inputs)
>>> # use the `post_process_panoptic` method of `ConditionalDetrFeatureExtractor` to convert to COCO format
>>> processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0)
>>> result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0]
>>> # the segmentation is stored in a special-format png
>>> panoptic_seg = Image.open(io.BytesIO(result["png_string"]))
>>> panoptic_seg = numpy.array(panoptic_seg, dtype=numpy.uint8)
>>> # retrieve the ids corresponding to each mask
>>> panoptic_seg_id = rgb_to_id(panoptic_seg)
>>> panoptic_seg_id.shape
(800, 1066)
>>> # Use the `post_process_panoptic_segmentation` method of `ConditionalDetrFeatureExtractor` to retrieve post-processed panoptic segmentation maps
>>> # Segmentation results are returned as a list of dictionaries
>>> result = feature_extractor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])
>>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
>>> panoptic_seg = result[0]["segmentation"]
>>> # Get prediction score and segment_id to class_id mapping of each segment
>>> panoptic_segments_info = result[0]["segments_info"]
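>>> # Illustrative follow-up (an assumption, not checked output of this example): each entry in
>>> # `panoptic_segments_info` is a dict describing one segment; assuming it carries a "label_id"
>>> # key, the predicted class names could be recovered with
>>> # `[model.config.id2label[segment["label_id"]] for segment in panoptic_segments_info]`.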
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......
......@@ -14,11 +14,9 @@
# limitations under the License.
"""Feature extractor class for Deformable DETR."""
import io
import pathlib
import warnings
from collections import defaultdict
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from PIL import Image
......@@ -707,6 +705,12 @@ class DeformableDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtract
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
warnings.warn(
"`post_process` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_object_detection`.",
FutureWarning,
)
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
if len(out_logits) != len(target_sizes):
......@@ -731,240 +735,56 @@ class DeformableDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtract
return results
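The `FutureWarning` added above points callers at `post_process_object_detection`. A hedged migration sketch, assuming `feature_extractor`, `outputs` and `target_sizes` are already defined as in the docstring examples elsewhere in this diff:

```python
# before (deprecated, now emits a FutureWarning):
results = feature_extractor.post_process(outputs, target_sizes=target_sizes)

# after: score filtering is built in via the `threshold` argument
results = feature_extractor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=target_sizes
)
```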
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation with Detr->DeformableDetr
def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
"""
Converts the output of [`DeformableDetrForSegmentation`] into image segmentation predictions. Only supports
PyTorch.
Parameters:
outputs ([`DeformableDetrSegmentationOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
threshold (`float`, *optional*, defaults to 0.9):
Threshold to use to filter out queries.
mask_threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
in the batch as predicted by the model.
"""
warnings.warn(
"`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_semantic_segmentation`.",
FutureWarning,
)
out_logits, raw_masks = outputs.logits, outputs.pred_masks
preds = []
def to_tuple(tup):
if isinstance(tup, tuple):
return tup
return tuple(tup.cpu().tolist())
for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
# we filter empty queries and detection below threshold
scores, labels = cur_logits.softmax(-1).max(-1)
keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold)
cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
cur_scores = cur_scores[keep]
cur_classes = cur_classes[keep]
cur_masks = cur_masks[keep]
cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks}
preds.append(predictions)
return preds
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_instance with Detr->DeformableDetr
def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
):
"""
Converts the output of [`DeformableDetrForSegmentation`] into actual instance segmentation predictions. Only
Converts the output of [`DeformableDetrForObjectDetection`] into the format expected by the COCO api. Only
supports PyTorch.
Args:
results (`List[Dict]`):
Results list obtained by [`~DeformableDetrFeatureExtractor.post_process`], to which "masks" results
will be added.
outputs ([`DeformableDetrSegmentationOutput`]):
outputs ([`DeformableDetrObjectDetectionOutput`]):
Raw outputs of the model.
orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
image size (before any data augmentation).
max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
original image size (before any data augmentation).
threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
threshold (`float`, *optional*):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*, defaults to `None`):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
(height, width) of each image in the batch. If left to None, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks for an
image in the batch as predicted by the model.
"""
warnings.warn(
"`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_instance_segmentation`.",
FutureWarning,
)
if len(orig_target_sizes) != len(max_target_sizes):
raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
max_h, max_w = max_target_sizes.max(0)[0].tolist()
outputs_masks = outputs.pred_masks.squeeze(2)
outputs_masks = nn.functional.interpolate(
outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False
)
outputs_masks = (outputs_masks.sigmoid() > threshold).cpu()
for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)):
img_h, img_w = t[0], t[1]
results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1)
results[i]["masks"] = nn.functional.interpolate(
results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest"
).byte()
return results
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_panoptic with Detr->DeformableDetr
def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
Converts the output of [`DeformableDetrForSegmentation`] into actual panoptic predictions. Only supports
PyTorch.
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
Parameters:
outputs ([`DeformableDetrSegmentationOutput`]):
Raw outputs of the model.
processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
augmentation but before batching.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to
None, it will default to the `processed_sizes`.
is_thing_map (`Dict[int, bool]`, *optional*):
Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
If not set, defaults to the `is_thing_map` of COCO panoptic.
threshold (`float`, *optional*, defaults to 0.85):
Threshold to use to filter out queries.
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
an image in the batch as predicted by the model.
"""
warnings.warn(
"`post_process_panoptic` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_panoptic_segmentation`.",
FutureWarning,
)
if target_sizes is None:
target_sizes = processed_sizes
if len(processed_sizes) != len(target_sizes):
raise ValueError("Make sure to pass in as many processed_sizes as target_sizes")
prob = out_logits.sigmoid()
topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1)
scores = topk_values
topk_boxes = topk_indexes // out_logits.shape[2]
labels = topk_indexes % out_logits.shape[2]
boxes = center_to_corners_format(out_bbox)
boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
if is_thing_map is None:
# default to is_thing_map of COCO panoptic
is_thing_map = {i: i <= 90 for i in range(201)}
# and from relative [0, 1] to absolute [0, height] coordinates
if isinstance(target_sizes, List):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes
if not len(out_logits) == len(raw_masks) == len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits and masks"
)
preds = []
def to_tuple(tup):
if isinstance(tup, tuple):
return tup
return tuple(tup.cpu().tolist())
for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
):
# we filter empty queries and detection below threshold
scores, labels = cur_logits.softmax(-1).max(-1)
keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold)
cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
cur_scores = cur_scores[keep]
cur_classes = cur_classes[keep]
cur_masks = cur_masks[keep]
cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
cur_boxes = center_to_corners_format(cur_boxes[keep])
h, w = cur_masks.shape[-2:]
if len(cur_boxes) != len(cur_classes):
raise ValueError("Not as many boxes as there are classes")
# It may be that we have several predicted masks for the same stuff class.
# In the following, we track the list of masks ids for each stuff class (they are merged later on)
cur_masks = cur_masks.flatten(1)
stuff_equiv_classes = defaultdict(lambda: [])
for k, label in enumerate(cur_classes):
if not is_thing_map[label.item()]:
stuff_equiv_classes[label.item()].append(k)
def get_ids_area(masks, scores, dedup=False):
# This helper function creates the final panoptic segmentation image
# It also returns the area of the masks that appears on the image
m_id = masks.transpose(0, 1).softmax(-1)
if m_id.shape[-1] == 0:
# We didn't detect any mask :(
m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
else:
m_id = m_id.argmax(-1).view(h, w)
if dedup:
# Merge the masks corresponding to the same stuff class
for equiv in stuff_equiv_classes.values():
if len(equiv) > 1:
for eq_id in equiv:
m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
final_h, final_w = to_tuple(target_size)
seg_img = Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy()))
seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST)
np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
np_seg_img = np_seg_img.view(final_h, final_w, 3)
np_seg_img = np_seg_img.numpy()
m_id = torch.from_numpy(rgb_to_id(np_seg_img))
area = []
for i in range(len(scores)):
area.append(m_id.eq(i).sum().item())
return area, seg_img
area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
if cur_classes.numel() > 0:
# We now filter empty masks as long as we find some
while True:
filtered_small = torch.as_tensor(
[area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device
)
if filtered_small.any().item():
cur_scores = cur_scores[~filtered_small]
cur_classes = cur_classes[~filtered_small]
cur_masks = cur_masks[~filtered_small]
area, seg_img = get_ids_area(cur_masks, cur_scores)
else:
break
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
else:
cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device)
segments_info = []
for i, a in enumerate(area):
cat = cur_classes[i].item()
segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a})
del cur_classes
with io.BytesIO() as out:
seg_img.save(out, format="PNG")
predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
preds.append(predictions)
return preds
return results
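As context for the method above, here is a minimal, self-contained sketch of the geometry it applies, i.e. what `center_to_corners_format` plus the `scale_fct` multiplication do to one normalized box (the helper below is a stand-in written for this sketch, not the library function):

```python
import torch


def center_to_corners(boxes: torch.Tensor) -> torch.Tensor:
    # boxes: (..., 4) as (center_x, center_y, width, height), normalized to [0, 1]
    center_x, center_y, width, height = boxes.unbind(-1)
    return torch.stack(
        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
        dim=-1,
    )


boxes = torch.tensor([[0.5, 0.5, 0.2, 0.4]])  # one normalized box
img_h, img_w = 480, 640  # original image size (height, width)
scale_fct = torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float)
corners = center_to_corners(boxes) * scale_fct  # absolute (x_min, y_min, x_max, y_max)
print(corners)  # tensor([[256., 144., 384., 336.]])
```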
......@@ -236,8 +236,8 @@ class DeformableDetrObjectDetectionOutput(ModelOutput):
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~AutoFeatureExtractor.post_process`] to retrieve the unnormalized bounding
boxes.
possible padding). You can use [`~AutoFeatureExtractor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
......@@ -1878,15 +1878,15 @@ class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
>>> # convert outputs (bounding boxes and class logits) to COCO API
>>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]
>>> results = feature_extractor.post_process_object_detection(
... outputs, threshold=0.5, target_sizes=target_sizes
... )[0]
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
... box = [round(i, 2) for i in box.tolist()]
... # let's only keep detections with score > 0.5
... if score > 0.5:
... print(
... f"Detected {model.config.id2label[label.item()]} with confidence "
... f"{round(score.item(), 3)} at location {box}"
... )
... print(
... f"Detected {model.config.id2label[label.item()]} with confidence "
... f"{round(score.item(), 3)} at location {box}"
... )
Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78]
Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25]
Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25]
......
......@@ -878,7 +878,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
"""
Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
Parameters:
Args:
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
......@@ -974,7 +974,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
"""
Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch.
Parameters:
Args:
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
......@@ -1165,13 +1165,15 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple[int, int]] = None):
"""
Args:
Converts the output of [`DetrForSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Args:
outputs ([`DetrForSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple[int, int]]`, *optional*, defaults to `None`):
A list of tuples (`Tuple[int, int]`) containing the target size (height, width) of each image in the
batch. If left to None, predictions will not be resized.
Returns:
`List[torch.Tensor]`:
A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
......@@ -1219,8 +1221,9 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
return_coco_annotation: Optional[bool] = False,
) -> List[Dict]:
"""
Args:
Converts the output of [`DetrForSegmentation`] into instance segmentation predictions. Only supports PyTorch.
Args:
outputs ([`DetrForSegmentation`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.5):
......@@ -1236,6 +1239,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
return_coco_annotation (`bool`, *optional*):
Defaults to `False`. If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE)
format.
Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
......@@ -1301,9 +1305,10 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
target_sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Dict]:
"""
Args:
Converts the output of [`DetrForSegmentation`] into image panoptic segmentation predictions. Only supports
PyTorch.
Args:
outputs ([`DetrForSegmentation`]):
The outputs from [`DetrForSegmentation`].
threshold (`float`, *optional*, defaults to 0.5):
......@@ -1321,6 +1326,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
List of length (batch_size), where each list item (`Tuple[int, int]`) corresponds to the requested
final size (height, width) of each prediction in batch. If left to None, predictions will not be
resized.
Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- a tensor of shape `(height, width)` where each pixel represents a `segment_id` or
......
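The docstring fixes above touch the new-style segmentation post-processors on `DetrFeatureExtractor`. A hedged usage sketch; the checkpoint name and image URL are illustrative:

```python
import requests
import torch
from PIL import Image

from transformers import DetrFeatureExtractor, DetrForSegmentation

checkpoint = "facebook/detr-resnet-50-panoptic"  # illustrative checkpoint
feature_extractor = DetrFeatureExtractor.from_pretrained(checkpoint)
model = DetrForSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = feature_extractor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

target_sizes = [image.size[::-1]]  # (height, width) of the original image
semantic_map = feature_extractor.post_process_semantic_segmentation(outputs, target_sizes=target_sizes)[0]
panoptic = feature_extractor.post_process_panoptic_segmentation(outputs, target_sizes=target_sizes)[0]
print(semantic_map.shape, panoptic["segmentation"].shape, len(panoptic["segments_info"]))
```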
......@@ -699,53 +699,6 @@ class YolosFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin)
results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
return results
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_segmentation
def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
"""
Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only supports PyTorch.
Parameters:
outputs ([`DetrSegmentationOutput`]):
Raw outputs of the model.
target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
threshold (`float`, *optional*, defaults to 0.9):
Threshold to use to filter out queries.
mask_threshold (`float`, *optional*, defaults to 0.5):
Threshold to use when turning the predicted masks into binary values.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an image
in the batch as predicted by the model.
"""
warnings.warn(
"`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
" `post_process_semantic_segmentation`.",
FutureWarning,
)
out_logits, raw_masks = outputs.logits, outputs.pred_masks
preds = []
def to_tuple(tup):
if isinstance(tup, tuple):
return tup
return tuple(tup.cpu().tolist())
for cur_logits, cur_masks, size in zip(out_logits, raw_masks, target_sizes):
# we filter empty queries and detection below threshold
scores, labels = cur_logits.softmax(-1).max(-1)
keep = labels.ne(outputs.logits.shape[-1] - 1) & (scores > threshold)
cur_scores, cur_classes = cur_logits.softmax(-1).max(-1)
cur_scores = cur_scores[keep]
cur_classes = cur_classes[keep]
cur_masks = cur_masks[keep]
cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
cur_masks = (cur_masks.sigmoid() > mask_threshold) * 1
predictions = {"scores": cur_scores, "labels": cur_classes, "masks": cur_masks}
preds.append(predictions)
return preds
# Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor.post_process_object_detection with Detr->Yolos
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None
......
......@@ -303,7 +303,6 @@ class ConditionalDetrFeatureExtractionTest(FeatureExtractionSavingTestMixin, uni
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
# encode them
# TODO replace by .from_pretrained microsoft/conditional-detr-resnet-50-panoptic
feature_extractor = ConditionalDetrFeatureExtractor(format="coco_panoptic")
encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
......
......@@ -492,6 +492,7 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
with torch.no_grad():
outputs = model(pixel_values, pixel_mask)
# verify logits + box predictions
expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.num_labels))
self.assertEqual(outputs.logits.shape, expected_shape_logits)
expected_slice_logits = torch.tensor(
......@@ -505,3 +506,16 @@ class ConditionalDetrModelIntegrationTests(unittest.TestCase):
[[0.7733, 0.6576, 0.4496], [0.5171, 0.1184, 0.9094], [0.8846, 0.5647, 0.2486]]
).to(torch_device)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
# verify postprocessing
results = feature_extractor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.8330, 0.8313, 0.8039, 0.6829, 0.5355])
expected_labels = [75, 17, 17, 75, 63]
expected_slice_boxes = torch.tensor([38.3089, 72.1022, 177.6293, 118.4512])
self.assertEqual(len(results["scores"]), 5)
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
......@@ -565,6 +565,19 @@ class DeformableDetrModelIntegrationTests(unittest.TestCase):
self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
# verify postprocessing
results = feature_extractor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382])
expected_labels = [17, 17, 75, 75, 63]
expected_slice_boxes = torch.tensor([16.5028, 52.8390, 318.2544, 470.7841])
self.assertEqual(len(results["scores"]), 5)
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
def test_inference_object_detection_head_with_box_refine_two_stage(self):
model = DeformableDetrForObjectDetection.from_pretrained(
"SenseTime/deformable-detr-with-box-refine-two-stage"
......