Unverified Commit bd4b83e1 authored by amyeroberts's avatar amyeroberts Committed by GitHub
Browse files

[`DETR`] Update the processing to adapt masks & bboxes to reflect padding (#28363)

* Update the processing so bbox coords are adjusted for padding

* Just pad masks

* Tidy up, add tests

* Better tests

* Fix yolos and mark as slow for pycocotols

* Fix yolos - return_tensors

* Clarify padding and normalization behaviour
parent 3de6a6b4
......@@ -280,7 +280,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
**kwargs,
)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
......@@ -308,7 +308,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
)
return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
......
......@@ -785,9 +785,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
......@@ -804,6 +809,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
......@@ -822,6 +828,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
......@@ -830,6 +840,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
......@@ -1007,18 +1018,64 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format.
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
......@@ -1037,25 +1094,33 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
images (List[`np.ndarray`]):
Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
......@@ -1071,19 +1136,29 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=update_bboxes,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)
data = {"pixel_values": padded_images}
if return_pixel_mask:
......@@ -1093,7 +1168,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
]
data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]
return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
def preprocess(
......@@ -1108,6 +1190,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
......@@ -1151,12 +1234,17 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Rescale factor to use when rescaling the image.
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
Whether to normalize the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image.
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
......@@ -1197,6 +1285,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format
......@@ -1300,29 +1391,34 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_convert_annotations and annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad(
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
)
else:
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
return encoded_inputs
......
......@@ -783,9 +783,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
......@@ -802,6 +807,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
......@@ -820,6 +826,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
......@@ -828,6 +838,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
......@@ -1005,18 +1016,64 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format.
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
......@@ -1035,25 +1092,33 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
images (List[`np.ndarray`]):
Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
......@@ -1069,19 +1134,29 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=update_bboxes,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)
data = {"pixel_values": padded_images}
if return_pixel_mask:
......@@ -1091,7 +1166,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
]
data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]
return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
def preprocess(
......@@ -1106,6 +1188,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
......@@ -1149,12 +1232,17 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
Rescale factor to use when rescaling the image.
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
Whether to normalize the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image.
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
......@@ -1195,6 +1283,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format
......@@ -1298,29 +1389,34 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_convert_annotations and annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad(
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
)
else:
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
return encoded_inputs
......
......@@ -35,6 +35,7 @@ from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
......@@ -492,9 +493,14 @@ class DetaImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
......@@ -510,6 +516,7 @@ class DetaImageProcessor(BaseImageProcessor):
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: bool = True,
do_pad: bool = True,
**kwargs,
) -> None:
......@@ -519,6 +526,9 @@ class DetaImageProcessor(BaseImageProcessor):
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, default_to_square=False)
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
......@@ -527,6 +537,7 @@ class DetaImageProcessor(BaseImageProcessor):
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
......@@ -680,18 +691,64 @@ class DetaImageProcessor(BaseImageProcessor):
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format.
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
......@@ -710,25 +767,33 @@ class DetaImageProcessor(BaseImageProcessor):
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
images (List[`np.ndarray`]):
Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
......@@ -744,19 +809,29 @@ class DetaImageProcessor(BaseImageProcessor):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=update_bboxes,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)
data = {"pixel_values": padded_images}
if return_pixel_mask:
......@@ -766,7 +841,14 @@ class DetaImageProcessor(BaseImageProcessor):
]
data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]
return encoded_inputs
def preprocess(
self,
......@@ -782,6 +864,7 @@ class DetaImageProcessor(BaseImageProcessor):
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
......@@ -827,8 +910,13 @@ class DetaImageProcessor(BaseImageProcessor):
Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image.
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
......@@ -861,6 +949,9 @@ class DetaImageProcessor(BaseImageProcessor):
do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format
......@@ -964,29 +1055,34 @@ class DetaImageProcessor(BaseImageProcessor):
images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_convert_annotations and annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad(
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
)
else:
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
return encoded_inputs
......
......@@ -760,7 +760,7 @@ class DetrImageProcessor(BaseImageProcessor):
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
do_normalize (`bool`, *optional*, defaults to True):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
......@@ -769,9 +769,14 @@ class DetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
......@@ -787,6 +792,7 @@ class DetrImageProcessor(BaseImageProcessor):
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
......@@ -805,6 +811,10 @@ class DetrImageProcessor(BaseImageProcessor):
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
......@@ -813,6 +823,7 @@ class DetrImageProcessor(BaseImageProcessor):
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
......@@ -981,17 +992,62 @@ class DetrImageProcessor(BaseImageProcessor):
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format.
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
......@@ -1010,24 +1066,32 @@ class DetrImageProcessor(BaseImageProcessor):
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
def pad(
self,
images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
images (List[`np.ndarray`]):
Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
......@@ -1043,19 +1107,29 @@ class DetrImageProcessor(BaseImageProcessor):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=update_bboxes,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)
data = {"pixel_values": padded_images}
if return_pixel_mask:
......@@ -1065,7 +1139,14 @@ class DetrImageProcessor(BaseImageProcessor):
]
data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]
return encoded_inputs
def preprocess(
self,
......@@ -1079,6 +1160,7 @@ class DetrImageProcessor(BaseImageProcessor):
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
......@@ -1122,12 +1204,17 @@ class DetrImageProcessor(BaseImageProcessor):
Rescale factor to use when rescaling the image.
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
Whether to normalize the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image.
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
......@@ -1168,6 +1255,9 @@ class DetrImageProcessor(BaseImageProcessor):
do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format
......@@ -1271,29 +1361,34 @@ class DetrImageProcessor(BaseImageProcessor):
images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_convert_annotations and annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images)
]
if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad(
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
)
else:
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
return encoded_inputs
......
......@@ -771,7 +771,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
)
return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
......@@ -799,7 +799,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
)
return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
......
......@@ -788,7 +788,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
)
return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
......@@ -816,7 +816,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
)
return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
......
......@@ -770,7 +770,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
)
return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
......@@ -798,7 +798,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
)
return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
# Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
......
......@@ -251,7 +251,6 @@ class ViltImageProcessor(BaseImageProcessor):
**kwargs,
)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
......@@ -279,7 +278,6 @@ class ViltImageProcessor(BaseImageProcessor):
)
return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
......
......@@ -696,8 +696,9 @@ class YolosImageProcessor(BaseImageProcessor):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
"""
model_input_names = ["pixel_values", "pixel_mask"]
......@@ -713,6 +714,7 @@ class YolosImageProcessor(BaseImageProcessor):
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
**kwargs,
) -> None:
......@@ -731,6 +733,10 @@ class YolosImageProcessor(BaseImageProcessor):
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
......@@ -739,6 +745,7 @@ class YolosImageProcessor(BaseImageProcessor):
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
......@@ -916,18 +923,64 @@ class YolosImageProcessor(BaseImageProcessor):
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format.
`[center_x, center_y, width, height]` format and from absolute to relative pixel values.
"""
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
......@@ -946,16 +999,22 @@ class YolosImageProcessor(BaseImageProcessor):
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
def pad(
self,
images: List[np.ndarray],
annotations: Optional[List[Dict[str, Any]]] = None,
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
......@@ -964,6 +1023,9 @@ class YolosImageProcessor(BaseImageProcessor):
Args:
image (`np.ndarray`):
Image to pad.
annotations (`List[Dict[str, any]]`, *optional*):
Annotations to pad along with the images. If provided, the bounding boxes will be updated to match the
padded images.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
......@@ -979,19 +1041,29 @@ class YolosImageProcessor(BaseImageProcessor):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [
self._pad_image(
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
annotation,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=update_bboxes,
)
for image in images
]
padded_images.append(padded_image)
padded_annotations.append(padded_annotation)
data = {"pixel_values": padded_images}
if return_pixel_mask:
......@@ -1017,6 +1089,7 @@ class YolosImageProcessor(BaseImageProcessor):
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
......@@ -1062,8 +1135,13 @@ class YolosImageProcessor(BaseImageProcessor):
Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image.
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
......@@ -1101,6 +1179,9 @@ class YolosImageProcessor(BaseImageProcessor):
do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format
......@@ -1204,26 +1285,34 @@ class YolosImageProcessor(BaseImageProcessor):
images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image))
for annotation, image in zip(annotations, images)
]
if do_convert_annotations and annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image))
for annotation, image in zip(annotations, images)
]
if do_pad:
data = self.pad(images, data_format=data_format, input_data_format=input_data_format)
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
)
else:
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
return encoded_inputs
......
......@@ -248,3 +248,246 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->ConditionalDetr, facebook/detr-resnet-50 ->microsoft/conditional-detr-resnet-50
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = ConditionalDetrImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = ConditionalDetrImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
......@@ -250,3 +250,246 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->DeformableDetr
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = DeformableDetrImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = DeformableDetrImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
......@@ -244,3 +244,246 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = DetaImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = DetaImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
......@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import pathlib
import unittest
......@@ -308,3 +307,244 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = DetrImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = DetrImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
......@@ -287,3 +287,246 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
# verify size
expected_size = torch.tensor([800, 1056])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Yolos
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = YolosImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Yolos
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = YolosImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height
# format and not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment