Unverified Commit bd4b83e1 authored by amyeroberts, committed by GitHub

[`DETR`] Update the processing to adapt masks & bboxes to reflect padding (#28363)

* Update the processing so bbox coords are adjusted for padding

* Just pad masks

* Tidy up, add tests

* Better tests

* Fix yolos and mark as slow for pycocotools

* Fix yolos - return_tensors

* Clarify padding and normalization behaviour
parent 3de6a6b4
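
A minimal usage sketch of the behaviour this commit targets (illustrative only: the image sizes, boxes, category ids and the `do_resize=False` setting below are made-up placeholders, not taken from the diff). With `do_pad=True`, images in a batch are padded bottom-right to the largest height and width, and the returned `labels` are now adjusted alongside them, so the relative `(center_x, center_y, width, height)` boxes produced when `do_convert_annotations=True` stay aligned with the padded `pixel_values`:

```python
import numpy as np
from transformers import DetrImageProcessor

# Two images of different sizes; with do_pad=True the smaller one is padded
# (bottom/right, with zeros) to the largest height and width in the batch.
images = [
    np.zeros((3, 480, 640), dtype=np.uint8),
    np.zeros((3, 400, 500), dtype=np.uint8),
]

# COCO detection style annotations (placeholder values).
annotations = [
    {"image_id": 0, "annotations": [{"bbox": [10, 10, 100, 50], "category_id": 1, "area": 5000.0, "iscrowd": 0}]},
    {"image_id": 1, "annotations": [{"bbox": [20, 30, 60, 80], "category_id": 2, "area": 4800.0, "iscrowd": 0}]},
]

image_processor = DetrImageProcessor(do_resize=False)
encoding = image_processor(images=images, annotations=annotations, return_tensors="pt")

# pixel_values are padded to the common (480, 640) size; because
# do_convert_annotations defaults to True, each label's "boxes" are
# (center_x, center_y, width, height) in [0, 1] relative to the padded size,
# and any segmentation masks are padded to match.
print(encoding["pixel_values"].shape)  # e.g. torch.Size([2, 3, 480, 640])
print(encoding["labels"][1]["boxes"])
```
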
@@ -280,7 +280,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
             **kwargs,
         )

-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
     def _pad_image(
         self,
         image: np.ndarray,
@@ -308,7 +308,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
         )
         return padded_image

-    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
     def pad(
         self,
         images: List[np.ndarray],
...
@@ -785,9 +785,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
             Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
             for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
         do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
+            Padding will be applied to the bottom and right of the image with zeros.
     """

     model_input_names = ["pixel_values", "pixel_mask"]
@@ -804,6 +809,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         do_normalize: bool = True,
         image_mean: Union[float, List[float]] = None,
         image_std: Union[float, List[float]] = None,
+        do_convert_annotations: Optional[bool] = None,
         do_pad: bool = True,
         **kwargs,
     ) -> None:
@@ -822,6 +828,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
         size = get_size_dict(size, max_size=max_size, default_to_square=False)

+        # Backwards compatibility
+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
         super().__init__(**kwargs)
         self.format = format
         self.do_resize = do_resize
@@ -830,6 +840,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         self.do_rescale = do_rescale
         self.rescale_factor = rescale_factor
         self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
@@ -1007,18 +1018,64 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
     def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
         """
         Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
         """
         return normalize_annotation(annotation, image_size=image_size)

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: Dict,
+        input_image_size: Tuple[int, int],
+        output_image_size: Tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> Dict:
+        """
+        Update the annotation for a padded image.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = pad(
+                    masks,
+                    padding,
+                    mode=PaddingMode.CONSTANT,
+                    constant_values=0,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                boxes = value
+                boxes *= np.asarray(
+                    [
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                    ]
+                )
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
     def _pad_image(
         self,
         image: np.ndarray,
         output_size: Tuple[int, int],
+        annotation: Optional[Dict[str, Any]] = None,
         constant_values: Union[float, Iterable[float]] = 0,
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
     ) -> np.ndarray:
         """
         Pad an image with zeros to the given size.
@@ -1037,25 +1094,33 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             data_format=data_format,
             input_data_format=input_data_format,
         )
-        return padded_image
+        if annotation is not None:
+            annotation = self._update_annotation_for_padded_image(
+                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+            )
+        return padded_image, annotation

     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
     def pad(
         self,
         images: List[np.ndarray],
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
         constant_values: Union[float, Iterable[float]] = 0,
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
     ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.

         Args:
-            image (`np.ndarray`):
-                Image to pad.
+            images (List[`np.ndarray`]):
+                Images to pad.
+            annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+                Annotations to transform according to the padding that is applied to the images.
             constant_values (`float` or `Iterable[float]`, *optional*):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -1071,19 +1136,29 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+                format, the bounding boxes will not be updated.
         """
         pad_size = get_max_height_width(images, input_data_format=input_data_format)

-        padded_images = [
-            self._pad_image(
+        annotation_list = annotations if annotations is not None else [None] * len(images)
+        padded_images = []
+        padded_annotations = []
+        for image, annotation in zip(images, annotation_list):
+            padded_image, padded_annotation = self._pad_image(
                 image,
                 pad_size,
+                annotation,
                 constant_values=constant_values,
                 data_format=data_format,
                 input_data_format=input_data_format,
+                update_bboxes=update_bboxes,
             )
-            for image in images
-        ]
+            padded_images.append(padded_image)
+            padded_annotations.append(padded_annotation)

         data = {"pixel_values": padded_images}
         if return_pixel_mask:
@@ -1093,7 +1168,14 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             ]
             data["pixel_mask"] = masks

-        return BatchFeature(data=data, tensor_type=return_tensors)
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+            ]
+
+        return encoded_inputs

     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
     def preprocess(
@@ -1108,6 +1190,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         do_rescale: Optional[bool] = None,
         rescale_factor: Optional[Union[int, float]] = None,
         do_normalize: Optional[bool] = None,
+        do_convert_annotations: Optional[bool] = None,
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
@@ -1151,12 +1234,17 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 Rescale factor to use when rescaling the image.
             do_normalize (`bool`, *optional*, defaults to self.do_normalize):
                 Whether to normalize the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and in relative coordinates.
             image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
                 Mean to use when normalizing the image.
             image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
+                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
             format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1197,6 +1285,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         do_normalize = self.do_normalize if do_normalize is None else do_normalize
         image_mean = self.image_mean if image_mean is None else image_mean
         image_std = self.image_std if image_std is None else image_std
+        do_convert_annotations = (
+            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+        )
         do_pad = self.do_pad if do_pad is None else do_pad
         format = self.format if format is None else format
@@ -1300,7 +1391,8 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             images = [
                 self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
             ]
-            if annotations is not None:
-                annotations = [
-                    self.normalize_annotation(annotation, get_image_size(image, input_data_format))
-                    for annotation, image in zip(annotations, images)
+
+        if do_convert_annotations and annotations is not None:
+            annotations = [
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+                for annotation, image in zip(annotations, images)
@@ -1308,17 +1400,21 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):

         if do_pad:
             # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
-            data = self.pad(
-                images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
+            encoded_inputs = self.pad(
+                images,
+                annotations=annotations,
+                return_pixel_mask=True,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                return_tensors=return_tensors,
+                update_bboxes=do_convert_annotations,
             )
         else:
             images = [
                 to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                 for image in images
             ]
-            data = {"pixel_values": images}
-
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
-        if annotations is not None:
-            encoded_inputs["labels"] = [
-                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+            if annotations is not None:
+                encoded_inputs["labels"] = [
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
...
@@ -783,9 +783,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
             Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
             for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
         do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
+            Padding will be applied to the bottom and right of the image with zeros.
     """

     model_input_names = ["pixel_values", "pixel_mask"]
@@ -802,6 +807,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         do_normalize: bool = True,
         image_mean: Union[float, List[float]] = None,
         image_std: Union[float, List[float]] = None,
+        do_convert_annotations: Optional[bool] = None,
         do_pad: bool = True,
         **kwargs,
     ) -> None:
@@ -820,6 +826,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
         size = get_size_dict(size, max_size=max_size, default_to_square=False)

+        # Backwards compatibility
+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
         super().__init__(**kwargs)
         self.format = format
         self.do_resize = do_resize
@@ -828,6 +838,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         self.do_rescale = do_rescale
         self.rescale_factor = rescale_factor
         self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
@@ -1005,18 +1016,64 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
     def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
         """
         Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
         """
         return normalize_annotation(annotation, image_size=image_size)

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: Dict,
+        input_image_size: Tuple[int, int],
+        output_image_size: Tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> Dict:
+        """
+        Update the annotation for a padded image.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = pad(
+                    masks,
+                    padding,
+                    mode=PaddingMode.CONSTANT,
+                    constant_values=0,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                boxes = value
+                boxes *= np.asarray(
+                    [
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                    ]
+                )
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
     def _pad_image(
         self,
         image: np.ndarray,
         output_size: Tuple[int, int],
+        annotation: Optional[Dict[str, Any]] = None,
         constant_values: Union[float, Iterable[float]] = 0,
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
     ) -> np.ndarray:
         """
         Pad an image with zeros to the given size.
@@ -1035,25 +1092,33 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             data_format=data_format,
             input_data_format=input_data_format,
         )
-        return padded_image
+        if annotation is not None:
+            annotation = self._update_annotation_for_padded_image(
+                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+            )
+        return padded_image, annotation

     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
     def pad(
         self,
         images: List[np.ndarray],
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
         constant_values: Union[float, Iterable[float]] = 0,
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
     ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.

         Args:
-            image (`np.ndarray`):
-                Image to pad.
+            images (List[`np.ndarray`]):
+                Images to pad.
+            annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+                Annotations to transform according to the padding that is applied to the images.
             constant_values (`float` or `Iterable[float]`, *optional*):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -1069,19 +1134,29 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+                format, the bounding boxes will not be updated.
         """
         pad_size = get_max_height_width(images, input_data_format=input_data_format)

-        padded_images = [
-            self._pad_image(
+        annotation_list = annotations if annotations is not None else [None] * len(images)
+        padded_images = []
+        padded_annotations = []
+        for image, annotation in zip(images, annotation_list):
+            padded_image, padded_annotation = self._pad_image(
                 image,
                 pad_size,
+                annotation,
                 constant_values=constant_values,
                 data_format=data_format,
                 input_data_format=input_data_format,
+                update_bboxes=update_bboxes,
             )
-            for image in images
-        ]
+            padded_images.append(padded_image)
+            padded_annotations.append(padded_annotation)

         data = {"pixel_values": padded_images}
         if return_pixel_mask:
@@ -1091,7 +1166,14 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             ]
             data["pixel_mask"] = masks

-        return BatchFeature(data=data, tensor_type=return_tensors)
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+            ]
+
+        return encoded_inputs

     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess
     def preprocess(
@@ -1106,6 +1188,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         do_rescale: Optional[bool] = None,
         rescale_factor: Optional[Union[int, float]] = None,
         do_normalize: Optional[bool] = None,
+        do_convert_annotations: Optional[bool] = None,
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
@@ -1149,12 +1232,17 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 Rescale factor to use when rescaling the image.
             do_normalize (`bool`, *optional*, defaults to self.do_normalize):
                 Whether to normalize the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and in relative coordinates.
             image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
                 Mean to use when normalizing the image.
             image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
+                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
             format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1195,6 +1283,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         do_normalize = self.do_normalize if do_normalize is None else do_normalize
         image_mean = self.image_mean if image_mean is None else image_mean
         image_std = self.image_std if image_std is None else image_std
+        do_convert_annotations = (
+            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+        )
         do_pad = self.do_pad if do_pad is None else do_pad
         format = self.format if format is None else format
@@ -1298,7 +1389,8 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             images = [
                 self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
             ]
-            if annotations is not None:
-                annotations = [
-                    self.normalize_annotation(annotation, get_image_size(image, input_data_format))
-                    for annotation, image in zip(annotations, images)
+
+        if do_convert_annotations and annotations is not None:
+            annotations = [
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+                for annotation, image in zip(annotations, images)
@@ -1306,17 +1398,21 @@ class DeformableDetrImageProcessor(BaseImageProcessor):

         if do_pad:
             # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
-            data = self.pad(
-                images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
+            encoded_inputs = self.pad(
+                images,
+                annotations=annotations,
+                return_pixel_mask=True,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                return_tensors=return_tensors,
+                update_bboxes=do_convert_annotations,
             )
         else:
             images = [
                 to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                 for image in images
             ]
-            data = {"pixel_values": images}
-
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
-        if annotations is not None:
-            encoded_inputs["labels"] = [
-                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+            if annotations is not None:
+                encoded_inputs["labels"] = [
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
...
@@ -35,6 +35,7 @@ from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
     AnnotationFormat,
+    AnnotationType,
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -492,9 +493,14 @@ class DetaImageProcessor(BaseImageProcessor):
         image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
             Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
             for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
         do_pad (`bool`, *optional*, defaults to `True`):
-            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
-            overridden by the `do_pad` parameter in the `preprocess` method.
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True` will pad the images in the batch to the largest height and width in the batch.
+            Padding will be applied to the bottom and right of the image with zeros.
     """

     model_input_names = ["pixel_values", "pixel_mask"]
@@ -510,6 +516,7 @@ class DetaImageProcessor(BaseImageProcessor):
         do_normalize: bool = True,
         image_mean: Union[float, List[float]] = None,
         image_std: Union[float, List[float]] = None,
+        do_convert_annotations: bool = True,
         do_pad: bool = True,
         **kwargs,
     ) -> None:
@@ -519,6 +526,9 @@ class DetaImageProcessor(BaseImageProcessor):
         size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
         size = get_size_dict(size, default_to_square=False)

+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
         super().__init__(**kwargs)
         self.format = format
         self.do_resize = do_resize
@@ -527,6 +537,7 @@ class DetaImageProcessor(BaseImageProcessor):
         self.do_rescale = do_rescale
         self.rescale_factor = rescale_factor
         self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
@@ -680,18 +691,64 @@ class DetaImageProcessor(BaseImageProcessor):
     def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
         """
         Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
-        `[center_x, center_y, width, height]` format.
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
         """
         return normalize_annotation(annotation, image_size=image_size)

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: Dict,
+        input_image_size: Tuple[int, int],
+        output_image_size: Tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> Dict:
+        """
+        Update the annotation for a padded image.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = pad(
+                    masks,
+                    padding,
+                    mode=PaddingMode.CONSTANT,
+                    constant_values=0,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                boxes = value
+                boxes *= np.asarray(
+                    [
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                    ]
+                )
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
     def _pad_image(
         self,
         image: np.ndarray,
         output_size: Tuple[int, int],
+        annotation: Optional[Dict[str, Any]] = None,
         constant_values: Union[float, Iterable[float]] = 0,
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
     ) -> np.ndarray:
         """
         Pad an image with zeros to the given size.
@@ -710,25 +767,33 @@ class DetaImageProcessor(BaseImageProcessor):
             data_format=data_format,
             input_data_format=input_data_format,
         )
-        return padded_image
+        if annotation is not None:
+            annotation = self._update_annotation_for_padded_image(
+                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+            )
+        return padded_image, annotation

     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
     def pad(
         self,
         images: List[np.ndarray],
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
         constant_values: Union[float, Iterable[float]] = 0,
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
     ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.

         Args:
-            image (`np.ndarray`):
-                Image to pad.
+            images (List[`np.ndarray`]):
+                Images to pad.
+            annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
+                Annotations to transform according to the padding that is applied to the images.
             constant_values (`float` or `Iterable[float]`, *optional*):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
@@ -744,19 +809,29 @@ class DetaImageProcessor(BaseImageProcessor):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+                format, the bounding boxes will not be updated.
         """
         pad_size = get_max_height_width(images, input_data_format=input_data_format)

-        padded_images = [
-            self._pad_image(
+        annotation_list = annotations if annotations is not None else [None] * len(images)
+        padded_images = []
+        padded_annotations = []
+        for image, annotation in zip(images, annotation_list):
+            padded_image, padded_annotation = self._pad_image(
                 image,
                 pad_size,
+                annotation,
                 constant_values=constant_values,
                 data_format=data_format,
                 input_data_format=input_data_format,
+                update_bboxes=update_bboxes,
             )
-            for image in images
-        ]
+            padded_images.append(padded_image)
+            padded_annotations.append(padded_annotation)

         data = {"pixel_values": padded_images}
         if return_pixel_mask:
@@ -766,7 +841,14 @@ class DetaImageProcessor(BaseImageProcessor):
             ]
             data["pixel_mask"] = masks

-        return BatchFeature(data=data, tensor_type=return_tensors)
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+            ]
+
+        return encoded_inputs

     def preprocess(
         self,
@@ -782,6 +864,7 @@ class DetaImageProcessor(BaseImageProcessor):
         do_normalize: Optional[bool] = None,
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_annotations: Optional[bool] = None,
         do_pad: Optional[bool] = None,
         format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
@@ -827,8 +910,13 @@ class DetaImageProcessor(BaseImageProcessor):
                 Mean to use when normalizing the image.
             image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                 Standard deviation to use when normalizing the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and in relative coordinates.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image.
+                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
+                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
             format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -861,6 +949,9 @@ class DetaImageProcessor(BaseImageProcessor):
         do_normalize = self.do_normalize if do_normalize is None else do_normalize
         image_mean = self.image_mean if image_mean is None else image_mean
         image_std = self.image_std if image_std is None else image_std
+        do_convert_annotations = (
+            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+        )
         do_pad = self.do_pad if do_pad is None else do_pad
         format = self.format if format is None else format
@@ -964,7 +1055,8 @@ class DetaImageProcessor(BaseImageProcessor):
             images = [
                 self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
            ]
-            if annotations is not None:
-                annotations = [
-                    self.normalize_annotation(annotation, get_image_size(image, input_data_format))
-                    for annotation, image in zip(annotations, images)
+
+        if do_convert_annotations and annotations is not None:
+            annotations = [
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+                for annotation, image in zip(annotations, images)
@@ -972,17 +1064,21 @@ class DetaImageProcessor(BaseImageProcessor):

         if do_pad:
             # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
-            data = self.pad(
-                images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format
+            encoded_inputs = self.pad(
+                images,
+                annotations=annotations,
+                return_pixel_mask=True,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                return_tensors=return_tensors,
+                update_bboxes=do_convert_annotations,
             )
         else:
             images = [
                 to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
                 for image in images
             ]
-            data = {"pixel_values": images}
-
-        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
-        if annotations is not None:
-            encoded_inputs["labels"] = [
-                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+            if annotations is not None:
+                encoded_inputs["labels"] = [
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
...
...@@ -760,7 +760,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -760,7 +760,7 @@ class DetrImageProcessor(BaseImageProcessor):
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method. `preprocess` method.
do_normalize: do_normalize (`bool`, *optional*, defaults to True):
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method. `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
...@@ -769,9 +769,14 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -769,9 +769,14 @@ class DetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`): do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
overridden by the `do_pad` parameter in the `preprocess` method. method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
...@@ -787,6 +792,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -787,6 +792,7 @@ class DetrImageProcessor(BaseImageProcessor):
do_normalize: bool = True, do_normalize: bool = True,
image_mean: Union[float, List[float]] = None, image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True, do_pad: bool = True,
**kwargs, **kwargs,
) -> None: ) -> None:
...@@ -805,6 +811,10 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -805,6 +811,10 @@ class DetrImageProcessor(BaseImageProcessor):
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False) size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs) super().__init__(**kwargs)
self.format = format self.format = format
self.do_resize = do_resize self.do_resize = do_resize
...@@ -813,6 +823,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -813,6 +823,7 @@ class DetrImageProcessor(BaseImageProcessor):
self.do_rescale = do_rescale self.do_rescale = do_rescale
self.rescale_factor = rescale_factor self.rescale_factor = rescale_factor
self.do_normalize = do_normalize self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad self.do_pad = do_pad
...@@ -981,17 +992,62 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -981,17 +992,62 @@ class DetrImageProcessor(BaseImageProcessor):
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
""" """
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format. `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
""" """
return normalize_annotation(annotation, image_size=image_size) return normalize_annotation(annotation, image_size=image_size)
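A minimal sketch of the box math behind `normalize_annotation`: corner boxes in absolute pixels become `(center_x, center_y, width, height)` boxes scaled by the image width and height. The helper name below is made up; the real function also handles the other annotation keys:

import numpy as np

def corners_to_center_relative(boxes, image_height, image_width):
    # boxes: (N, 4) array of [top_left_x, top_left_y, bottom_right_x, bottom_right_y] in pixels
    x0, y0, x1, y1 = boxes.T
    centers = np.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], axis=-1)
    return centers / np.array([image_width, image_height, image_width, image_height])

# A 200 x 100 pixel box in the top-left corner of an 800 x 1066 (height x width) image:
# corners_to_center_relative(np.array([[0, 0, 200, 100]]), 800, 1066)
# -> [[0.0938, 0.0625, 0.1876, 0.125]] (approximately)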
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
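Because the boxes reaching this method are relative to the unpadded image, padding onto a larger canvas shrinks them by the ratio of the old size to the new size in each axis. A quick numeric check with made-up sizes:

import numpy as np

boxes = np.array([[0.5, 0.5, 1.0, 1.0]])  # one box covering the whole unpadded image, (cx, cy, w, h)
input_image_size = (600, 800)    # (height, width) before padding
output_image_size = (800, 1066)  # (height, width) after padding to the batch maximum

scale = np.asarray(
    [
        input_image_size[1] / output_image_size[1],  # x terms scale by the width ratio
        input_image_size[0] / output_image_size[0],  # y terms scale by the height ratio
        input_image_size[1] / output_image_size[1],
        input_image_size[0] / output_image_size[0],
    ]
)
print(boxes * scale)  # -> approximately [[0.3752, 0.375, 0.7505, 0.75]]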
def _pad_image( def _pad_image(
self, self,
image: np.ndarray, image: np.ndarray,
output_size: Tuple[int, int], output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0, constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None, data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray: ) -> np.ndarray:
""" """
Pad an image with zeros to the given size. Pad an image with zeros to the given size.
...@@ -1010,24 +1066,32 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1010,24 +1066,32 @@ class DetrImageProcessor(BaseImageProcessor):
data_format=data_format, data_format=data_format,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
return padded_image if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
def pad( def pad(
self, self,
images: List[np.ndarray], images: List[np.ndarray],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
constant_values: Union[float, Iterable[float]] = 0, constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True, return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None, return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None, data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature: ) -> BatchFeature:
""" """
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask. in the batch and optionally returns their corresponding pixel mask.
Args: Args:
image (`np.ndarray`): images (List[`np.ndarray`]):
Image to pad. Images to pad.
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
Annotations to transform according to the padding that is applied to the images.
constant_values (`float` or `Iterable[float]`, *optional*): constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`. The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`): return_pixel_mask (`bool`, *optional*, defaults to `True`):
...@@ -1043,19 +1107,29 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1043,19 +1107,29 @@ class DetrImageProcessor(BaseImageProcessor):
The channel dimension format of the image. If not provided, it will be the same as the input image. The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*): input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred. The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative `(center_x, center_y, width, height)`
coordinates, the bounding boxes will not be updated.
""" """
pad_size = get_max_height_width(images, input_data_format=input_data_format) pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [ annotation_list = annotations if annotations is not None else [None] * len(images)
self._pad_image( padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image, image,
pad_size, pad_size,
annotation,
constant_values=constant_values, constant_values=constant_values,
data_format=data_format, data_format=data_format,
input_data_format=input_data_format, input_data_format=input_data_format,
update_bboxes=update_bboxes,
) )
for image in images padded_images.append(padded_image)
] padded_annotations.append(padded_annotation)
data = {"pixel_values": padded_images} data = {"pixel_values": padded_images}
if return_pixel_mask: if return_pixel_mask:
...@@ -1065,7 +1139,14 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1065,7 +1139,14 @@ class DetrImageProcessor(BaseImageProcessor):
] ]
data["pixel_mask"] = masks data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors) encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
]
return encoded_inputs
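A hedged usage sketch of the extended `pad`: the image shapes and annotation dicts below are invented for illustration (in practice `preprocess` builds the annotations before calling `pad`), and `image_processor` stands for an already-instantiated `DetrImageProcessor`:

import numpy as np

images = [
    np.zeros((3, 600, 800), dtype=np.float32),
    np.zeros((3, 800, 1066), dtype=np.float32),
]
annotations = [
    {"boxes": np.array([[0.5, 0.5, 1.0, 1.0]], dtype=np.float32), "class_labels": np.array([0]), "size": (600, 800)},
    {"boxes": np.array([[0.5, 0.5, 1.0, 1.0]], dtype=np.float32), "class_labels": np.array([1]), "size": (800, 1066)},
]

encoded = image_processor.pad(
    images,
    annotations=annotations,
    return_pixel_mask=True,
    return_tensors="pt",
    update_bboxes=True,
)
# encoded["pixel_values"] has shape (2, 3, 800, 1066) and encoded["pixel_mask"] shape (2, 800, 1066);
# encoded["labels"][0]["boxes"] has been rescaled to account for the padding added to the first image.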
def preprocess( def preprocess(
self, self,
...@@ -1079,6 +1160,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1079,6 +1160,7 @@ class DetrImageProcessor(BaseImageProcessor):
do_rescale: Optional[bool] = None, do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None, rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None, do_normalize: Optional[bool] = None,
do_convert_annotations: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None, image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None, do_pad: Optional[bool] = None,
...@@ -1122,12 +1204,17 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1122,12 +1204,17 @@ class DetrImageProcessor(BaseImageProcessor):
Rescale factor to use when rescaling the image. Rescale factor to use when rescaling the image.
do_normalize (`bool`, *optional*, defaults to self.do_normalize): do_normalize (`bool`, *optional*, defaults to self.do_normalize):
Whether to normalize the image. Whether to normalize the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the `(top_left_x, top_left_y, width, height)` format to `(center_x, center_y, width, height)`
and from absolute to relative coordinates.
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
Mean to use when normalizing the image. Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image. Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad): do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. Whether to pad the image. If `True`, pads the images in the batch to the largest image in the batch
and creates a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations. Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
...@@ -1168,6 +1255,9 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1168,6 +1255,9 @@ class DetrImageProcessor(BaseImageProcessor):
do_normalize = self.do_normalize if do_normalize is None else do_normalize do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format format = self.format if format is None else format
...@@ -1271,7 +1361,8 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1271,7 +1361,8 @@ class DetrImageProcessor(BaseImageProcessor):
images = [ images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
] ]
if annotations is not None:
if do_convert_annotations and annotations is not None:
annotations = [ annotations = [
self.normalize_annotation(annotation, get_image_size(image, input_data_format)) self.normalize_annotation(annotation, get_image_size(image, input_data_format))
for annotation, image in zip(annotations, images) for annotation, image in zip(annotations, images)
...@@ -1279,17 +1370,21 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1279,17 +1370,21 @@ class DetrImageProcessor(BaseImageProcessor):
if do_pad: if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad( encoded_inputs = self.pad(
images, return_pixel_mask=True, data_format=data_format, input_data_format=input_data_format images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
) )
else: else:
images = [ images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images for image in images
] ]
data = {"pixel_values": images} encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None: if annotations is not None:
encoded_inputs["labels"] = [ encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
......
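Taken together, the `preprocess` changes above mean a padded batch now carries padded segmentation masks and boxes adjusted to the padded canvas inside `labels`. A hedged end-to-end sketch; the file name is illustrative and `coco_annotations` stands for a list of COCO-format annotation dicts loaded elsewhere:

from PIL import Image
from transformers import DetrImageProcessor

processor = DetrImageProcessor()  # do_convert_annotations defaults to True here
images = [Image.open("cats.png"), Image.open("cats.png").resize((800, 800))]
annotations = [
    {"image_id": 39769, "annotations": coco_annotations},
    {"image_id": 39769, "annotations": coco_annotations},
]
encoding = processor(
    images=images,
    annotations=annotations,
    return_segmentation_masks=True,
    return_tensors="pt",
)
# encoding["pixel_values"], encoding["pixel_mask"] and each encoding["labels"][i]["masks"]
# share the padded height and width, and encoding["labels"][i]["boxes"] are relative to that padded canvas.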
...@@ -771,7 +771,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor): ...@@ -771,7 +771,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
) )
return encoded_inputs return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image( def _pad_image(
self, self,
image: np.ndarray, image: np.ndarray,
...@@ -799,7 +799,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor): ...@@ -799,7 +799,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
) )
return padded_image return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad( def pad(
self, self,
images: List[np.ndarray], images: List[np.ndarray],
......
...@@ -788,7 +788,7 @@ class MaskFormerImageProcessor(BaseImageProcessor): ...@@ -788,7 +788,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
) )
return encoded_inputs return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image( def _pad_image(
self, self,
image: np.ndarray, image: np.ndarray,
...@@ -816,7 +816,7 @@ class MaskFormerImageProcessor(BaseImageProcessor): ...@@ -816,7 +816,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
) )
return padded_image return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad( def pad(
self, self,
images: List[np.ndarray], images: List[np.ndarray],
......
...@@ -770,7 +770,7 @@ class OneFormerImageProcessor(BaseImageProcessor): ...@@ -770,7 +770,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
) )
return encoded_inputs return encoded_inputs
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
def _pad_image( def _pad_image(
self, self,
image: np.ndarray, image: np.ndarray,
...@@ -798,7 +798,7 @@ class OneFormerImageProcessor(BaseImageProcessor): ...@@ -798,7 +798,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
) )
return padded_image return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
def pad( def pad(
self, self,
images: List[np.ndarray], images: List[np.ndarray],
......
...@@ -251,7 +251,6 @@ class ViltImageProcessor(BaseImageProcessor): ...@@ -251,7 +251,6 @@ class ViltImageProcessor(BaseImageProcessor):
**kwargs, **kwargs,
) )
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image( def _pad_image(
self, self,
image: np.ndarray, image: np.ndarray,
...@@ -279,7 +278,6 @@ class ViltImageProcessor(BaseImageProcessor): ...@@ -279,7 +278,6 @@ class ViltImageProcessor(BaseImageProcessor):
) )
return padded_image return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad( def pad(
self, self,
images: List[np.ndarray], images: List[np.ndarray],
......
...@@ -696,8 +696,9 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -696,8 +696,9 @@ class YolosImageProcessor(BaseImageProcessor):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`): do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
overridden by the `do_pad` parameter in the `preprocess` method. method. If `True`, pads the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
...@@ -713,6 +714,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -713,6 +714,7 @@ class YolosImageProcessor(BaseImageProcessor):
do_normalize: bool = True, do_normalize: bool = True,
image_mean: Union[float, List[float]] = None, image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True, do_pad: bool = True,
**kwargs, **kwargs,
) -> None: ) -> None:
...@@ -731,6 +733,10 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -731,6 +733,10 @@ class YolosImageProcessor(BaseImageProcessor):
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, max_size=max_size, default_to_square=False) size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility
if do_convert_annotations is None:
do_convert_annotations = do_normalize
super().__init__(**kwargs) super().__init__(**kwargs)
self.format = format self.format = format
self.do_resize = do_resize self.do_resize = do_resize
...@@ -739,6 +745,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -739,6 +745,7 @@ class YolosImageProcessor(BaseImageProcessor):
self.do_rescale = do_rescale self.do_rescale = do_rescale
self.rescale_factor = rescale_factor self.rescale_factor = rescale_factor
self.do_normalize = do_normalize self.do_normalize = do_normalize
self.do_convert_annotations = do_convert_annotations
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad self.do_pad = do_pad
...@@ -916,18 +923,64 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -916,18 +923,64 @@ class YolosImageProcessor(BaseImageProcessor):
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
""" """
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format. `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
""" """
return normalize_annotation(annotation, image_size=image_size) return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
for key, value in annotation.items():
if key == "masks":
masks = value
masks = pad(
masks,
padding,
mode=PaddingMode.CONSTANT,
constant_values=0,
input_data_format=ChannelDimension.FIRST,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= np.asarray(
[
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
input_image_size[1] / output_image_size[1],
input_image_size[0] / output_image_size[0],
]
)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image( def _pad_image(
self, self,
image: np.ndarray, image: np.ndarray,
output_size: Tuple[int, int], output_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
constant_values: Union[float, Iterable[float]] = 0, constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None, data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> np.ndarray: ) -> np.ndarray:
""" """
Pad an image with zeros to the given size. Pad an image with zeros to the given size.
...@@ -946,16 +999,22 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -946,16 +999,22 @@ class YolosImageProcessor(BaseImageProcessor):
data_format=data_format, data_format=data_format,
input_data_format=input_data_format, input_data_format=input_data_format,
) )
return padded_image if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
)
return padded_image, annotation
def pad( def pad(
self, self,
images: List[np.ndarray], images: List[np.ndarray],
annotations: Optional[List[Dict[str, Any]]] = None,
constant_values: Union[float, Iterable[float]] = 0, constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = False, return_pixel_mask: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None, return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None, data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
) -> BatchFeature: ) -> BatchFeature:
""" """
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
...@@ -964,6 +1023,9 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -964,6 +1023,9 @@ class YolosImageProcessor(BaseImageProcessor):
Args: Args:
image (`np.ndarray`): image (`np.ndarray`):
Image to pad. Image to pad.
annotations (`List[Dict[str, any]]`, *optional*):
Annotations to pad along with the images. If provided, the bounding boxes will be updated to match the
padded images.
constant_values (`float` or `Iterable[float]`, *optional*): constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`. The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`): return_pixel_mask (`bool`, *optional*, defaults to `True`):
...@@ -979,19 +1041,29 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -979,19 +1041,29 @@ class YolosImageProcessor(BaseImageProcessor):
The channel dimension format of the image. If not provided, it will be the same as the input image. The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*): input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred. The channel dimension format of the input image. If not provided, it will be inferred.
update_bboxes (`bool`, *optional*, defaults to `True`):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative `(center_x, center_y, width, height)`
coordinates, the bounding boxes will not be updated.
""" """
pad_size = get_max_height_width(images, input_data_format=input_data_format) pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [ annotation_list = annotations if annotations is not None else [None] * len(images)
self._pad_image( padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image, image,
pad_size, pad_size,
annotation,
constant_values=constant_values, constant_values=constant_values,
data_format=data_format, data_format=data_format,
input_data_format=input_data_format, input_data_format=input_data_format,
update_bboxes=update_bboxes,
) )
for image in images padded_images.append(padded_image)
] padded_annotations.append(padded_annotation)
data = {"pixel_values": padded_images} data = {"pixel_values": padded_images}
if return_pixel_mask: if return_pixel_mask:
...@@ -1017,6 +1089,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1017,6 +1089,7 @@ class YolosImageProcessor(BaseImageProcessor):
do_normalize: Optional[bool] = None, do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None, image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: Optional[bool] = None, do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotationFormat]] = None, format: Optional[Union[str, AnnotationFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None, return_tensors: Optional[Union[TensorType, str]] = None,
...@@ -1062,8 +1135,13 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1062,8 +1135,13 @@ class YolosImageProcessor(BaseImageProcessor):
Mean to use when normalizing the image. Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image. Standard deviation to use when normalizing the image.
do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
Whether to convert the annotations to the format expected by the model. Converts the bounding
boxes from the `(top_left_x, top_left_y, width, height)` format to `(center_x, center_y, width, height)`
and from absolute to relative coordinates.
do_pad (`bool`, *optional*, defaults to self.do_pad): do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. Whether to pad the image. If `True`, pads the images in the batch to the largest image in the batch
and creates a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations. Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
...@@ -1101,6 +1179,9 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1101,6 +1179,9 @@ class YolosImageProcessor(BaseImageProcessor):
do_normalize = self.do_normalize if do_normalize is None else do_normalize do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std image_std = self.image_std if image_std is None else image_std
do_convert_annotations = (
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format format = self.format if format is None else format
...@@ -1204,22 +1285,30 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1204,22 +1285,30 @@ class YolosImageProcessor(BaseImageProcessor):
images = [ images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
] ]
if annotations is not None:
if do_convert_annotations and annotations is not None:
annotations = [ annotations = [
self.normalize_annotation(annotation, get_image_size(image)) self.normalize_annotation(annotation, get_image_size(image))
for annotation, image in zip(annotations, images) for annotation, image in zip(annotations, images)
] ]
if do_pad: if do_pad:
data = self.pad(images, data_format=data_format, input_data_format=input_data_format) # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
encoded_inputs = self.pad(
images,
annotations=annotations,
return_pixel_mask=True,
data_format=data_format,
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
)
else: else:
images = [ images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images for image in images
] ]
data = {"pixel_values": images} encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None: if annotations is not None:
encoded_inputs["labels"] = [ encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
......
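One YOLOS-specific detail: `pad` keeps `return_pixel_mask=False` as its default, so a pixel mask is only produced when the caller asks for it, which the updated `preprocess` now does. A hedged sketch reusing the `images` and `annotations` from the DETR example above; `numpy_images` stands for a list of `(channels, height, width)` arrays:

from transformers import YolosImageProcessor

processor = YolosImageProcessor()
encoding = processor(images=images, annotations=annotations, return_tensors="pt")
# preprocess calls pad(..., return_pixel_mask=True), so "pixel_mask" is part of the encoding.

bare = processor.pad(numpy_images, return_tensors="pt")
# calling pad directly with its defaults returns only "pixel_values".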
...@@ -248,3 +248,246 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess ...@@ -248,3 +248,246 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
# verify size # verify size
expected_size = torch.tensor([800, 1066]) expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->ConditionalDetr, facebook/detr-resnet-50 ->microsoft/conditional-detr-resnet-50
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = ConditionalDetrImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to the
# (centre_x, centre_y, width, height) format and are not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = ConditionalDetrImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to the
# (centre_x, centre_y, width, height) format and are not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
...@@ -250,3 +250,246 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi ...@@ -250,3 +250,246 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
# verify size # verify size
expected_size = torch.tensor([800, 1066]) expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->DeformableDetr
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = DeformableDetrImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to the
# (centre_x, centre_y, width, height) format and are not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = DeformableDetrImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to the
# (centre_x, centre_y, width, height) format and are not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
...@@ -244,3 +244,246 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi ...@@ -244,3 +244,246 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
# verify size # verify size
expected_size = torch.tensor([800, 1066]) expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = DetaImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to the
# (centre_x, centre_y, width, height) format and are not in the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = DetaImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to centre_x, centre_y, width, height
# format and are not normalized to the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
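# With do_convert_annotations=False the boxes stay in absolute (x_min, y_min, x_max, y_max)
# coordinates, so convert the normalized expected boxes back before comparing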
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
...@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import pathlib
import unittest
...@@ -308,3 +307,244 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = DetrImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
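# After resizing, the batch is padded to the largest height and width in the batch (800 x 1066 here),
# with zeros added to the bottom and right of each image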
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to centre_x, centre_y, width, height
# format and are not normalized to the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
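# With do_convert_annotations=False the boxes stay in absolute (x_min, y_min, x_max, y_max)
# coordinates, so convert the normalized expected boxes back before comparing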
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
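# Adjust the bounding boxes for the resized image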
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = DetrImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
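# After resizing, the batch is padded to the largest height and width in the batch (800 x 1066 here),
# with zeros added to the bottom and right of each image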
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to centre_x, centre_y, width, height
# format and are not normalized to the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
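# With do_convert_annotations=False the boxes stay in absolute (x_min, y_min, x_max, y_max)
# coordinates, so convert the normalized expected boxes back before comparing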
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
...@@ -287,3 +287,246 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
# verify size
expected_size = torch.tensor([800, 1056])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Yolos
def test_batched_coco_detection_annotations(self):
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
annotations_0 = {"image_id": 39769, "annotations": target}
annotations_1 = {"image_id": 39769, "annotations": target}
# Adjust the bounding boxes for the resized image
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotations_1["annotations"])):
coords = annotations_1["annotations"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotations_1["annotations"][i]["bbox"] = new_bbox
images = [image_0, image_1]
annotations = [annotations_0, annotations_1]
image_processing = YolosImageProcessor()
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
return_tensors="pt", # do_convert_annotations=True
)
# Check the pixel values have been padded
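# After resizing, the batch is padded to the largest height and width in the batch (800 x 1066 here),
# with zeros added to the bottom and right of each image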
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.6879, 0.4609, 0.0755, 0.3691],
[0.2118, 0.3359, 0.2601, 0.1566],
[0.5011, 0.5000, 0.9979, 1.0000],
[0.5010, 0.5020, 0.9979, 0.9959],
[0.3284, 0.5944, 0.5884, 0.8112],
[0.8394, 0.5445, 0.3213, 0.9110],
]
)
expected_boxes_1 = torch.tensor(
[
[0.4130, 0.2765, 0.0453, 0.2215],
[0.1272, 0.2016, 0.1561, 0.0940],
[0.3757, 0.4933, 0.7488, 0.9865],
[0.3759, 0.5002, 0.7492, 0.9955],
[0.1971, 0.5456, 0.3532, 0.8646],
[0.5790, 0.4115, 0.3430, 0.7161],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to centre_x, centre_y, width, height
# format and are not normalized to the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
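# With do_convert_annotations=False the boxes stay in absolute (x_min, y_min, x_max, y_max)
# coordinates, so convert the normalized expected boxes back before comparing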
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
@slow
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Yolos
def test_batched_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800))
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
w_0, h_0 = image_0.size
w_1, h_1 = image_1.size
for i in range(len(annotation_1["segments_info"])):
coords = annotation_1["segments_info"][i]["bbox"]
new_bbox = [
coords[0] * w_1 / w_0,
coords[1] * h_1 / h_0,
coords[2] * w_1 / w_0,
coords[3] * h_1 / h_0,
]
annotation_1["segments_info"][i]["bbox"] = new_bbox
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
images = [image_0, image_1]
annotations = [annotation_0, annotation_1]
# encode them
image_processing = YolosImageProcessor(format="coco_panoptic")
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_tensors="pt",
return_segmentation_masks=True,
)
# Check the pixel values have been padded
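# After resizing, the batch is padded to the largest height and width in the batch (800 x 1066 here),
# with zeros added to the bottom and right of each image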
postprocessed_height, postprocessed_width = 800, 1066
expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
# Check the bounding boxes have been adjusted for padded images
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
expected_boxes_0 = torch.tensor(
[
[0.2625, 0.5437, 0.4688, 0.8625],
[0.7719, 0.4104, 0.4531, 0.7125],
[0.5000, 0.4927, 0.9969, 0.9854],
[0.1688, 0.2000, 0.2063, 0.0917],
[0.5492, 0.2760, 0.0578, 0.2187],
[0.4992, 0.4990, 0.9984, 0.9979],
]
)
expected_boxes_1 = torch.tensor(
[
[0.1576, 0.3262, 0.2814, 0.5175],
[0.4634, 0.2463, 0.2720, 0.4275],
[0.3002, 0.2956, 0.5985, 0.5913],
[0.1013, 0.1200, 0.1238, 0.0550],
[0.3297, 0.1656, 0.0347, 0.1312],
[0.2997, 0.2994, 0.5994, 0.5987],
]
)
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3))
# Check the masks have also been padded
self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066]))
self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066]))
# Check that if do_convert_annotations=False, the annotations are not converted to centre_x, centre_y, width, height
# format and are not normalized to the range [0, 1]
encoding = image_processing(
images=images,
annotations=annotations,
masks_path=masks_path,
return_segmentation_masks=True,
do_convert_annotations=False,
return_tensors="pt",
)
self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4]))
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4]))
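# With do_convert_annotations=False the boxes stay in absolute (x_min, y_min, x_max, y_max)
# coordinates, so convert the normalized expected boxes back before comparing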
# Convert to absolute coordinates
unnormalized_boxes_0 = torch.vstack(
[
expected_boxes_0[:, 0] * postprocessed_width,
expected_boxes_0[:, 1] * postprocessed_height,
expected_boxes_0[:, 2] * postprocessed_width,
expected_boxes_0[:, 3] * postprocessed_height,
]
).T
unnormalized_boxes_1 = torch.vstack(
[
expected_boxes_1[:, 0] * postprocessed_width,
expected_boxes_1[:, 1] * postprocessed_height,
expected_boxes_1[:, 2] * postprocessed_width,
expected_boxes_1[:, 3] * postprocessed_height,
]
).T
# Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max
expected_boxes_0 = torch.vstack(
[
unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2,
unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2,
unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2,
]
).T
expected_boxes_1 = torch.vstack(
[
unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2,
unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2,
unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2,
]
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))