Unverified commit bf646fbf, authored by Pavel Iakubovskii, committed by GitHub

Add fixed resize and pad strategy for object detection (#30742)

* Add resize and pad strategy

* Merge get_size functions

* Add pad_size + tests to object detection models

* Fixup

* Update docstrings

* Fixup
parent e9a8041d
@@ -662,7 +662,13 @@ class BaseImageProcessor(ImageProcessingMixin):
)
VALID_SIZE_DICT_KEYS = (
{"height", "width"},
{"shortest_edge"},
{"shortest_edge", "longest_edge"},
{"longest_edge"},
{"max_height", "max_width"},
)
def is_valid_size_dict(size_dict):
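    # Editor's sketch: the hunk truncates here. A body consistent with the key
    # sets above would test for an exact key-set match (illustrative, not
    # necessarily the library's exact implementation):
    if not isinstance(size_dict, dict):
        return False
    # Set equality against each allowed combination, including the new
    # {"max_height", "max_width"} pair.
    return set(size_dict.keys()) in VALID_SIZE_DICT_KEYS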
@@ -147,6 +147,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width, preserving the
aspect ratio. Note that even if image_height < max_height and image_width < max_width, the image will still be
resized so that at least one of its edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
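As a sanity check, the scaling rule above can be reproduced standalone; this sketch mirrors the arithmetic and reproduces the docstring's two examples:

def max_hw_output_size(height: int, width: int, max_height: int, max_width: int):
    # Scale by the smaller of the two ratios so both limits hold and the
    # aspect ratio is preserved.
    min_scale = min(max_height / height, max_width / width)
    return int(height * min_scale), int(width * min_scale)

print(max_hw_output_size(100, 200, 50, 50))    # -> (25, 50)
print(max_hw_output_size(100, 200, 200, 500))  # -> (200, 400)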
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
@@ -768,8 +804,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@@ -793,8 +837,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
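To illustrate the new strategy end to end (the sizes below are illustrative assumptions, not defaults): resizing every image to fit within 480x640 and then padding to exactly 480x640 yields a fixed input shape regardless of the source image:

from transformers import ConditionalDetrImageProcessor

processor = ConditionalDetrImageProcessor(
    size={"max_height": 480, "max_width": 640},  # fit within 480x640, keeping the aspect ratio
    do_pad=True,
    pad_size={"height": 480, "width": 640},      # then pad every image to exactly 480x640
)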
@@ -813,6 +862,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@@ -846,6 +896,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
@@ -861,6 +912,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
"image_mean",
"image_std",
"do_pad",
"pad_size",
"format", "format",
"return_tensors", "return_tensors",
"data_format", "data_format",
...@@ -933,8 +985,15 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): ...@@ -933,8 +985,15 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`Dict[str, int]`): size (`Dict[str, int]`):
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
@@ -953,18 +1012,27 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size: elif "height" in size and "width" in size:
size = (size["height"], size["width"]) new_size = (size["height"], size["width"])
else: else:
raise ValueError( raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}." f" {size.keys()}."
) )
image = resize(
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
@@ -1108,6 +1176,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
@@ -1137,8 +1206,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
pad_size = get_max_height_width(images, input_data_format=input_data_format) pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@@ -1146,7 +1223,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@@ -1160,7 +1237,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@@ -1195,6 +1272,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@@ -1222,7 +1300,15 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@@ -1240,8 +1326,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1257,6 +1344,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once( logger.warning_once(
...@@ -1286,6 +1377,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): ...@@ -1286,6 +1377,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
) )
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
images = make_list_of_images(images)
@@ -1410,6 +1502,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [
@@ -145,6 +145,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width, preserving the
aspect ratio. Note that even if image_height < max_height and image_width < max_width, the image will still be
resized so that at least one of its edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
@@ -766,8 +802,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@@ -791,8 +835,13 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
@@ -811,6 +860,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@@ -844,6 +894,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
@@ -859,6 +910,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
"image_mean",
"image_std",
"do_pad",
"pad_size",
"format", "format",
"return_tensors", "return_tensors",
"data_format", "data_format",
...@@ -931,8 +983,15 @@ class DeformableDetrImageProcessor(BaseImageProcessor): ...@@ -931,8 +983,15 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`Dict[str, int]`): size (`Dict[str, int]`):
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
@@ -951,18 +1010,27 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size: elif "height" in size and "width" in size:
size = (size["height"], size["width"]) new_size = (size["height"], size["width"])
else: else:
raise ValueError( raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}." f" {size.keys()}."
) )
image = resize(
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
@@ -1106,6 +1174,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
@@ -1135,8 +1204,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
pad_size = get_max_height_width(images, input_data_format=input_data_format) pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@@ -1144,7 +1221,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@@ -1158,7 +1235,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@@ -1193,6 +1270,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@@ -1220,7 +1298,15 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@@ -1238,8 +1324,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1255,6 +1342,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once( logger.warning_once(
...@@ -1284,6 +1375,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): ...@@ -1284,6 +1375,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
) )
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
images = make_list_of_images(images)
@@ -1408,6 +1500,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [
@@ -139,6 +139,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width, preserving the
aspect ratio. Note that even if image_height < max_height and image_width < max_width, the image will still be
resized so that at least one of its edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
@@ -475,8 +511,16 @@ class DetaImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@@ -500,8 +544,13 @@ class DetaImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
@@ -519,6 +568,7 @@ class DetaImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: bool = True,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@@ -542,6 +592,7 @@ class DetaImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA
def prepare_annotation(
@@ -593,7 +644,15 @@ class DetaImageProcessor(BaseImageProcessor):
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`ChannelDimension`, *optional*):
@@ -605,18 +664,22 @@ class DetaImageProcessor(BaseImageProcessor):
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size: elif "height" in size and "width" in size:
size = (size["height"], size["width"]) new_size = (size["height"], size["width"])
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
else:
raise ValueError(
"Size must contain 'height' and 'width' keys, 'shortest_edge' and 'longest_edge' keys, or"
f" 'max_height' and 'max_width' keys. Got {size.keys()}."
)
image = resize(
image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format
)
return image
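To summarize the three branches above, a standalone sketch of the output size each size-dict form selects for a (height=100, width=200) input (the shortest/longest-edge arithmetic here is a simplification of `get_size_with_aspect_ratio`):

def pick_output_size(size: dict, height: int = 100, width: int = 200):
    if "shortest_edge" in size and "longest_edge" in size:
        # Simplified: scale the shortest edge to the target, capped by longest_edge.
        scale = min(size["shortest_edge"] / min(height, width),
                    size["longest_edge"] / max(height, width))
        return int(height * scale), int(width * scale)
    if "height" in size and "width" in size:
        return size["height"], size["width"]  # exact size, aspect ratio not preserved
    if "max_height" in size and "max_width" in size:
        scale = min(size["max_height"] / height, size["max_width"] / width)
        return int(height * scale), int(width * scale)
    raise ValueError(f"Unsupported size keys: {size.keys()}")

print(pick_output_size({"shortest_edge": 50, "longest_edge": 1333}))  # -> (50, 100)
print(pick_output_size({"height": 64, "width": 64}))                  # -> (64, 64)
print(pick_output_size({"max_height": 50, "max_width": 50}))          # -> (25, 50)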
@@ -760,6 +823,7 @@ class DetaImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
@@ -789,8 +853,16 @@ class DetaImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
pad_size = get_max_height_width(images, input_data_format=input_data_format) pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@@ -798,7 +870,7 @@ class DetaImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@@ -812,7 +884,7 @@ class DetaImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@@ -846,6 +918,7 @@ class DetaImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@@ -873,7 +946,15 @@ class DetaImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. The
aspect ratio is not preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the largest size that respects
the aspect ratio while keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the largest size that respects the
aspect ratio while keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@@ -891,8 +972,9 @@ class DetaImageProcessor(BaseImageProcessor):
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -908,6 +990,10 @@ class DetaImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once( logger.warning_once(
...@@ -929,6 +1015,7 @@ class DetaImageProcessor(BaseImageProcessor): ...@@ -929,6 +1015,7 @@ class DetaImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
) )
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
# Here, the pad() method pads to the maximum of (width, height). It does not need to be validated.
@@ -1051,6 +1138,7 @@ class DetaImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
pad_size=pad_size,
) )
else: else:
images = [ images = [
......
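The DETA wiring above follows the same pattern as the other processors in this PR: `pad_size` falls back to the instance attribute and is forwarded into `pad()`. A minimal sketch of the resulting behavior, assuming `DetaImageProcessor` is importable from the top-level `transformers` namespace and takes NumPy inputs (shapes here are illustrative, not from the source):

import numpy as np
from transformers import DetaImageProcessor

processor = DetaImageProcessor(do_resize=False, do_pad=True)
images = [np.zeros((64, 48, 3), dtype=np.uint8), np.zeros((48, 64, 3), dtype=np.uint8)]

# Default: pad to the maximum height and width of the batch (64x64 here).
out = processor(images=images, return_tensors="np")
print(out["pixel_values"].shape)  # (2, 3, 64, 64)

# Per-call override: pad everything to a fixed canvas instead.
out = processor(images=images, pad_size={"height": 96, "width": 96}, return_tensors="np")
print(out["pixel_values"].shape)  # (2, 3, 96, 96)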
...@@ -116,6 +116,41 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in ...@@ -116,6 +116,41 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
return (oh, ow) return (oh, ow)
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one edge equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
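As a sanity check on the arithmetic above, a minimal standalone sketch (hypothetical helper name, same math as `get_image_size_for_max_height_width`) reproduces the docstring examples:

def max_bounded_size(height: int, width: int, max_height: int, max_width: int) -> tuple:
    # Scale by the smaller of the two ratios so that both bounds hold
    # and at least one edge lands exactly on its maximum.
    scale = min(max_height / height, max_width / width)
    return int(height * scale), int(width * scale)

assert max_bounded_size(100, 200, 50, 50) == (25, 50)
assert max_bounded_size(100, 200, 200, 500) == (200, 400)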
def get_resize_output_image_size( def get_resize_output_image_size(
input_image: np.ndarray, input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]], size: Union[int, Tuple[int, int], List[int]],
...@@ -753,7 +788,15 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -753,7 +788,15 @@ class DetrImageProcessor(BaseImageProcessor):
overridden by the `do_resize` parameter in the `preprocess` method. overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`): do_rescale (`bool`, *optional*, defaults to `True`):
...@@ -777,8 +820,13 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -777,8 +820,13 @@ class DetrImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`): do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch. method. If `True`, padding will be applied to the bottom and right of the image with zeros.
Padding will be applied to the bottom and right of the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
...@@ -796,6 +844,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -796,6 +844,7 @@ class DetrImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None, do_convert_annotations: Optional[bool] = None,
do_pad: bool = True, do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
...@@ -829,6 +878,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -829,6 +878,7 @@ class DetrImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [ self._valid_processor_keys = [
"images", "images",
"annotations", "annotations",
...@@ -844,6 +894,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -844,6 +894,7 @@ class DetrImageProcessor(BaseImageProcessor):
"image_mean", "image_mean",
"image_std", "image_std",
"do_pad", "do_pad",
"pad_size",
"format", "format",
"return_tensors", "return_tensors",
"data_format", "data_format",
...@@ -913,8 +964,15 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -913,8 +964,15 @@ class DetrImageProcessor(BaseImageProcessor):
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`Dict[str, int]`): size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or Size of the image's `(height, width)` dimensions after resizing. Available options are:
`height` and `width`. - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
The aspect ratio is NOT preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size respecting
the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio, keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*): data_format (`str` or `ChannelDimension`, *optional*):
...@@ -933,18 +991,27 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -933,18 +991,27 @@ class DetrImageProcessor(BaseImageProcessor):
max_size = None max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False) size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size: if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size( new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
) )
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size: elif "height" in size and "width" in size:
size = (size["height"], size["width"]) new_size = (size["height"], size["width"])
else: else:
raise ValueError( raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}." f" {size.keys()}."
) )
image = resize( image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
) )
return image return image
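To illustrate the new `max_height`/`max_width` branch end to end, a minimal sketch assuming the standard top-level `DetrImageProcessor` import and a NumPy input; the expected shape follows from the scaling rule above:

import numpy as np
from transformers import DetrImageProcessor

image = np.zeros((480, 640, 3), dtype=np.uint8)  # (height, width, channels)
processor = DetrImageProcessor(size={"max_height": 512, "max_width": 512}, do_pad=False)
out = processor(images=image, return_tensors="np")
# min(512/480, 512/640) = 0.8, so 480x640 -> 384x512
print(out["pixel_values"].shape)  # (1, 3, 384, 512)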
...@@ -1083,6 +1150,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1083,6 +1150,7 @@ class DetrImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None, data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True, update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature: ) -> BatchFeature:
""" """
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
...@@ -1112,8 +1180,16 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1112,8 +1180,16 @@ class DetrImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated. format, the bounding boxes will not be updated.
""" pad_size (`Dict[str, int]`, *optional*):
pad_size = get_max_height_width(images, input_data_format=input_data_format) The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images) annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = [] padded_images = []
...@@ -1121,7 +1197,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1121,7 +1197,7 @@ class DetrImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list): for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image( padded_image, padded_annotation = self._pad_image(
image, image,
pad_size, padded_size,
annotation, annotation,
constant_values=constant_values, constant_values=constant_values,
data_format=data_format, data_format=data_format,
...@@ -1135,7 +1211,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1135,7 +1211,7 @@ class DetrImageProcessor(BaseImageProcessor):
if return_pixel_mask: if return_pixel_mask:
masks = [ masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images for image in images
] ]
data["pixel_mask"] = masks data["pixel_mask"] = masks
...@@ -1169,6 +1245,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1169,6 +1245,7 @@ class DetrImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None, return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs, **kwargs,
) -> BatchFeature: ) -> BatchFeature:
""" """
...@@ -1196,7 +1273,15 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1196,7 +1273,15 @@ class DetrImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize): do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image. Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size): size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing. Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample): resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image. Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale): do_rescale (`bool`, *optional*, defaults to self.do_rescale):
...@@ -1214,8 +1299,9 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1214,8 +1299,9 @@ class DetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image. Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad): do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch Whether to pad the image. If `True`, padding will be applied to the bottom and right of
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations. Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
...@@ -1231,6 +1317,10 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1231,6 +1317,10 @@ class DetrImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once( logger.warning_once(
...@@ -1260,6 +1350,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1260,6 +1350,7 @@ class DetrImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
) )
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format format = self.format if format is None else format
images = make_list_of_images(images) images = make_list_of_images(images)
...@@ -1384,6 +1475,7 @@ class DetrImageProcessor(BaseImageProcessor): ...@@ -1384,6 +1475,7 @@ class DetrImageProcessor(BaseImageProcessor):
input_data_format=input_data_format, input_data_format=input_data_format,
update_bboxes=do_convert_annotations, update_bboxes=do_convert_annotations,
return_tensors=return_tensors, return_tensors=return_tensors,
pad_size=pad_size,
) )
else: else:
images = [ images = [
......
...@@ -152,6 +152,42 @@ def get_resize_output_image_size( ...@@ -152,6 +152,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size) return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one edge equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable: def get_numpy_to_framework_fn(arr) -> Callable:
""" """
...@@ -773,8 +809,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -773,8 +809,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method. overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
the `preprocess` method. in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling filter to use if resizing the image. Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`): do_rescale (`bool`, *optional*, defaults to `True`):
...@@ -798,8 +842,14 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -798,8 +842,14 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`): do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
overridden by the `do_pad` parameter in the `preprocess` method. method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
...@@ -818,6 +868,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -818,6 +868,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None, do_convert_annotations: Optional[bool] = None,
do_pad: bool = True, do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
...@@ -851,6 +902,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -851,6 +902,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [ self._valid_processor_keys = [
"images", "images",
"annotations", "annotations",
...@@ -866,6 +918,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -866,6 +918,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
"image_mean", "image_mean",
"image_std", "image_std",
"do_pad", "do_pad",
"pad_size",
"format", "format",
"return_tensors", "return_tensors",
"data_format", "data_format",
...@@ -938,8 +991,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -938,8 +991,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`Dict[str, int]`): size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or Size of the image's `(height, width)` dimensions after resizing. Available options are:
`height` and `width`. - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
The aspect ratio is NOT preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size respecting
the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio, keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*): data_format (`str` or `ChannelDimension`, *optional*):
...@@ -958,18 +1018,27 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -958,18 +1018,27 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
max_size = None max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False) size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size: if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size( new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
) )
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size: elif "height" in size and "width" in size:
size = (size["height"], size["width"]) new_size = (size["height"], size["width"])
else: else:
raise ValueError( raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}." f" {size.keys()}."
) )
image = resize( image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
) )
return image return image
...@@ -1113,6 +1182,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1113,6 +1182,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None, data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True, update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature: ) -> BatchFeature:
""" """
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
...@@ -1142,8 +1212,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1142,8 +1212,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated. format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
pad_size = get_max_height_width(images, input_data_format=input_data_format) pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images) annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = [] padded_images = []
...@@ -1151,7 +1229,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1151,7 +1229,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list): for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image( padded_image, padded_annotation = self._pad_image(
image, image,
pad_size, padded_size,
annotation, annotation,
constant_values=constant_values, constant_values=constant_values,
data_format=data_format, data_format=data_format,
...@@ -1165,7 +1243,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1165,7 +1243,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
if return_pixel_mask: if return_pixel_mask:
masks = [ masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images for image in images
] ]
data["pixel_mask"] = masks data["pixel_mask"] = masks
...@@ -1200,6 +1278,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1200,6 +1278,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None, return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs, **kwargs,
) -> BatchFeature: ) -> BatchFeature:
""" """
...@@ -1227,7 +1306,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1227,7 +1306,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize): do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image. Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size): size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing. Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample): resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image. Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale): do_rescale (`bool`, *optional*, defaults to self.do_rescale):
...@@ -1245,8 +1332,9 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1245,8 +1332,9 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image. Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad): do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch Whether to pad the image. If `True`, padding will be applied to the bottom and right of
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations. Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
...@@ -1262,6 +1350,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1262,6 +1350,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once( logger.warning_once(
...@@ -1291,6 +1383,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1291,6 +1383,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
) )
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format format = self.format if format is None else format
images = make_list_of_images(images) images = make_list_of_images(images)
...@@ -1415,6 +1508,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): ...@@ -1415,6 +1508,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
input_data_format=input_data_format, input_data_format=input_data_format,
update_bboxes=do_convert_annotations, update_bboxes=do_convert_annotations,
return_tensors=return_tensors, return_tensors=return_tensors,
pad_size=pad_size,
) )
else: else:
images = [ images = [
......
...@@ -133,6 +133,42 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in ...@@ -133,6 +133,42 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
return (height, width) return (height, width)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one edge equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size # Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size( def get_resize_output_image_size(
input_image: np.ndarray, input_image: np.ndarray,
...@@ -678,8 +714,16 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -678,8 +714,16 @@ class YolosImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method. overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
the `preprocess` method. in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`): do_rescale (`bool`, *optional*, defaults to `True`):
...@@ -699,8 +743,13 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -699,8 +743,13 @@ class YolosImageProcessor(BaseImageProcessor):
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`): do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch. method. If `True`, padding will be applied to the bottom and right of the image with zeros.
Padding will be applied to the bottom and right of the image with zeros. If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
model_input_names = ["pixel_values", "pixel_mask"] model_input_names = ["pixel_values", "pixel_mask"]
...@@ -718,6 +767,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -718,6 +767,7 @@ class YolosImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None, do_convert_annotations: Optional[bool] = None,
do_pad: bool = True, do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
...@@ -751,6 +801,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -751,6 +801,7 @@ class YolosImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [ self._valid_processor_keys = [
"images", "images",
"annotations", "annotations",
...@@ -766,6 +817,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -766,6 +817,7 @@ class YolosImageProcessor(BaseImageProcessor):
"image_std", "image_std",
"do_convert_annotations", "do_convert_annotations",
"do_pad", "do_pad",
"pad_size",
"format", "format",
"return_tensors", "return_tensors",
"data_format", "data_format",
...@@ -838,8 +890,15 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -838,8 +890,15 @@ class YolosImageProcessor(BaseImageProcessor):
image (`np.ndarray`): image (`np.ndarray`):
Image to resize. Image to resize.
size (`Dict[str, int]`): size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or Size of the image's `(height, width)` dimensions after resizing. Available options are:
`height` and `width`. - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
The aspect ratio is NOT preserved.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size respecting
the aspect ratio, keeping the shortest edge less than or equal to `shortest_edge` and the longest edge
less than or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio, keeping the height less than or equal to `max_height` and the width less than or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*): data_format (`str` or `ChannelDimension`, *optional*):
...@@ -858,18 +917,27 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -858,18 +917,27 @@ class YolosImageProcessor(BaseImageProcessor):
max_size = None max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False) size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size: if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size( new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
) )
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size: elif "height" in size and "width" in size:
size = (size["height"], size["width"]) new_size = (size["height"], size["width"])
else: else:
raise ValueError( raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}." f" {size.keys()}."
) )
image = resize( image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
) )
return image return image
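For completeness, the `{"height", "width"}` branch forces an exact output size without preserving the aspect ratio, and the size dict can also be passed per call. A sketch assuming a NumPy input:

import numpy as np
from transformers import YolosImageProcessor

processor = YolosImageProcessor(do_pad=False)
image = np.zeros((120, 80, 3), dtype=np.uint8)
out = processor(images=image, size={"height": 64, "width": 64}, return_tensors="np")
print(out["pixel_values"].shape)  # (1, 3, 64, 64), aspect ratio not kept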
...@@ -1012,6 +1080,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1012,6 +1080,7 @@ class YolosImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None, data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True, update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature: ) -> BatchFeature:
""" """
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
...@@ -1042,8 +1111,16 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1042,8 +1111,16 @@ class YolosImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated. format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
pad_size = get_max_height_width(images, input_data_format=input_data_format) pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images) annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = [] padded_images = []
...@@ -1051,7 +1128,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1051,7 +1128,7 @@ class YolosImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list): for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image( padded_image, padded_annotation = self._pad_image(
image, image,
pad_size, padded_size,
annotation, annotation,
constant_values=constant_values, constant_values=constant_values,
data_format=data_format, data_format=data_format,
...@@ -1065,7 +1142,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1065,7 +1142,7 @@ class YolosImageProcessor(BaseImageProcessor):
if return_pixel_mask: if return_pixel_mask:
masks = [ masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images for image in images
] ]
data["pixel_mask"] = masks data["pixel_mask"] = masks
...@@ -1099,6 +1176,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1099,6 +1176,7 @@ class YolosImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None, return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs, **kwargs,
) -> BatchFeature: ) -> BatchFeature:
""" """
...@@ -1126,7 +1204,15 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1126,7 +1204,15 @@ class YolosImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize): do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image. Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size): size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing. Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample): resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image. Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale): do_rescale (`bool`, *optional*, defaults to self.do_rescale):
...@@ -1144,8 +1230,9 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1144,8 +1230,9 @@ class YolosImageProcessor(BaseImageProcessor):
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates. and in relative coordinates.
do_pad (`bool`, *optional*, defaults to self.do_pad): do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch Whether to pad the image. If `True`, padding will be applied to the bottom and right of
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations. Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
...@@ -1158,6 +1245,10 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1158,6 +1245,10 @@ class YolosImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
""" """
if "pad_and_return_pixel_mask" in kwargs: if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once( logger.warning_once(
...@@ -1187,6 +1278,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1187,6 +1278,7 @@ class YolosImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
) )
do_pad = self.do_pad if do_pad is None else do_pad do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format format = self.format if format is None else format
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
...@@ -1310,6 +1402,7 @@ class YolosImageProcessor(BaseImageProcessor): ...@@ -1310,6 +1402,7 @@ class YolosImageProcessor(BaseImageProcessor):
input_data_format=input_data_format, input_data_format=input_data_format,
update_bboxes=do_convert_annotations, update_bboxes=do_convert_annotations,
return_tensors=return_tensors, return_tensors=return_tensors,
pad_size=pad_size,
) )
else: else:
images = [ images = [
......
...@@ -490,3 +490,50 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess ...@@ -490,3 +490,50 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
).T ).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
# Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
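The shape expectations in the test above follow from scaling by the most restrictive edge. A self-contained sketch of that rule, under the assumption it mirrors the processors' resize math; the helper name is ours, for illustration only:

def resize_for_max_height_width(height: int, width: int, max_height: int, max_width: int):
    # Pick the smaller scale factor so the result fits inside
    # (max_height, max_width) while keeping the aspect ratio.
    min_scale = min(max_height / height, max_width / width)
    return int(height * min_scale), int(width * min_scale)

assert resize_for_max_height_width(200, 100, 100, 100) == (100, 50)   # first case above
assert resize_for_max_height_width(200, 100, 300, 100) == (200, 100)  # second case above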
@@ -492,3 +492,50 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DeformableDetrImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DeformableDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DeformableDetrImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
image_processor = DeformableDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
# Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DeformableDetrImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
@@ -486,3 +486,50 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DetaImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DetaImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DetaImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
image_processor = DetaImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
# Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DetaImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
@@ -547,3 +547,49 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DetrImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DetrImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
image_processor = DetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
# Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DetrImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
@@ -528,3 +528,50 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = GroundingDinoImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = GroundingDinoImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = GroundingDinoImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
image_processor = GroundingDinoImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
# Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = GroundingDinoImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
@@ -546,3 +546,50 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = YolosImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = YolosImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = YolosImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, pad_size=301x101, image=200x100 -> 301x101
image_processor = YolosImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
# Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = YolosImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
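For the batch case without a fixed pad_size: per the pad_size docstring above, images are padded to the largest height and width in the batch. A minimal sketch with illustrative sizes, not taken from these tests:

import torch
from transformers import DetrImageProcessor

image_processor = DetrImageProcessor(
    size={"max_height": 150, "max_width": 150},
    do_pad=True,  # no pad_size given: pad to the per-batch maximum
)

images = [
    torch.ones([100, 150, 3], dtype=torch.uint8),  # already fits: stays 100x150
    torch.ones([150, 100, 3], dtype=torch.uint8),  # already fits: stays 150x100
]
inputs = image_processor(images=images, return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([2, 3, 150, 150])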