Unverified Commit 6bca43bb authored by amyeroberts, committed by GitHub

Input data format (#25464)

* Add copied from statements for image processors

* Move out rescale and normalize to base image processor

* Remove rescale and normalize from vit (post rebase)

* Update docstrings and tidy up

* PR comments

* Add input_data_format as preprocess argument

* Resolve tests and tidy up

* Remove num_channels argument

* Update doc strings -> default ints not in code formatting
parent a6609caf
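The change threads a new `input_data_format` argument through `preprocess` and the per-transform methods, so callers can state the channel layout of their raw arrays instead of relying on `infer_channel_dimension_format`. A minimal usage sketch (not part of the diff; it assumes a default-constructed `ViTImageProcessor` built from this branch):

import numpy as np
from transformers import ViTImageProcessor

# Default-constructed processor: resize to 224x224, rescale, normalize, channels-first output.
processor = ViTImageProcessor()

# A (height, width, channels) array; declaring the layout up front avoids format inference,
# which matters for ambiguous shapes such as single- or four-channel inputs.
image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)

inputs = processor(image, input_data_format="channels_last", return_tensors="np")
print(inputs["pixel_values"].shape)  # (1, 3, 224, 224)

The same keyword is accepted by the other processors touched below (ViT Hybrid, ViViT, YOLOS) and by their individual `resize`/`rescale`/`normalize`/`pad` methods.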
@@ -26,6 +26,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
+    infer_channel_dimension_format,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -99,6 +100,7 @@ class ViTImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
@@ -116,6 +118,13 @@ class ViTImageProcessor(BaseImageProcessor):
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
@@ -124,7 +133,14 @@ class ViTImageProcessor(BaseImageProcessor):
        if "height" not in size or "width" not in size:
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        output_size = (size["height"], size["width"])
-        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )

    def preprocess(
        self,
@@ -139,6 +155,7 @@ class ViTImageProcessor(BaseImageProcessor):
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
@@ -177,6 +194,12 @@ class ViTImageProcessor(BaseImageProcessor):
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
@@ -206,16 +229,31 @@ class ViTImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
        if do_resize:
-            images = [self.resize(image=image, size=size_dict, resample=resample) for image in images]
+            images = [
+                self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_rescale:
-            images = [self.rescale(image=image, scale=rescale_factor) for image in images]
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_normalize:
-            images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]

-        images = [to_channel_dimension_format(image, data_format) for image in images]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]

        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -31,6 +31,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
+    infer_channel_dimension_format,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -125,6 +126,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
@@ -140,12 +142,23 @@ class ViTHybridImageProcessor(BaseImageProcessor):
                Resampling filter to use when resiizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        size = get_size_dict(size, default_to_square=False)
        if "shortest_edge" not in size:
            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
-        output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False)
-        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
+        output_size = get_resize_output_image_size(
+            image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )

    def preprocess(
        self,
@@ -163,6 +176,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> PIL.Image.Image:
        """
@@ -208,6 +222,12 @@ class ViTHybridImageProcessor(BaseImageProcessor):
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: defaults to the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
@@ -250,19 +270,36 @@ class ViTHybridImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
        if do_resize:
-            images = [self.resize(image=image, size=size, resample=resample) for image in images]
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_center_crop:
-            images = [self.center_crop(image=image, size=crop_size) for image in images]
+            images = [
+                self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
+            ]

        if do_rescale:
-            images = [self.rescale(image=image, scale=rescale_factor) for image in images]
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_normalize:
-            images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]

-        images = [to_channel_dimension_format(image, data_format) for image in images]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]

        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -33,6 +33,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
+    infer_channel_dimension_format,
    is_valid_image,
    to_numpy_array,
    valid_images,
@@ -141,6 +142,7 @@ class VivitImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
@@ -157,15 +159,26 @@ class VivitImageProcessor(BaseImageProcessor):
                Resampling filter to use when resiizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        size = get_size_dict(size, default_to_square=False)
        if "shortest_edge" in size:
-            output_size = get_resize_output_image_size(image, size["shortest_edge"], default_to_square=False)
+            output_size = get_resize_output_image_size(
+                image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+            )
        elif "height" in size and "width" in size:
            output_size = (size["height"], size["width"])
        else:
            raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
-        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )

    # Copied from transformers.models.efficientnet.image_processing_efficientnet.EfficientNetImageProcessor.rescale
    def rescale(
@@ -174,6 +187,7 @@ class VivitImageProcessor(BaseImageProcessor):
        scale: Union[int, float],
        offset: bool = True,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
@@ -195,8 +209,12 @@ class VivitImageProcessor(BaseImageProcessor):
                Whether to scale the image in both negative and positive directions.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
-        rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)
+        rescaled_image = rescale(
+            image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )

        if offset:
            rescaled_image = rescaled_image - 1
@@ -218,6 +236,7 @@ class VivitImageProcessor(BaseImageProcessor):
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """Preprocesses a single image."""
        if do_resize and size is None or resample is None:
@@ -238,19 +257,22 @@ class VivitImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)

+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
        if do_resize:
-            image = self.resize(image=image, size=size, resample=resample)
+            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)

        if do_center_crop:
-            image = self.center_crop(image, size=crop_size)
+            image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)

        if do_rescale:
-            image = self.rescale(image=image, scale=rescale_factor, offset=offset)
+            image = self.rescale(image=image, scale=rescale_factor, offset=offset, input_data_format=input_data_format)

        if do_normalize:
-            image = self.normalize(image=image, mean=image_mean, std=image_std)
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)

-        image = to_channel_dimension_format(image, data_format)
+        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
        return image

    def preprocess(
@@ -269,6 +291,7 @@ class VivitImageProcessor(BaseImageProcessor):
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> PIL.Image.Image:
        """
@@ -312,6 +335,12 @@ class VivitImageProcessor(BaseImageProcessor):
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the inferred channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        resample = resample if resample is not None else self.resample
@@ -352,6 +381,7 @@ class VivitImageProcessor(BaseImageProcessor):
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
+                input_data_format=input_data_format,
            )
            for img in video
        ]
...
@@ -88,18 +88,21 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)

# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
-def get_max_height_width(images: List[np.ndarray]) -> List[int]:
+def get_max_height_width(
+    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> List[int]:
    """
    Get the maximum height and width across all images in a batch.
    """
-    input_channel_dimension = infer_channel_dimension_format(images[0])
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images[0])

-    if input_channel_dimension == ChannelDimension.FIRST:
+    if input_data_format == ChannelDimension.FIRST:
        _, max_height, max_width = max_across_indices([img.shape for img in images])
-    elif input_channel_dimension == ChannelDimension.LAST:
+    elif input_data_format == ChannelDimension.LAST:
        max_height, max_width, _ = max_across_indices([img.shape for img in images])
    else:
-        raise ValueError(f"Invalid channel dimension format: {input_channel_dimension}")
+        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
    return (max_height, max_width)
@@ -137,7 +140,10 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:

# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
-    input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], max_size: Optional[int] = None
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int]],
+    max_size: Optional[int] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Computes the output image size given the input image size and the desired output size. If the desired output size
@@ -151,8 +157,10 @@ def get_resize_output_image_size(
            The desired output size.
        max_size (`int`, *optional*):
            The maximum allowed output size.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
    """
-    image_size = get_image_size(input_image)
+    image_size = get_image_size(input_image, input_data_format)
    if isinstance(size, (list, tuple)):
        return size
@@ -222,7 +230,9 @@ def max_across_indices(values: Iterable[Any]) -> List[Any]:

# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
-def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
+def make_pixel_mask(
+    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
@@ -232,7 +242,7 @@ def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
        output_size (`Tuple[int, int]`):
            Output size of the mask.
    """
-    input_height, input_width = get_image_size(image)
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
    mask = np.zeros(output_size, dtype=np.int64)
    mask[:input_height, :input_width] = 1
    return mask
@@ -274,11 +284,16 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:

# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation
-def prepare_coco_detection_annotation(image, target, return_segmentation_masks: bool = False):
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
    """
    Convert the target in COCO format into the format expected by DETR.
    """
-    image_height, image_width = get_image_size(image)
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)

    image_id = target["image_id"]
    image_id = np.asarray([image_id], dtype=np.int64)
@@ -363,12 +378,16 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray:

# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->YOLOS
def prepare_coco_panoptic_annotation(
-    image: np.ndarray, target: Dict, masks_path: Union[str, pathlib.Path], return_masks: bool = True
+    image: np.ndarray,
+    target: Dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
    """
    Prepare a coco panoptic annotation for YOLOS.
    """
-    image_height, image_width = get_image_size(image)
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)

    annotation_path = pathlib.Path(masks_path) / target["file_name"]

    new_target = {}
@@ -751,6 +770,7 @@ class YolosImageProcessor(BaseImageProcessor):
        format: Optional[AnnotionFormat] = None,
        return_segmentation_masks: bool = None,
        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> Dict:
        """
        Prepare an annotation for feeding into DETR model.
@@ -759,11 +779,17 @@ class YolosImageProcessor(BaseImageProcessor):
        if format == AnnotionFormat.COCO_DETECTION:
            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
-            target = prepare_coco_detection_annotation(image, target, return_segmentation_masks)
+            target = prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
        elif format == AnnotionFormat.COCO_PANOPTIC:
            return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
            target = prepare_coco_panoptic_annotation(
-                image, target, masks_path=masks_path, return_masks=return_segmentation_masks
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
            )
        else:
            raise ValueError(f"Format {format} is not supported.")
@@ -801,11 +827,26 @@ class YolosImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
        int, smaller edge of the image will be matched to this number.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
+                `height` and `width`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        if "max_size" in kwargs:
            logger.warning_once(
@@ -817,7 +858,9 @@ class YolosImageProcessor(BaseImageProcessor):
            max_size = None
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        if "shortest_edge" in size and "longest_edge" in size:
-            size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"])
+            size = get_resize_output_image_size(
+                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+            )
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
@@ -825,7 +868,9 @@ class YolosImageProcessor(BaseImageProcessor):
                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                f" {size.keys()}."
            )
-        image = resize(image, size=size, resample=resample, data_format=data_format)
+        image = resize(
+            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
        return image

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
@@ -844,7 +889,11 @@ class YolosImageProcessor(BaseImageProcessor):

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
    def rescale(
-        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
+        self,
+        image: np.ndarray,
+        rescale_factor: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor. image = image * rescale_factor.
@@ -859,8 +908,13 @@ class YolosImageProcessor(BaseImageProcessor):
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+                one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        """
-        return rescale(image, rescale_factor, data_format=data_format)
+        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
@@ -877,28 +931,36 @@ class YolosImageProcessor(BaseImageProcessor):
        output_size: Tuple[int, int],
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
        """
-        input_height, input_width = get_image_size(image)
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        output_height, output_width = output_size

        pad_bottom = output_height - input_height
        pad_right = output_width - input_width
        padding = ((0, pad_bottom), (0, pad_right))
        padded_image = pad(
-            image, padding, mode=PaddingMode.CONSTANT, constant_values=constant_values, data_format=data_format
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
        )
        return padded_image

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        constant_values: Union[float, Iterable[float]] = 0,
-        return_pixel_mask: bool = False,
+        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
@@ -920,17 +982,28 @@ class YolosImageProcessor(BaseImageProcessor):
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
-        pad_size = get_max_height_width(images)
+        pad_size = get_max_height_width(images, input_data_format=input_data_format)

        padded_images = [
-            self._pad_image(image, pad_size, constant_values=constant_values, data_format=data_format)
+            self._pad_image(
+                image,
+                pad_size,
+                constant_values=constant_values,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
            for image in images
        ]
        data = {"pixel_values": padded_images}

        if return_pixel_mask:
-            masks = [make_pixel_mask(image=image, output_size=pad_size) for image in images]
+            masks = [
+                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
+                for image in images
+            ]
            data["pixel_mask"] = masks

        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -953,6 +1026,7 @@ class YolosImageProcessor(BaseImageProcessor):
        format: Optional[Union[str, AnnotionFormat]] = None,
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
@@ -1000,6 +1074,12 @@ class YolosImageProcessor(BaseImageProcessor):
                Type of tensors to return. If `None`, will return the list of images.
            data_format (`str` or `ChannelDimension`, *optional*, defaults to self.data_format):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        if "pad_and_return_pixel_mask" in kwargs:
            logger.warning_once(
@@ -1084,13 +1164,22 @@ class YolosImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays
        images = [to_numpy_array(image) for image in images]

+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
        if annotations is not None:
            prepared_images = []
            prepared_annotations = []
            for image, target in zip(images, annotations):
                target = self.prepare_annotation(
-                    image, target, format, return_segmentation_masks=return_segmentation_masks, masks_path=masks_path
+                    image,
+                    target,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=input_data_format,
                )
                prepared_images.append(image)
                prepared_annotations.append(target)
@@ -1103,22 +1192,31 @@ class YolosImageProcessor(BaseImageProcessor):
            if annotations is not None:
                resized_images, resized_annotations = [], []
                for image, target in zip(images, annotations):
-                    orig_size = get_image_size(image)
-                    resized_image = self.resize(image, size=size, max_size=max_size, resample=resample)
-                    resized_annotation = self.resize_annotation(target, orig_size, get_image_size(resized_image))
+                    orig_size = get_image_size(image, input_data_format)
+                    resized_image = self.resize(
+                        image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
+                    )
+                    resized_annotation = self.resize_annotation(
+                        target, orig_size, get_image_size(resized_image, input_data_format)
+                    )
                    resized_images.append(resized_image)
                    resized_annotations.append(resized_annotation)
                images = resized_images
                annotations = resized_annotations
                del resized_images, resized_annotations
            else:
-                images = [self.resize(image, size=size, resample=resample) for image in images]
+                images = [
+                    self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]

        if do_rescale:
-            images = [self.rescale(image, rescale_factor) for image in images]
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]

        if do_normalize:
-            images = [self.normalize(image, image_mean, image_std) for image in images]
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
            if annotations is not None:
                annotations = [
                    self.normalize_annotation(annotation, get_image_size(image))
@@ -1126,9 +1224,12 @@ class YolosImageProcessor(BaseImageProcessor):
                ]

        if do_pad:
-            data = self.pad(images, data_format=data_format)
+            data = self.pad(images, data_format=data_format, input_data_format=input_data_format)
        else:
-            images = [to_channel_dimension_format(image, data_format) for image in images]
+            images = [
+                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                for image in images
+            ]
            data = {"pixel_values": images}

        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
...
@@ -70,7 +70,7 @@ class BlipImageProcessingTester(unittest.TestCase):
        }

    def expected_output_image_shape(self, images):
-        return 3, self.size["height"], self.size["width"]
+        return self.num_channels, self.size["height"], self.size["width"]

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        return prepare_image_inputs(
@@ -135,3 +135,11 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    @unittest.skip("BlipImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        return super().test_call_torch()
+
+    @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_pil(self):
+        pass
+
+    @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
@@ -17,7 +17,7 @@
import unittest

from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_vision_available
+from transformers.utils import is_torch_available, is_vision_available

from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -26,6 +26,10 @@ if is_vision_available():
    from transformers import ChineseCLIPImageProcessor


+if is_torch_available():
+    pass
+
+
class ChineseCLIPImageProcessingTester(unittest.TestCase):
    def __init__(
        self,
@@ -120,6 +124,10 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        self.assertEqual(image_processor.size, {"shortest_edge": 42})
        self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})

+    @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
+

@require_torch
@require_vision
@@ -152,3 +160,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    @unittest.skip("ChineseCLIPImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        return super().test_call_torch()
+
+    @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
@@ -337,6 +337,11 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    def test_call_numpy(self):
        self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True})

+    def test_call_numpy_4_channels(self):
+        self.image_processing_class.num_channels = 4
+        self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True})
+        self.image_processing_class.num_channels = 3
+
    def test_call_pytorch(self):
        self._test_call_framework(torch.Tensor, prepare_kwargs={"torchify": True})
...
@@ -144,3 +144,18 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
+
+    def test_call_numpy_4_channels(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random numpy tensors
+        self.image_processing_class.num_channels = 4
+        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+        for image in image_inputs:
+            self.assertIsInstance(image, np.ndarray)
+
+        # Test not batched input (GLPNImageProcessor doesn't support batching)
+        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+        self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
+        self.image_processing_class.num_channels = 3
@@ -198,6 +198,10 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )

+    @unittest.skip("ImageGPT assumes clusters for 3 channels")
+    def test_call_numpy_4_channels(self):
+        pass
+
    # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input
    def test_call_pytorch(self):
        # Initialize image_processing
...
@@ -222,6 +222,40 @@ class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim),
            )

+    def test_call_numpy_4_channels(self):
+        # Initialize image_processor
+        image_processor = self.image_processing_class(**self.image_processor_dict)
+        # create random numpy tensors
+        self.image_processor_tester.num_channels = 4
+        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+        for image in image_inputs:
+            self.assertIsInstance(image, np.ndarray)
+
+        expected_hidden_dim = (
+            (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"])
+            * self.image_processor_tester.num_channels
+        ) + 2
+
+        for max_patch in self.image_processor_tester.max_patches:
+            # Test not batched input
+            encoded_images = image_processor(
+                image_inputs[0], return_tensors="pt", max_patches=max_patch, input_data_format="channels_first"
+            ).flattened_patches
+            self.assertEqual(
+                encoded_images.shape,
+                (1, max_patch, expected_hidden_dim),
+            )
+
+            # Test batched
+            encoded_images = image_processor(
+                image_inputs, return_tensors="pt", max_patches=max_patch, input_data_format="channels_first"
+            ).flattened_patches
+            self.assertEqual(
+                encoded_images.shape,
+                (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim),
+            )
+        self.image_processor_tester.num_channels = 3
+
    def test_call_pytorch(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
@@ -318,3 +352,7 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    @unittest.skip("Pix2StructImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        return super().test_call_torch()
+
+    @unittest.skip("Pix2StructImageProcessor does treat numpy and PIL 4 channel images consistently")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        return super().test_call_torch()
...@@ -147,6 +147,24 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): ...@@ -147,6 +147,24 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

    # Swin2SRImageProcessor does not support batched input
    def test_call_numpy_4_channels(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)

        # Test not batched input
        encoded_images = image_processing(
            image_inputs[0], return_tensors="pt", input_data_format="channels_first"
        ).pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
        self.image_processor_tester.num_channels = 3
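The explicit input_data_format="channels_first" in these 4-channel tests is the behaviour this PR adds: for 1- or 3-channel inputs the layout can normally be inferred, but a 4-channel array is ambiguous, so the caller states it. A rough sketch of the idea, assuming infer_channel_dimension_format keeps its default expected channel counts; the array size is illustrative:

import numpy as np

from transformers.image_utils import ChannelDimension, infer_channel_dimension_format

image = np.random.randint(0, 256, size=(4, 32, 32), dtype=np.uint8)  # (num_channels, height, width)

try:
    fmt = infer_channel_dimension_format(image)
except ValueError:
    # Neither axis matches the channel counts inference expects by default, so it cannot
    # decide between channels-first and channels-last; the caller supplies the layout instead.
    fmt = ChannelDimension.FIRST  # what input_data_format="channels_first" expresses to preprocess()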

    # Swin2SRImageProcessor does not support batched input
    def test_call_pytorch(self):
        # Initialize image_processing
...
...@@ -217,6 +217,47 @@ class TvltImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
            ),
        )

    def test_call_numpy_4_channels(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], np.ndarray)

        # Test not batched input
        encoded_videos = image_processor(
            video_inputs[0], return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
        ).pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                1,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )

        # Test batched
        encoded_videos = image_processor(
            video_inputs, return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
        ).pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                self.image_processor_tester.batch_size,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )
        self.image_processor_tester.num_channels = 3

    def test_call_pytorch(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
...
...@@ -165,6 +165,33 @@ class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )

    def test_call_numpy_4_channels(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], np.ndarray)

        # Test not batched input
        encoded_videos = image_processing(
            video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]])
        self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))

        # Test batched
        encoded_videos = image_processing(
            video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos)
        self.assertEqual(
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )
        self.image_processor_tester.num_channels = 3
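The video processors follow the same pattern: a video is a list of channels-first frames, and pixel_values come back batched as (batch_size, num_frames, num_channels, height, width). A small usage sketch, assuming VideoMAEImageProcessor with its default configuration; the frame count and resolution are made-up values:

import numpy as np

from transformers import VideoMAEImageProcessor

processor = VideoMAEImageProcessor()  # default resize/crop settings assumed

num_frames, num_channels, height, width = 8, 4, 64, 64  # illustrative values
video = [
    np.random.randint(0, 256, size=(num_channels, height, width), dtype=np.uint8)
    for _ in range(num_frames)
]

# input_data_format resolves the 4-channel ambiguity; scalar image_mean/image_std replace
# the default 3-value normalization constants, which do not match a 4-channel image.
pixel_values = processor(
    video, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
print(pixel_values.shape)  # expected: (1, num_frames, num_channels, crop_height, crop_width)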

    def test_call_pytorch(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
...
...@@ -179,6 +179,33 @@ class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )

    def test_call_numpy_4_channels(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], np.ndarray)

        # Test not batched input
        encoded_videos = image_processing(
            video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]])
        self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))

        # Test batched
        encoded_videos = image_processing(
            video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos)
        self.assertEqual(
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )
        self.image_processor_tester.num_channels = 3

    def test_call_pytorch(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
...
...@@ -252,3 +252,36 @@ class ImageProcessingTestMixin:
            tuple(encoded_images.shape),
            (self.image_processor_tester.batch_size, *expected_output_image_shape),
        )

    def test_call_numpy_4_channels(self):
        # Test that images with an arbitrary number of channels can be processed
        # Initialize image_processing
        image_processor = self.image_processing_class(**self.image_processor_dict)

        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)

        # Test not batched input
        encoded_images = image_processor(
            image_inputs[0],
            return_tensors="pt",
            input_data_format="channels_first",
            image_mean=0,
            image_std=1,
        ).pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

        # Test batched
        encoded_images = image_processor(
            image_inputs,
            return_tensors="pt",
            input_data_format="channels_first",
            image_mean=0,
            image_std=1,
        ).pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )
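Outside the test harness, the same call pattern applies to processors that now accept input_data_format at preprocess time. A minimal sketch, assuming ViTImageProcessor with its defaults; the 4-channel image and its size are illustrative:

import numpy as np

from transformers import ViTImageProcessor

processor = ViTImageProcessor()
image = np.random.randint(0, 256, size=(4, 32, 32), dtype=np.uint8)  # 4-channel, channels-first

# The explicit layout replaces channel-dimension inference (ambiguous for 4 channels), and
# scalar image_mean/image_std stand in for the default 3-value normalization constants.
pixel_values = processor(
    image,
    return_tensors="pt",
    input_data_format="channels_first",
    image_mean=0,
    image_std=1,
).pixel_values
print(pixel_values.shape)  # e.g. torch.Size([1, 4, 224, 224]) with the default 224x224 size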