Unverified Commit 6bca43bb authored by amyeroberts's avatar amyeroberts Committed by GitHub
Browse files

Input data format (#25464)

* Add copied from statements for image processors

* Move out rescale and normalize to base image processor

* Remove rescale and normalize from vit (post rebase)

* Update docstrings and tidy up

* PR comments

* Add input_data_format as preprocess argument

* Resolve tests and tidy up

* Remove num_channels argument

* Update doc strings -> default ints not in code formatting
parent a6609caf
......@@ -26,6 +26,7 @@ from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
......@@ -99,6 +100,7 @@ class ViTImageProcessor(BaseImageProcessor):
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
......@@ -116,6 +118,13 @@ class ViTImageProcessor(BaseImageProcessor):
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
Returns:
`np.ndarray`: The resized image.
......@@ -124,7 +133,14 @@ class ViTImageProcessor(BaseImageProcessor):
if "height" not in size or "width" not in size:
raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
output_size = (size["height"], size["width"])
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
......@@ -139,6 +155,7 @@ class ViTImageProcessor(BaseImageProcessor):
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
......@@ -177,6 +194,12 @@ class ViTImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
......@@ -206,16 +229,31 @@ class ViTImageProcessor(BaseImageProcessor):
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
if do_resize:
images = [self.resize(image=image, size=size_dict, resample=resample) for image in images]
images = [
self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
for image in images
]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
images = [
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
if do_normalize:
images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
images = [
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
images = [to_channel_dimension_format(image, data_format) for image in images]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
......@@ -31,6 +31,7 @@ from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
......@@ -125,6 +126,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BICUBIC,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
......@@ -140,12 +142,23 @@ class ViTHybridImageProcessor(BaseImageProcessor):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" not in size:
raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False)
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
output_size = get_resize_output_image_size(
image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
)
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
......@@ -163,6 +176,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
do_convert_rgb: bool = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> PIL.Image.Image:
"""
......@@ -208,6 +222,12 @@ class ViTHybridImageProcessor(BaseImageProcessor):
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: defaults to the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
......@@ -250,19 +270,36 @@ class ViTHybridImageProcessor(BaseImageProcessor):
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
if do_resize:
images = [self.resize(image=image, size=size, resample=resample) for image in images]
images = [
self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
for image in images
]
if do_center_crop:
images = [self.center_crop(image=image, size=crop_size) for image in images]
images = [
self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
]
if do_rescale:
images = [self.rescale(image=image, scale=rescale_factor) for image in images]
images = [
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
if do_normalize:
images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
images = [
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
images = [to_channel_dimension_format(image, data_format) for image in images]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
......@@ -33,6 +33,7 @@ from ...image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
infer_channel_dimension_format,
is_valid_image,
to_numpy_array,
valid_images,
......@@ -141,6 +142,7 @@ class VivitImageProcessor(BaseImageProcessor):
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
......@@ -157,15 +159,26 @@ class VivitImageProcessor(BaseImageProcessor):
Resampling filter to use when resiizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size:
output_size = get_resize_output_image_size(image, size["shortest_edge"], default_to_square=False)
output_size = get_resize_output_image_size(
image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
)
elif "height" in size and "width" in size:
output_size = (size["height"], size["width"])
else:
raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
# Copied from transformers.models.efficientnet.image_processing_efficientnet.EfficientNetImageProcessor.rescale
def rescale(
......@@ -174,6 +187,7 @@ class VivitImageProcessor(BaseImageProcessor):
scale: Union[int, float],
offset: bool = True,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
......@@ -195,8 +209,12 @@ class VivitImageProcessor(BaseImageProcessor):
Whether to scale the image in both negative and positive directions.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)
rescaled_image = rescale(
image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs
)
if offset:
rescaled_image = rescaled_image - 1
......@@ -218,6 +236,7 @@ class VivitImageProcessor(BaseImageProcessor):
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""Preprocesses a single image."""
if do_resize and size is None or resample is None:
......@@ -238,19 +257,22 @@ class VivitImageProcessor(BaseImageProcessor):
# All transformations expect numpy arrays.
image = to_numpy_array(image)
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
if do_resize:
image = self.resize(image=image, size=size, resample=resample)
image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
if do_center_crop:
image = self.center_crop(image, size=crop_size)
image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor, offset=offset)
image = self.rescale(image=image, scale=rescale_factor, offset=offset, input_data_format=input_data_format)
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std)
image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
image = to_channel_dimension_format(image, data_format)
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
return image
def preprocess(
......@@ -269,6 +291,7 @@ class VivitImageProcessor(BaseImageProcessor):
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> PIL.Image.Image:
"""
......@@ -312,6 +335,12 @@ class VivitImageProcessor(BaseImageProcessor):
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the inferred channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
......@@ -352,6 +381,7 @@ class VivitImageProcessor(BaseImageProcessor):
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
input_data_format=input_data_format,
)
for img in video
]
......
......@@ -88,18 +88,21 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.CO
# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(images: List[np.ndarray]) -> List[int]:
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
input_channel_dimension = infer_channel_dimension_format(images[0])
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_channel_dimension == ChannelDimension.FIRST:
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_channel_dimension == ChannelDimension.LAST:
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_channel_dimension}")
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
......@@ -137,7 +140,10 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], max_size: Optional[int] = None
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
max_size: Optional[int] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size. If the desired output size
......@@ -151,8 +157,10 @@ def get_resize_output_image_size(
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image)
image_size = get_image_size(input_image, input_data_format)
if isinstance(size, (list, tuple)):
return size
......@@ -222,7 +230,9 @@ def max_across_indices(values: Iterable[Any]) -> List[Any]:
# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
......@@ -232,7 +242,7 @@ def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarr
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image)
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
......@@ -274,11 +284,16 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndar
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation
def prepare_coco_detection_annotation(image, target, return_segmentation_masks: bool = False):
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by DETR.
"""
image_height, image_width = get_image_size(image)
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
......@@ -363,12 +378,16 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->YOLOS
def prepare_coco_panoptic_annotation(
image: np.ndarray, target: Dict, masks_path: Union[str, pathlib.Path], return_masks: bool = True
image: np.ndarray,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for YOLOS.
"""
image_height, image_width = get_image_size(image)
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
......@@ -751,6 +770,7 @@ class YolosImageProcessor(BaseImageProcessor):
format: Optional[AnnotionFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into DETR model.
......@@ -759,11 +779,17 @@ class YolosImageProcessor(BaseImageProcessor):
if format == AnnotionFormat.COCO_DETECTION:
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_detection_annotation(image, target, return_segmentation_masks)
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotionFormat.COCO_PANOPTIC:
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_panoptic_annotation(
image, target, masks_path=masks_path, return_masks=return_segmentation_masks
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
raise ValueError(f"Format {format} is not supported.")
......@@ -801,11 +827,26 @@ class YolosImageProcessor(BaseImageProcessor):
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
if "max_size" in kwargs:
logger.warning_once(
......@@ -817,7 +858,9 @@ class YolosImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"])
size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
......@@ -825,7 +868,9 @@ class YolosImageProcessor(BaseImageProcessor):
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(image, size=size, resample=resample, data_format=data_format)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
)
return image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
......@@ -844,7 +889,11 @@ class YolosImageProcessor(BaseImageProcessor):
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
def rescale(
self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
self,
image: np.ndarray,
rescale_factor: float,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Rescale the image by the given factor. image = image * rescale_factor.
......@@ -859,8 +908,13 @@ class YolosImageProcessor(BaseImageProcessor):
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, is inferred from the input image. Can be
one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
"""
return rescale(image, rescale_factor, data_format=data_format)
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
......@@ -877,28 +931,36 @@ class YolosImageProcessor(BaseImageProcessor):
output_size: Tuple[int, int],
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
input_height, input_width = get_image_size(image)
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
output_height, output_width = output_size
pad_bottom = output_height - input_height
pad_right = output_width - input_width
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image, padding, mode=PaddingMode.CONSTANT, constant_values=constant_values, data_format=data_format
image,
padding,
mode=PaddingMode.CONSTANT,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = False,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
......@@ -920,17 +982,28 @@ class YolosImageProcessor(BaseImageProcessor):
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
pad_size = get_max_height_width(images)
pad_size = get_max_height_width(images, input_data_format=input_data_format)
padded_images = [
self._pad_image(image, pad_size, constant_values=constant_values, data_format=data_format)
self._pad_image(
image,
pad_size,
constant_values=constant_values,
data_format=data_format,
input_data_format=input_data_format,
)
for image in images
]
data = {"pixel_values": padded_images}
if return_pixel_mask:
masks = [make_pixel_mask(image=image, output_size=pad_size) for image in images]
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
......@@ -953,6 +1026,7 @@ class YolosImageProcessor(BaseImageProcessor):
format: Optional[Union[str, AnnotionFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
"""
......@@ -1000,6 +1074,12 @@ class YolosImageProcessor(BaseImageProcessor):
Type of tensors to return. If `None`, will return the list of images.
data_format (`str` or `ChannelDimension`, *optional*, defaults to self.data_format):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
......@@ -1084,13 +1164,22 @@ class YolosImageProcessor(BaseImageProcessor):
# All transformations expect numpy arrays
images = [to_numpy_array(image) for image in images]
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
# prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
if annotations is not None:
prepared_images = []
prepared_annotations = []
for image, target in zip(images, annotations):
target = self.prepare_annotation(
image, target, format, return_segmentation_masks=return_segmentation_masks, masks_path=masks_path
image,
target,
format,
return_segmentation_masks=return_segmentation_masks,
masks_path=masks_path,
input_data_format=input_data_format,
)
prepared_images.append(image)
prepared_annotations.append(target)
......@@ -1103,22 +1192,31 @@ class YolosImageProcessor(BaseImageProcessor):
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
orig_size = get_image_size(image)
resized_image = self.resize(image, size=size, max_size=max_size, resample=resample)
resized_annotation = self.resize_annotation(target, orig_size, get_image_size(resized_image))
orig_size = get_image_size(image, input_data_format)
resized_image = self.resize(
image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
)
resized_annotation = self.resize_annotation(
target, orig_size, get_image_size(resized_image, input_data_format)
)
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
images = resized_images
annotations = resized_annotations
del resized_images, resized_annotations
else:
images = [self.resize(image, size=size, resample=resample) for image in images]
images = [
self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
for image in images
]
if do_rescale:
images = [self.rescale(image, rescale_factor) for image in images]
images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
if do_normalize:
images = [self.normalize(image, image_mean, image_std) for image in images]
images = [
self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image))
......@@ -1126,9 +1224,12 @@ class YolosImageProcessor(BaseImageProcessor):
]
if do_pad:
data = self.pad(images, data_format=data_format)
data = self.pad(images, data_format=data_format, input_data_format=input_data_format)
else:
images = [to_channel_dimension_format(image, data_format) for image in images]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
for image in images
]
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
......
......@@ -70,7 +70,7 @@ class BlipImageProcessingTester(unittest.TestCase):
}
def expected_output_image_shape(self, images):
return 3, self.size["height"], self.size["width"]
return self.num_channels, self.size["height"], self.size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
......@@ -135,3 +135,11 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.Tes
@unittest.skip("BlipImageProcessor does not support 4 channels yet") # FIXME Amy
def test_call_pytorch(self):
return super().test_call_torch()
@unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_pil(self):
pass
@unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):
pass
......@@ -17,7 +17,7 @@
import unittest
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
......@@ -26,6 +26,10 @@ if is_vision_available():
from transformers import ChineseCLIPImageProcessor
if is_torch_available():
pass
class ChineseCLIPImageProcessingTester(unittest.TestCase):
def __init__(
self,
......@@ -120,6 +124,10 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
self.assertEqual(image_processor.size, {"shortest_edge": 42})
self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
@unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):
pass
@require_torch
@require_vision
......@@ -152,3 +160,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt
@unittest.skip("ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy
def test_call_pytorch(self):
return super().test_call_torch()
@unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):
pass
......@@ -337,6 +337,11 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def test_call_numpy(self):
self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True})
def test_call_numpy_4_channels(self):
self.image_processing_class.num_channels = 4
self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True})
self.image_processing_class.num_channels = 3
def test_call_pytorch(self):
self._test_call_framework(torch.Tensor, prepare_kwargs={"torchify": True})
......
......@@ -144,3 +144,18 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
def test_call_numpy_4_channels(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
self.image_processing_class.num_channels = 4
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input (GLPNImageProcessor doesn't support batching)
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
self.image_processing_class.num_channels = 3
......@@ -198,6 +198,10 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
@unittest.skip("ImageGPT assumes clusters for 3 channels")
def test_call_numpy_4_channels(self):
pass
# Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input
def test_call_pytorch(self):
# Initialize image_processing
......
......@@ -222,6 +222,40 @@ class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase)
(self.image_processor_tester.batch_size, max_patch, expected_hidden_dim),
)
def test_call_numpy_4_channels(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
expected_hidden_dim = (
(self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"])
* self.image_processor_tester.num_channels
) + 2
for max_patch in self.image_processor_tester.max_patches:
# Test not batched input
encoded_images = image_processor(
image_inputs[0], return_tensors="pt", max_patches=max_patch, input_data_format="channels_first"
).flattened_patches
self.assertEqual(
encoded_images.shape,
(1, max_patch, expected_hidden_dim),
)
# Test batched
encoded_images = image_processor(
image_inputs, return_tensors="pt", max_patches=max_patch, input_data_format="channels_first"
).flattened_patches
self.assertEqual(
encoded_images.shape,
(self.image_processor_tester.batch_size, max_patch, expected_hidden_dim),
)
self.image_processor_tester.num_channels = 3
def test_call_pytorch(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.image_processor_dict)
......@@ -318,3 +352,7 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unitte
@unittest.skip("Pix2StructImageProcessor does not support 4 channels yet") # FIXME Amy
def test_call_pytorch(self):
return super().test_call_torch()
@unittest.skip("Pix2StructImageProcessor does treat numpy and PIL 4 channel images consistently") # FIXME Amy
def test_call_numpy_4_channels(self):
return super().test_call_torch()
......@@ -147,6 +147,24 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Swin2SRImageProcessor does not support batched input
def test_call_numpy_4_channels(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(
image_inputs[0], return_tensors="pt", input_data_format="channels_first"
).pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
self.image_processor_tester.num_channels = 3
# Swin2SRImageProcessor does not support batched input
def test_call_pytorch(self):
# Initialize image_processing
......
......@@ -217,6 +217,47 @@ class TvltImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
),
)
def test_call_numpy_4_channels(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], np.ndarray)
# Test not batched input
encoded_videos = image_processor(
video_inputs[0], return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
).pixel_values
self.assertEqual(
encoded_videos.shape,
(
1,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
# Test batched
encoded_videos = image_processor(
video_inputs, return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
).pixel_values
self.assertEqual(
encoded_videos.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_frames,
self.image_processor_tester.num_channels,
self.image_processor_tester.crop_size["height"],
self.image_processor_tester.crop_size["width"],
),
)
self.image_processor_tester.num_channels = 3
def test_call_pytorch(self):
# Initialize image_processor
image_processor = self.image_processing_class(**self.image_processor_dict)
......
......@@ -165,6 +165,33 @@ class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
)
def test_call_numpy_4_channels(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], np.ndarray)
# Test not batched input
encoded_videos = image_processing(
video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]])
self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))
# Test batched
encoded_videos = image_processing(
video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos)
self.assertEqual(
tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
)
self.image_processor_tester.num_channels = 3
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
......
......@@ -179,6 +179,33 @@ class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
)
def test_call_numpy_4_channels(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
for video in video_inputs:
self.assertIsInstance(video, list)
self.assertIsInstance(video[0], np.ndarray)
# Test not batched input
encoded_videos = image_processing(
video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]])
self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))
# Test batched
encoded_videos = image_processing(
video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos)
self.assertEqual(
tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
)
self.image_processor_tester.num_channels = 3
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
......
......@@ -252,3 +252,36 @@ class ImageProcessingTestMixin:
tuple(encoded_images.shape),
(self.image_processor_tester.batch_size, *expected_output_image_shape),
)
def test_call_numpy_4_channels(self):
# Test that can process images which have an arbitrary number of channels
# Initialize image_processing
image_processor = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
self.image_processor_tester.num_channels = 4
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
# Test not batched input
encoded_images = image_processor(
image_inputs[0],
return_tensors="pt",
input_data_format="channels_first",
image_mean=0,
image_std=1,
).pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
# Test batched
encoded_images = image_processor(
image_inputs,
return_tensors="pt",
input_data_format="channels_first",
image_mean=0,
image_std=1,
).pixel_values
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
self.assertEqual(
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment