Unverified commit e3490104 authored by amyeroberts, committed by GitHub

Add copied from for image processor methods (#25121)

* Add copied from statements for image processors

* Move out rescale and normalize to base image processor

* Remove rescale and normalize from vit (post rebase)

* Update docstrings and tidy up

* PR comments
parent 5b517e17
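For context, the "# Copied from ..." comments added throughout this commit are machine-checked markers: the repository's consistency tooling (e.g. `make fix-copies`, backed by `utils/check_copies.py`) keeps the marked method in sync with the referenced implementation, applying any name substitutions given after "with". A minimal sketch of the idea, using hypothetical class names rather than the real transformers classes:

# Hypothetical illustration of the "Copied from" convention (not library code).
# The comment names a canonical implementation; a consistency check can then
# verify the two bodies stay identical, applying the substitutions after "with".

class CanonicalProcessor:  # stand-in for the canonical source
    def rescale(self, image, rescale_factor):
        return image * rescale_factor


class DerivedProcessor:
    # Copied from CanonicalProcessor.rescale
    def rescale(self, image, rescale_factor):
        return image * rescale_factor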
@@ -107,23 +107,24 @@ class MobileViTImageProcessor(BaseImageProcessor):
         self.crop_size = crop_size
         self.do_flip_channel_order = do_flip_channel_order

+    # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize with PILImageResampling.BICUBIC->PILImageResampling.BILINEAR
     def resize(
         self,
         image: np.ndarray,
         size: Dict[str, int],
-        resample: PILImageResampling = PIL.Image.BILINEAR,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
         data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
     ) -> np.ndarray:
         """
-        Resize an image.
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio.

         Args:
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Controls the size of the output image. The shortest edge of the image will be resized to
-                `size["shortest_edge"]` while maintaining the aspect ratio.
+                Size of the output image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use when resiizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
@@ -131,7 +132,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
         """
         size = get_size_dict(size, default_to_square=False)
         if "shortest_edge" not in size:
-            raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}")
+            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
         output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False)
         return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
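The shortest-edge rule documented above can be shown standalone. This is a sketch of the sizing arithmetic only, not the library's `get_resize_output_image_size` helper:

def shortest_edge_output_size(height: int, width: int, shortest_edge: int) -> tuple:
    # Scale so the shorter side equals `shortest_edge` while preserving the aspect ratio.
    scale = shortest_edge / min(height, width)
    return round(height * scale), round(width * scale)

# e.g. a 480x640 image resized with size={"shortest_edge": 256} comes out roughly 256x341.
print(shortest_edge_output_size(480, 640, 256))  # (256, 341)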
@@ -251,6 +252,7 @@ class MobileViTImageProcessor(BaseImageProcessor):
         data = {"pixel_values": images}
         return BatchFeature(data=data, tensor_type=return_tensors)

+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->MobileViT
     def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
         """
         Converts the output of [`MobileViTForSemanticSegmentation`] into semantic segmentation maps. Only supports
@@ -259,15 +261,14 @@ class MobileViTImageProcessor(BaseImageProcessor):
         Args:
             outputs ([`MobileViTForSemanticSegmentation`]):
                 Raw outputs of the model.
-            target_sizes (`List[Tuple]`, *optional*):
-                A list of length `batch_size`, where each item is a `Tuple[int, int]` corresponding to the requested
-                final size (height, width) of each prediction. If left to None, predictions will not be resized.
+            target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
+                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
+                predictions will not be resized.

         Returns:
-            `List[torch.Tensor]`:
-                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
-                corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
-                `torch.Tensor` correspond to a semantic class id.
+            semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
+            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
+            specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
         """
         # TODO: add support for other frameworks
         logits = outputs.logits
...
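The post-processing described in that docstring amounts to an argmax over the class dimension of the logits, with an optional upsampling to each requested target size. A rough standalone sketch, assuming PyTorch and logits of shape (batch_size, num_labels, height, width); it is not the exact library implementation:

import torch

def naive_post_process_semantic_segmentation(logits, target_sizes=None):
    # logits: (batch_size, num_labels, height, width)
    maps = []
    for idx in range(logits.shape[0]):
        single = logits[idx].unsqueeze(0)
        if target_sizes is not None:
            # Upsample logits to the requested (height, width) before taking the argmax.
            single = torch.nn.functional.interpolate(
                single, size=target_sizes[idx], mode="bilinear", align_corners=False
            )
        maps.append(single.argmax(dim=1)[0])  # (height, width) map of semantic class ids
    return maps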
@@ -477,12 +477,23 @@ class OneFormerImageProcessor(BaseImageProcessor):
         image = resize(image, size=size, resample=resample, data_format=data_format)
         return image

-    # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.rescale
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self, image: np.ndarray, rescale_factor: float, data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
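The `image = image * rescale_factor` contract stated in the new docstring is simple enough to show end to end. A standalone sketch, not the library's `rescale` helper, converting uint8 pixels to the [0, 1] range:

import numpy as np

def naive_rescale(image: np.ndarray, rescale_factor: float) -> np.ndarray:
    # Multiply every pixel by the factor; the dtype is promoted to float.
    return image.astype(np.float32) * rescale_factor

pixels = np.array([[0, 127, 255]], dtype=np.uint8)
print(naive_rescale(pixels, 1 / 255))  # approximately [[0.  0.498  1.]]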
@@ -493,7 +504,6 @@ class OneFormerImageProcessor(BaseImageProcessor):
         instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
         ignore_index: Optional[int] = None,
         reduce_labels: bool = False,
-        **kwargs,
     ):
         reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels
         ignore_index = ignore_index if ignore_index is not None else self.ignore_index
@@ -730,7 +740,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -742,8 +752,13 @@ class OneFormerImageProcessor(BaseImageProcessor):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
...
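The corrected return annotation (`BatchFeature` instead of `np.ndarray`) reflects that `pad` hands back a dict-like batch keyed by `return_tensors`. Roughly, the documented behaviour boils down to collecting padded arrays (plus masks) and converting them to the requested framework; a simplified sketch under that assumption, not the actual `BatchFeature` implementation:

import numpy as np

def to_requested_tensors(data: dict, return_tensors=None):
    # Leave values as lists of np.ndarray when unset; otherwise stack per framework.
    if return_tensors is None:
        return data
    if return_tensors == "np":
        return {key: np.stack(value) for key, value in data.items()}
    if return_tensors == "pt":
        import torch
        return {key: torch.tensor(np.stack(value)) for key, value in data.items()}
    raise ValueError(f"Unsupported return_tensors value: {return_tensors}")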
@@ -196,17 +196,25 @@ class OwlViTImageProcessor(BaseImageProcessor):
         return center_crop(image, (crop_size["height"], crop_size["width"]), data_format=data_format, **kwargs)

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self,
-        image: np.ndarray,
-        rescale_factor: float,
-        data_format: Optional[Union[str, ChannelDimension]] = None,
-        **kwargs,
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale an image by a certain factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
-        return rescale(image, rescale_factor, data_format=data_format, **kwargs)
+        return rescale(image, rescale_factor, data_format=data_format)

     def preprocess(
         self,
...
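Several of the expanded docstrings describe the two channel layouts selectable through data_format. The conversion between them is just a transpose; a small standalone sketch, not the library's ChannelDimension utilities:

import numpy as np

def to_channels_first(image: np.ndarray) -> np.ndarray:
    # (height, width, num_channels) -> (num_channels, height, width)
    return image.transpose(2, 0, 1)

def to_channels_last(image: np.ndarray) -> np.ndarray:
    # (num_channels, height, width) -> (height, width, num_channels)
    return image.transpose(1, 2, 0)

hwc = np.zeros((224, 224, 3))
print(to_channels_first(hwc).shape)  # (3, 224, 224)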
@@ -146,11 +146,12 @@ class PerceiverImageProcessor(BaseImageProcessor):
         cropped_width = (size["width"] / crop_size["width"]) * min_dim
         return center_crop(image, size=(cropped_height, cropped_width), data_format=data_format, **kwargs)

+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
     def resize(
         self,
         image: np.ndarray,
         size: Dict[str, int],
-        resample: PILImageResampling = PIL.Image.BICUBIC,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
     ) -> np.ndarray:
@@ -161,18 +162,23 @@ class PerceiverImageProcessor(BaseImageProcessor):
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
-            resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BICUBIC`):
-                Resampling filter to use when resizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
-            raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)

     def preprocess(
         self,
...
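Unlike the shortest-edge variant earlier in the diff, the ViT-derived `resize` targets an exact `{"height": ..., "width": ...}` pair, so the aspect ratio is not necessarily preserved. A quick usage sketch of that contract using PIL directly, assuming only that the target dict carries both keys; the actual method goes through the library's `resize` helper instead:

import numpy as np
from PIL import Image

size = {"height": 224, "width": 224}
if "height" not in size or "width" not in size:
    raise ValueError("size must contain 'height' and 'width'")

image = np.zeros((480, 640, 3), dtype=np.uint8)
# PIL expects (width, height); the output matches the requested dict exactly.
resized = np.array(Image.fromarray(image).resize((size["width"], size["height"])))
print(resized.shape)  # (224, 224, 3) -- aspect ratio is not preserved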
@@ -93,6 +93,7 @@ class PvtImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD

+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
@@ -109,7 +110,7 @@ class PvtImageProcessor(BaseImageProcessor):
                 Image to resize.
             size (`Dict[str, int]`):
                 Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
-            resample:
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
             data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format for the output image. If unset, the channel dimension format of the input
@@ -123,9 +124,8 @@ class PvtImageProcessor(BaseImageProcessor):
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
             raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)

     def preprocess(
         self,
...
@@ -128,6 +128,7 @@ class SegformerImageProcessor(BaseImageProcessor):
             image_processor_dict["reduce_labels"] = kwargs.pop("reduce_labels")
         return super().from_dict(image_processor_dict, **kwargs)

+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
@@ -143,19 +144,25 @@ class SegformerImageProcessor(BaseImageProcessor):
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
-            resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BILINEAR`):
-                Resampling filter to use when resiizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
             raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)

+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.reduce_label
     def reduce_label(self, label: ImageInput) -> np.ndarray:
         label = to_numpy_array(label)
         # Avoid using underflow conversion
@@ -387,6 +394,7 @@ class SegformerImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)

+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->Segformer
     def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
         """
         Converts the output of [`SegformerForSemanticSegmentation`] into semantic segmentation maps. Only supports
@@ -396,8 +404,9 @@ class SegformerImageProcessor(BaseImageProcessor):
             outputs ([`SegformerForSemanticSegmentation`]):
                 Raw outputs of the model.
             target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
-                List of tuples corresponding to the requested final size (height, width) of each prediction. If left to
-                None, predictions will not be resized.
+                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
+                predictions will not be resized.

         Returns:
             semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
             segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
...
@@ -229,6 +229,7 @@ class ViltImageProcessor(BaseImageProcessor):
         output_size = get_resize_output_image_size(image, shorter=shorter, longer=longer, size_divisor=size_divisor)
         return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
     def _pad_image(
         self,
         image: np.ndarray,
@@ -250,22 +251,26 @@ class ViltImageProcessor(BaseImageProcessor):
         )
         return padded_image

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
     def pad(
         self,
         images: List[np.ndarray],
+        constant_values: Union[float, Iterable[float]] = 0,
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
     ) -> BatchFeature:
         """
-        Pads a batch of images with zeros to the size of largest height and width in the batch and optionally returns
-        their corresponding pixel mask.
+        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
+        in the batch and optionally returns their corresponding pixel mask.

         Args:
-            images (`List[np.ndarray]`):
-                Batch of images to pad.
+            image (`np.ndarray`):
+                Image to pad.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
-                Whether to return the pixel mask.
+                Whether to return a pixel mask.
             return_tensors (`str` or `TensorType`, *optional*):
                 The type of tensors to return. Can be one of:
                     - Unset: Return a list of `np.ndarray`.
@@ -277,10 +282,13 @@ class ViltImageProcessor(BaseImageProcessor):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
         pad_size = get_max_height_width(images)

         padded_images = [
-            self._pad_image(image=image, output_size=pad_size, data_format=data_format) for image in images
+            self._pad_image(image, pad_size, constant_values=constant_values, data_format=data_format)
+            for image in images
         ]
         data = {"pixel_values": padded_images}

         if return_pixel_mask:
             masks = [make_pixel_mask(image=image, output_size=pad_size) for image in images]
             data["pixel_mask"] = masks
...
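The pad/pixel-mask behaviour shared by the ViLT, DETR and YOLOS processors can be summarised as: grow every image to the batch's maximum height and width by zero-padding on the bottom and right, and mark the valid region with ones. A standalone sketch of that logic, not the library's `_pad_image`/`make_pixel_mask` helpers:

import numpy as np

def pad_batch(images):
    # images: list of (height, width, num_channels) arrays
    max_h = max(img.shape[0] for img in images)
    max_w = max(img.shape[1] for img in images)
    padded, masks = [], []
    for img in images:
        h, w = img.shape[:2]
        out = np.zeros((max_h, max_w, img.shape[2]), dtype=img.dtype)
        out[:h, :w] = img  # original pixels stay in the top-left corner
        mask = np.zeros((max_h, max_w), dtype=np.int64)
        mask[:h, :w] = 1  # 1 marks real pixels, 0 marks padding
        padded.append(out)
        masks.append(mask)
    return {"pixel_values": padded, "pixel_mask": masks}

batch = pad_batch([np.ones((2, 3, 3)), np.ones((4, 2, 3))])
print(batch["pixel_values"][0].shape, batch["pixel_mask"][0].sum())  # (4, 3, 3) 6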
@@ -109,7 +109,7 @@ class ViTImageProcessor(BaseImageProcessor):
                 Image to resize.
             size (`Dict[str, int]`):
                 Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
-            resample:
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
             data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format for the output image. If unset, the channel dimension format of the input
@@ -123,9 +123,8 @@ class ViTImageProcessor(BaseImageProcessor):
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
             raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)

     def preprocess(
         self,
...
@@ -118,6 +118,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
         self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
         self.do_convert_rgb = do_convert_rgb

+    # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
...
@@ -844,10 +844,21 @@ class YolosImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self, image: np.ndarray, rescale_factor: Union[float, int], data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
@@ -881,13 +892,15 @@ class YolosImageProcessor(BaseImageProcessor):
         )
         return padded_image

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
     def pad(
         self,
         images: List[np.ndarray],
-        return_pixel_mask: bool = False,
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -895,16 +908,26 @@ class YolosImageProcessor(BaseImageProcessor):
         Args:
             image (`np.ndarray`):
                 Image to pad.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
         pad_size = get_max_height_width(images)
-        padded_images = [self._pad_image(image, pad_size, data_format=data_format) for image in images]
+        padded_images = [
+            self._pad_image(image, pad_size, constant_values=constant_values, data_format=data_format)
+            for image in images
+        ]
         data = {"pixel_values": padded_images}
         if return_pixel_mask:
...