Unverified Commit e3490104 authored by amyeroberts, committed by GitHub

Add copied from for image processor methods (#25121)

* Add copied from statements for image processors

* Move out rescale and normalize to base image processor

* Remove rescale and normalize from vit (post rebase)

* Update docstrings and tidy up

* PR comments
parent 5b517e17
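For readers unfamiliar with the convention this commit propagates: a `# Copied from` comment marks a method as a verbatim copy of another module's implementation, optionally with `with Old->New` textual renames, and repository tooling keeps the copies in sync (in transformers this check lives in `utils/check_copies.py` and is run via `make fix-copies`). Below is a minimal, illustrative sketch of the idea — the helper here is hypothetical, not the actual checker:

# Sketch of "# Copied from" enforcement: the target function's source must
# equal the source function's code after applying the declared Old->New
# replacements. (Illustrative only; not the real utils/check_copies.py.)
import inspect
import textwrap

def check_copied_from(source_fn, target_fn, replacements=()):
    expected = textwrap.dedent(inspect.getsource(source_fn))
    for old, new in replacements:
        expected = expected.replace(old, new)
    return expected == textwrap.dedent(inspect.getsource(target_fn))

def resize_bilinear(image, size):
    return resize(image, size, resample="bilinear")  # resize() is a stand-in

def resize_bicubic(image, size):
    return resize(image, size, resample="bicubic")  # resize() is a stand-in

# resize_bicubic is "copied from" resize_bilinear with bilinear->bicubic
assert check_copied_from(resize_bilinear, resize_bicubic, [("bilinear", "bicubic")])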
@@ -434,8 +434,9 @@ class BeitImageProcessor(BaseImageProcessor):
             outputs ([`BeitForSemanticSegmentation`]):
                 Raw outputs of the model.
             target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
-                List of tuples corresponding to the requested final size (height, width) of each prediction. If left to
-                None, predictions will not be resized.
+                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
+                predictions will not be resized.
+
         Returns:
             semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
             segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
...
@@ -118,6 +118,7 @@ class BitImageProcessor(BaseImageProcessor):
         self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
         self.do_convert_rgb = do_convert_rgb
 
+    # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
...
@@ -104,6 +104,7 @@ class BlipImageProcessor(BaseImageProcessor):
         self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
         self.do_convert_rgb = do_convert_rgb
 
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
     def resize(
         self,
         image: np.ndarray,
@@ -131,7 +132,7 @@ class BlipImageProcessor(BaseImageProcessor):
         Returns:
             `np.ndarray`: The resized image.
         """
-        size = get_size_dict(size, default_to_square=True)
+        size = get_size_dict(size)
         if "height" not in size or "width" not in size:
             raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
         output_size = (size["height"], size["width"])
...
@@ -234,20 +234,21 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
         **kwargs,
     ) -> np.ndarray:
         """
-        Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any
-        edge, the image is padded with 0's and then center cropped.
+        Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along
+        any edge, the image is padded with 0's and then center cropped.
 
         Args:
             image (`np.ndarray`):
                 Image to center crop.
             size (`Dict[str, int]`):
-                Size of the output image.
+                Size of the output image in the form `{"height": h, "width": w}`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
         output_size = size["shortest_edge"]
         return center_crop(image, size=(output_size, output_size), data_format=data_format, **kwargs)
 
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
     def _pad_image(
         self,
         image: np.ndarray,
@@ -269,22 +270,26 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
         )
         return padded_image
 
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
     def pad(
         self,
         images: List[np.ndarray],
+        constant_values: Union[float, Iterable[float]] = 0,
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
     ) -> BatchFeature:
         """
-        Pads a batch of images with zeros to the size of largest height and width in the batch and optionally returns
-        their corresponding pixel mask.
+        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
+        in the batch and optionally returns their corresponding pixel mask.
 
         Args:
-            images (`List[np.ndarray]`):
-                Batch of images to pad.
-            return_pixel_mask (`bool`, *optional*, defaults to `False`):
-                Whether to return the pixel mask.
+            image (`np.ndarray`):
+                Image to pad.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
             return_tensors (`str` or `TensorType`, *optional*):
                 The type of tensors to return. Can be one of:
                     - Unset: Return a list of `np.ndarray`.
@@ -296,10 +301,13 @@ class BridgeTowerImageProcessor(BaseImageProcessor):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
         pad_size = get_max_height_width(images)
+
         padded_images = [
-            self._pad_image(image=image, output_size=pad_size, data_format=data_format) for image in images
+            self._pad_image(image, pad_size, constant_values=constant_values, data_format=data_format)
+            for image in images
         ]
         data = {"pixel_values": padded_images}
+
         if return_pixel_mask:
             masks = [make_pixel_mask(image=image, output_size=pad_size) for image in images]
             data["pixel_mask"] = masks
...
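As a side note on the padding behaviour documented above: a minimal standalone sketch in plain NumPy (not the transformers helpers `get_max_height_width`/`make_pixel_mask`) of padding bottom/right to the batch maximum and building the matching pixel mask:

# Every image is zero-padded on the bottom/right to the batch's max height and
# width; the pixel mask marks real pixels with 1 and padding with 0.
import numpy as np

def pad_batch(images):  # images: list of (height, width, channels) arrays
    max_h = max(img.shape[0] for img in images)
    max_w = max(img.shape[1] for img in images)
    padded, masks = [], []
    for img in images:
        pad_h, pad_w = max_h - img.shape[0], max_w - img.shape[1]
        padded.append(np.pad(img, ((0, pad_h), (0, pad_w), (0, 0)), constant_values=0))
        mask = np.zeros((max_h, max_w), dtype=np.int64)
        mask[: img.shape[0], : img.shape[1]] = 1
        masks.append(mask)
    return padded, masks

imgs = [np.ones((4, 6, 3)), np.ones((5, 3, 3))]
pixel_values, pixel_mask = pad_batch(imgs)
assert pixel_values[0].shape == (5, 6, 3) and pixel_mask[1].sum() == 15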
@@ -935,10 +935,21 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self, image: np.ndarray, rescale_factor: Union[float, int], data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
@@ -980,7 +991,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -992,8 +1003,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
...
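The one-line semantics the new rescale docstrings spell out, `image = image * rescale_factor`, can be sanity-checked standalone (plain NumPy, not the transformers `rescale` helper):

import numpy as np

image = np.array([[0, 128, 255]], dtype=np.uint8)
rescaled = image * (1 / 255)  # uint8 * Python float promotes to float64
assert rescaled.min() >= 0.0 and rescaled.max() <= 1.0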
@@ -933,10 +933,21 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self, image: np.ndarray, rescale_factor: Union[float, int], data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
@@ -978,7 +989,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -990,8 +1001,13 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
...
@@ -107,33 +107,39 @@ class DeiTImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
 
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
     def resize(
         self,
         image: np.ndarray,
         size: Dict[str, int],
-        resample: PILImageResampling = PIL.Image.BICUBIC,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
         data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
     ) -> np.ndarray:
         """
-        Resize an image to `(size["height"], size["width"])` using the specified resampling filter.
+        Resize an image to `(size["height"], size["width"])`.
 
         Args:
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
-            resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
-                Resampling filter to use when resizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
-            raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
 
     def preprocess(
         self,
...
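For reference, the `size` contract the rewritten resize docstrings describe: callers pass `{"height": h, "width": w}` and get exactly that output shape. A standalone Pillow sketch, not the transformers method (note that PIL's `resize` takes (width, height) order, hence the flipped tuple; `Image.Resampling` assumes Pillow >= 9.1):

from PIL import Image

size = {"height": 224, "width": 224}
image = Image.new("RGB", (640, 480))
# PIL expects (width, height), so the dict values are swapped when passed in.
resized = image.resize((size["width"], size["height"]), resample=Image.Resampling.BICUBIC)
assert resized.size == (224, 224)  # PIL reports (width, height)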
@@ -606,10 +606,21 @@ class DetaImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self, image: np.ndarray, rescale_factor: Union[float, int], data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
@@ -651,7 +662,7 @@ class DetaImageProcessor(BaseImageProcessor):
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -663,8 +674,13 @@ class DetaImageProcessor(BaseImageProcessor):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
...
@@ -907,11 +907,23 @@ class DetrImageProcessor(BaseImageProcessor):
         """
         return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
 
+    # TODO (Amy) - update to use `rescale_factor` instead of `scale`
     def rescale(
-        self, image: np.ndarray, rescale_factor: Union[float, int], data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
@@ -950,7 +962,7 @@ class DetrImageProcessor(BaseImageProcessor):
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -962,8 +974,13 @@ class DetrImageProcessor(BaseImageProcessor):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
...
@@ -296,6 +296,7 @@ class DPTImageProcessor(BaseImageProcessor):
         data = {"pixel_values": images}
         return BatchFeature(data=data, tensor_type=return_tensors)
 
+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT
     def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
         """
         Converts the output of [`DPTForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
...
@@ -116,33 +116,39 @@ class EfficientNetImageProcessor(BaseImageProcessor):
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
         self.include_top = include_top
 
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.NEAREST
     def resize(
         self,
         image: np.ndarray,
         size: Dict[str, int],
-        resample: PILImageResampling = PIL.Image.NEAREST,
+        resample: PILImageResampling = PILImageResampling.NEAREST,
         data_format: Optional[Union[str, ChannelDimension]] = None,
         **kwargs,
     ) -> np.ndarray:
         """
-        Resize an image to `(size["height"], size["width"])` using the specified resampling filter.
+        Resize an image to `(size["height"], size["width"])`.
 
         Args:
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
-            resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.NEAREST`):
-                Resampling filter to use when resizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.NEAREST`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.NEAREST`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
-            raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
 
     def rescale(
         self,
...
@@ -331,6 +331,7 @@ class FlavaImageProcessor(BaseImageProcessor):
             mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
         )
 
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
     def resize(
         self,
         image: np.ndarray,
@@ -346,18 +347,23 @@ class FlavaImageProcessor(BaseImageProcessor):
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
-                Resampling filter to use when resiizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
-            raise ValueError(f"The size dictionary must contain 'height' and 'width' keys. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
 
     def map_pixels(self, image: np.ndarray) -> np.ndarray:
         return (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS
...
@@ -70,7 +70,12 @@ class GLPNImageProcessor(BaseImageProcessor):
         super().__init__(**kwargs)
 
     def resize(
-        self, image: np.ndarray, size_divisor: int, resample, data_format: Optional[ChannelDimension] = None, **kwargs
+        self,
+        image: np.ndarray,
+        size_divisor: int,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[ChannelDimension] = None,
+        **kwargs,
     ) -> np.ndarray:
         """
         Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor.
...
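The GLPN resize documented above rounds each spatial dimension down to the nearest multiple of `size_divisor`; a one-line standalone sketch of that rule:

# Round a dimension down to the closest multiple of size_divisor, as the
# GLPN resize docstring describes (floor-divide, then scale back up).
def round_down(value: int, size_divisor: int = 32) -> int:
    return (value // size_divisor) * size_divisor

assert round_down(521) == 512
assert round_down(480) == 480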
@@ -100,6 +100,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
         self.do_normalize = do_normalize
         self.do_color_quantize = do_color_quantize
 
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
@@ -109,24 +110,29 @@ class ImageGPTImageProcessor(BaseImageProcessor):
         **kwargs,
     ) -> np.ndarray:
         """
-        Resize an image to (size["height"], size["width"]).
+        Resize an image to `(size["height"], size["width"])`.
 
         Args:
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
-                Resampling filter to use when resizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
-            raise ValueError(f"Size dictionary must contain both height and width keys. Got {size.keys()}")
-        return resize(
-            image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs
-        )
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
 
     def normalize(
         self,
...
@@ -131,6 +131,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor):
         self.ocr_lang = ocr_lang
         self.tesseract_config = tesseract_config
 
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
@@ -146,15 +147,21 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor):
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
-                Resampling filter to use when resizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
-            raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
         output_size = (size["height"], size["width"])
         return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
...
@@ -157,6 +157,7 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
         self.ocr_lang = ocr_lang
         self.tesseract_config = tesseract_config
 
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
@@ -166,21 +167,27 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
         **kwargs,
     ) -> np.ndarray:
         """
-        Resize an image to (size["height"], size["width"]) dimensions.
+        Resize an image to `(size["height"], size["width"])`.
 
         Args:
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Size of the output image.
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
-                Resampling filter to use when resiizing the image.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
         """
         size = get_size_dict(size)
         if "height" not in size or "width" not in size:
-            raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}")
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
         output_size = (size["height"], size["width"])
         return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
...
@@ -253,6 +253,7 @@ def compute_segments(
 
 # TODO: (Amy) Move to image_transforms
+# Copied from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks
 def convert_segmentation_map_to_binary_masks(
     segmentation_map: "np.ndarray",
     instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
@@ -289,6 +290,7 @@ def convert_segmentation_map_to_binary_masks(
     return binary_masks.astype(np.float32), labels.astype(np.int64)
 
+# Copied from transformers.models.maskformer.image_processing_maskformer.get_maskformer_resize_output_image_size with maskformer->mask2former
 def get_mask2former_resize_output_image_size(
     image: np.ndarray,
     size: Union[int, Tuple[int, int], List[int], Tuple[int]],
@@ -440,6 +442,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
             image_processor_dict["size_divisibility"] = kwargs.pop("size_divisibility")
         return super().from_dict(image_processor_dict, **kwargs)
 
+    # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.resize with get_maskformer_resize_output_image_size->get_mask2former_resize_output_image_size
     def resize(
         self,
         image: np.ndarray,
@@ -483,14 +486,27 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
         image = resize(image, size=size, resample=resample, data_format=data_format)
         return image
 
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self, image: np.ndarray, rescale_factor: float, data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
 
+    # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.convert_segmentation_map_to_binary_masks
     def convert_segmentation_map_to_binary_masks(
         self,
         segmentation_map: "np.ndarray",
@@ -719,7 +735,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -731,8 +747,13 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
@@ -808,6 +829,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
         """
         ignore_index = self.ignore_index if ignore_index is None else ignore_index
         reduce_labels = self.reduce_labels if reduce_labels is None else reduce_labels
+
         pixel_values_list = [to_numpy_array(pixel_values) for pixel_values in pixel_values_list]
         encoded_inputs = self.pad(pixel_values_list, return_tensors=return_tensors)
...
@@ -494,11 +494,23 @@ class MaskFormerImageProcessor(BaseImageProcessor):
         image = resize(image, size=size, resample=resample, data_format=data_format)
         return image
 
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
     def rescale(
-        self, image: np.ndarray, rescale_factor: float, data_format: Optional[ChannelDimension] = None
+        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
     ) -> np.ndarray:
         """
-        Rescale the image by the given factor.
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         """
         return rescale(image, rescale_factor, data_format=data_format)
@@ -508,7 +520,6 @@ class MaskFormerImageProcessor(BaseImageProcessor):
         instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
         ignore_index: Optional[int] = None,
         reduce_labels: bool = False,
-        **kwargs,
     ):
         reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels
         ignore_index = ignore_index if ignore_index is not None else self.ignore_index
@@ -741,7 +752,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
         return_pixel_mask: bool = True,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = None,
-    ) -> np.ndarray:
+    ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
         in the batch and optionally returns their corresponding pixel mask.
@@ -753,8 +764,13 @@ class MaskFormerImageProcessor(BaseImageProcessor):
                 The value to use for the padding if `mode` is `"constant"`.
             return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether to return a pixel mask.
-            input_channel_dimension (`ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be inferred from the input image.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
...
@@ -111,6 +111,7 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
 
+    # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
...
@@ -115,6 +115,7 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
 
+    # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
     def resize(
         self,
         image: np.ndarray,
@@ -254,6 +255,7 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
         data = {"pixel_values": images}
         return BatchFeature(data=data, tensor_type=return_tensors)
 
+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->MobileNetV2
     def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
         """
         Converts the output of [`MobileNetV2ForSemanticSegmentation`] into semantic segmentation maps. Only supports
@@ -262,14 +264,14 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
         Args:
             outputs ([`MobileNetV2ForSemanticSegmentation`]):
                 Raw outputs of the model.
-            target_sizes (`List[Tuple]`, *optional*):
-                A list of length `batch_size`, where each item is a `Tuple[int, int]` corresponding to the requested
-                final size (height, width) of each prediction. If left to None, predictions will not be resized.
+            target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
+                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
+                predictions will not be resized.
 
         Returns:
-            `List[torch.Tensor]`:
-                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
-                corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
-                `torch.Tensor` correspond to a semantic class id.
+            semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
+            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
+            specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
         """
         # TODO: add support for other frameworks
         logits = outputs.logits
...
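Finally, a sketch of the post-processing contract the updated `post_process_semantic_segmentation` docstrings describe: logits are upsampled to each requested (height, width) and argmaxed into per-pixel class ids. This is a plain-PyTorch stand-in, not the actual image processor method:

import torch
import torch.nn.functional as F

def post_process(logits, target_sizes=None):
    maps = []
    for i in range(logits.shape[0]):
        l = logits[i].unsqueeze(0)  # (1, num_classes, h, w)
        if target_sizes is not None:
            # Upsample to the requested (height, width) before taking argmax.
            l = F.interpolate(l, size=target_sizes[i], mode="bilinear", align_corners=False)
        maps.append(l.argmax(dim=1)[0])  # per-pixel semantic class ids
    return maps

logits = torch.randn(2, 21, 32, 32)  # batch of 2, 21 classes
maps = post_process(logits, target_sizes=[(480, 640), (256, 256)])
assert maps[0].shape == (480, 640) and maps[1].shape == (256, 256)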