Unverified Commit 6bca43bb authored by amyeroberts, committed by GitHub

Input data format (#25464)

* Add copied from statements for image processors

* Move out rescale and normalize to base image processor

* Remove rescale and normalize from vit (post rebase)

* Update docstrings and tidy up

* PR comments

* Add input_data_format as preprocess argument

* Resolve tests and tidy up

* Remove num_channels argument

* Update doc strings -> default ints not in code formatting
parent a6609caf
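The change threads a new `input_data_format` argument through `preprocess` and the per-transform methods, so callers can state the channel layout of their raw arrays instead of relying on `infer_channel_dimension_format`. A minimal usage sketch (not part of the diff; it assumes a default-constructed `ViTImageProcessor` built from this branch):

import numpy as np
from transformers import ViTImageProcessor

# Default-constructed processor: resize to 224x224, rescale, normalize, channels-first output.
processor = ViTImageProcessor()

# A (height, width, channels) array; declaring the layout up front avoids format inference,
# which matters for ambiguous shapes such as single- or four-channel inputs.
image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)

inputs = processor(image, input_data_format="channels_last", return_tensors="np")
print(inputs["pixel_values"].shape)  # (1, 3, 224, 224)

The same keyword is accepted by the other processors touched below (ViT Hybrid, ViViT, YOLOS) and by their individual `resize`/`rescale`/`normalize`/`pad` methods.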
@@ -26,6 +26,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
+    infer_channel_dimension_format,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -99,6 +100,7 @@ class ViTImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
@@ -116,6 +118,13 @@ class ViTImageProcessor(BaseImageProcessor):
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
@@ -124,7 +133,14 @@ class ViTImageProcessor(BaseImageProcessor):
        if "height" not in size or "width" not in size:
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        output_size = (size["height"], size["width"])
-        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )

    def preprocess(
        self,
@@ -139,6 +155,7 @@ class ViTImageProcessor(BaseImageProcessor):
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
@@ -177,6 +194,12 @@ class ViTImageProcessor(BaseImageProcessor):
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
@@ -206,16 +229,31 @@ class ViTImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
        if do_resize:
-            images = [self.resize(image=image, size=size_dict, resample=resample) for image in images]
+            images = [
+                self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_rescale:
-            images = [self.rescale(image=image, scale=rescale_factor) for image in images]
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_normalize:
-            images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]

-        images = [to_channel_dimension_format(image, data_format) for image in images]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]

        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -31,6 +31,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
+    infer_channel_dimension_format,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -125,6 +126,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
@@ -140,12 +142,23 @@ class ViTHybridImageProcessor(BaseImageProcessor):
                Resampling filter to use when resiizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        size = get_size_dict(size, default_to_square=False)
        if "shortest_edge" not in size:
            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
-        output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False)
-        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
+        output_size = get_resize_output_image_size(
+            image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )

    def preprocess(
        self,
@@ -163,6 +176,7 @@ class ViTHybridImageProcessor(BaseImageProcessor):
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> PIL.Image.Image:
        """
@@ -208,6 +222,12 @@ class ViTHybridImageProcessor(BaseImageProcessor):
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: defaults to the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
@@ -250,19 +270,36 @@ class ViTHybridImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
        if do_resize:
-            images = [self.resize(image=image, size=size, resample=resample) for image in images]
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_center_crop:
-            images = [self.center_crop(image=image, size=crop_size) for image in images]
+            images = [
+                self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
+            ]

        if do_rescale:
-            images = [self.rescale(image=image, scale=rescale_factor) for image in images]
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]

        if do_normalize:
-            images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images]
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]

-        images = [to_channel_dimension_format(image, data_format) for image in images]
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]

        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -33,6 +33,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
+    infer_channel_dimension_format,
    is_valid_image,
    to_numpy_array,
    valid_images,
@@ -141,6 +142,7 @@ class VivitImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
@@ -157,15 +159,26 @@ class VivitImageProcessor(BaseImageProcessor):
                Resampling filter to use when resiizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        size = get_size_dict(size, default_to_square=False)
        if "shortest_edge" in size:
-            output_size = get_resize_output_image_size(image, size["shortest_edge"], default_to_square=False)
+            output_size = get_resize_output_image_size(
+                image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+            )
        elif "height" in size and "width" in size:
            output_size = (size["height"], size["width"])
        else:
            raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
-        return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )

    # Copied from transformers.models.efficientnet.image_processing_efficientnet.EfficientNetImageProcessor.rescale
    def rescale(
@@ -174,6 +187,7 @@ class VivitImageProcessor(BaseImageProcessor):
        scale: Union[int, float],
        offset: bool = True,
        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
@@ -195,8 +209,12 @@ class VivitImageProcessor(BaseImageProcessor):
                Whether to scale the image in both negative and positive directions.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
-        rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)
+        rescaled_image = rescale(
+            image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )

        if offset:
            rescaled_image = rescaled_image - 1
@@ -218,6 +236,7 @@ class VivitImageProcessor(BaseImageProcessor):
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """Preprocesses a single image."""
        if do_resize and size is None or resample is None:
@@ -238,19 +257,22 @@ class VivitImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)

+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
        if do_resize:
-            image = self.resize(image=image, size=size, resample=resample)
+            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)

        if do_center_crop:
-            image = self.center_crop(image, size=crop_size)
+            image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)

        if do_rescale:
-            image = self.rescale(image=image, scale=rescale_factor, offset=offset)
+            image = self.rescale(image=image, scale=rescale_factor, offset=offset, input_data_format=input_data_format)

        if do_normalize:
-            image = self.normalize(image=image, mean=image_mean, std=image_std)
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)

-        image = to_channel_dimension_format(image, data_format)
+        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
        return image

    def preprocess(
@@ -269,6 +291,7 @@ class VivitImageProcessor(BaseImageProcessor):
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> PIL.Image.Image:
        """
@@ -312,6 +335,12 @@ class VivitImageProcessor(BaseImageProcessor):
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the inferred channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        resample = resample if resample is not None else self.resample
@@ -352,6 +381,7 @@ class VivitImageProcessor(BaseImageProcessor):
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
+                input_data_format=input_data_format,
            )
            for img in video
        ]
...
@@ -88,18 +88,21 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)

# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
-def get_max_height_width(images: List[np.ndarray]) -> List[int]:
+def get_max_height_width(
+    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> List[int]:
    """
    Get the maximum height and width across all images in a batch.
    """
-    input_channel_dimension = infer_channel_dimension_format(images[0])
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images[0])

-    if input_channel_dimension == ChannelDimension.FIRST:
+    if input_data_format == ChannelDimension.FIRST:
        _, max_height, max_width = max_across_indices([img.shape for img in images])
-    elif input_channel_dimension == ChannelDimension.LAST:
+    elif input_data_format == ChannelDimension.LAST:
        max_height, max_width, _ = max_across_indices([img.shape for img in images])
    else:
-        raise ValueError(f"Invalid channel dimension format: {input_channel_dimension}")
+        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
    return (max_height, max_width)
@@ -137,7 +140,10 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:

# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
-    input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], max_size: Optional[int] = None
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int]],
+    max_size: Optional[int] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Computes the output image size given the input image size and the desired output size. If the desired output size
@@ -151,8 +157,10 @@ def get_resize_output_image_size(
            The desired output size.
        max_size (`int`, *optional*):
            The maximum allowed output size.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
    """
-    image_size = get_image_size(input_image)
+    image_size = get_image_size(input_image, input_data_format)
    if isinstance(size, (list, tuple)):
        return size
@@ -222,7 +230,9 @@ def max_across_indices(values: Iterable[Any]) -> List[Any]:

# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
-def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
+def make_pixel_mask(
+    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
@@ -232,7 +242,7 @@ def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
        output_size (`Tuple[int, int]`):
            Output size of the mask.
    """
-    input_height, input_width = get_image_size(image)
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
    mask = np.zeros(output_size, dtype=np.int64)
    mask[:input_height, :input_width] = 1
    return mask
@@ -274,11 +284,16 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:

# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation
-def prepare_coco_detection_annotation(image, target, return_segmentation_masks: bool = False):
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
    """
    Convert the target in COCO format into the format expected by DETR.
    """
-    image_height, image_width = get_image_size(image)
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)

    image_id = target["image_id"]
    image_id = np.asarray([image_id], dtype=np.int64)
@@ -363,12 +378,16 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray:

# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->YOLOS
def prepare_coco_panoptic_annotation(
-    image: np.ndarray, target: Dict, masks_path: Union[str, pathlib.Path], return_masks: bool = True
+    image: np.ndarray,
+    target: Dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
    """
    Prepare a coco panoptic annotation for YOLOS.
    """
-    image_height, image_width = get_image_size(image)
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)

    annotation_path = pathlib.Path(masks_path) / target["file_name"]

    new_target = {}
@@ -751,6 +770,7 @@ class YolosImageProcessor(BaseImageProcessor):
        format: Optional[AnnotionFormat] = None,
        return_segmentation_masks: bool = None,
        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> Dict:
        """
        Prepare an annotation for feeding into DETR model.
@@ -759,11 +779,17 @@ class YolosImageProcessor(BaseImageProcessor):
        if format == AnnotionFormat.COCO_DETECTION:
            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
-            target = prepare_coco_detection_annotation(image, target, return_segmentation_masks)
+            target = prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
        elif format == AnnotionFormat.COCO_PANOPTIC:
            return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
            target = prepare_coco_panoptic_annotation(
-                image, target, masks_path=masks_path, return_masks=return_segmentation_masks
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
            )
        else:
            raise ValueError(f"Format {format} is not supported.")
@@ -801,11 +827,26 @@ class YolosImageProcessor(BaseImageProcessor):
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
        int, smaller edge of the image will be matched to this number.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
+                `height` and `width`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        if "max_size" in kwargs:
            logger.warning_once(
@@ -817,7 +858,9 @@ class YolosImageProcessor(BaseImageProcessor):
            max_size = None
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        if "shortest_edge" in size and "longest_edge" in size:
-            size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"])
+            size = get_resize_output_image_size(
+                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+            )
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
@@ -825,7 +868,9 @@ class YolosImageProcessor(BaseImageProcessor):
                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                f" {size.keys()}."
            )
-        image = resize(image, size=size, resample=resample, data_format=data_format)
+        image = resize(
+            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
        return image

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
@@ -844,7 +889,11 @@ class YolosImageProcessor(BaseImageProcessor):

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
    def rescale(
-        self, image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None
+        self,
+        image: np.ndarray,
+        rescale_factor: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor. image = image * rescale_factor.
@@ -859,8 +908,13 @@ class YolosImageProcessor(BaseImageProcessor):
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+                one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        """
-        return rescale(image, rescale_factor, data_format=data_format)
+        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
@@ -877,28 +931,36 @@ class YolosImageProcessor(BaseImageProcessor):
        output_size: Tuple[int, int],
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
        """
-        input_height, input_width = get_image_size(image)
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        output_height, output_width = output_size

        pad_bottom = output_height - input_height
        pad_right = output_width - input_width
        padding = ((0, pad_bottom), (0, pad_right))
        padded_image = pad(
-            image, padding, mode=PaddingMode.CONSTANT, constant_values=constant_values, data_format=data_format
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
        )
        return padded_image

+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        constant_values: Union[float, Iterable[float]] = 0,
-        return_pixel_mask: bool = False,
+        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
@@ -920,17 +982,28 @@ class YolosImageProcessor(BaseImageProcessor):
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
        """
-        pad_size = get_max_height_width(images)
+        pad_size = get_max_height_width(images, input_data_format=input_data_format)

        padded_images = [
-            self._pad_image(image, pad_size, constant_values=constant_values, data_format=data_format)
+            self._pad_image(
+                image,
+                pad_size,
+                constant_values=constant_values,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
            for image in images
        ]
        data = {"pixel_values": padded_images}

        if return_pixel_mask:
-            masks = [make_pixel_mask(image=image, output_size=pad_size) for image in images]
+            masks = [
+                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
+                for image in images
+            ]
            data["pixel_mask"] = masks

        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -953,6 +1026,7 @@ class YolosImageProcessor(BaseImageProcessor):
        format: Optional[Union[str, AnnotionFormat]] = None,
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
@@ -1000,6 +1074,12 @@ class YolosImageProcessor(BaseImageProcessor):
                Type of tensors to return. If `None`, will return the list of images.
            data_format (`str` or `ChannelDimension`, *optional*, defaults to self.data_format):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        if "pad_and_return_pixel_mask" in kwargs:
            logger.warning_once(
@@ -1084,13 +1164,22 @@ class YolosImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays
        images = [to_numpy_array(image) for image in images]

+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
        if annotations is not None:
            prepared_images = []
            prepared_annotations = []
            for image, target in zip(images, annotations):
                target = self.prepare_annotation(
-                    image, target, format, return_segmentation_masks=return_segmentation_masks, masks_path=masks_path
+                    image,
+                    target,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=input_data_format,
                )
                prepared_images.append(image)
                prepared_annotations.append(target)
@@ -1103,22 +1192,31 @@ class YolosImageProcessor(BaseImageProcessor):
            if annotations is not None:
                resized_images, resized_annotations = [], []
                for image, target in zip(images, annotations):
-                    orig_size = get_image_size(image)
-                    resized_image = self.resize(image, size=size, max_size=max_size, resample=resample)
-                    resized_annotation = self.resize_annotation(target, orig_size, get_image_size(resized_image))
+                    orig_size = get_image_size(image, input_data_format)
+                    resized_image = self.resize(
+                        image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format
+                    )
+                    resized_annotation = self.resize_annotation(
+                        target, orig_size, get_image_size(resized_image, input_data_format)
+                    )
                    resized_images.append(resized_image)
                    resized_annotations.append(resized_annotation)
                images = resized_images
                annotations = resized_annotations
                del resized_images, resized_annotations
            else:
-                images = [self.resize(image, size=size, resample=resample) for image in images]
+                images = [
+                    self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]

        if do_rescale:
-            images = [self.rescale(image, rescale_factor) for image in images]
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]

        if do_normalize:
-            images = [self.normalize(image, image_mean, image_std) for image in images]
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
            if annotations is not None:
                annotations = [
                    self.normalize_annotation(annotation, get_image_size(image))
@@ -1126,9 +1224,12 @@ class YolosImageProcessor(BaseImageProcessor):
                ]

        if do_pad:
-            data = self.pad(images, data_format=data_format)
+            data = self.pad(images, data_format=data_format, input_data_format=input_data_format)
        else:
-            images = [to_channel_dimension_format(image, data_format) for image in images]
+            images = [
+                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                for image in images
+            ]
            data = {"pixel_values": images}

        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
...
@@ -70,7 +70,7 @@ class BlipImageProcessingTester(unittest.TestCase):
        }

    def expected_output_image_shape(self, images):
-        return 3, self.size["height"], self.size["width"]
+        return self.num_channels, self.size["height"], self.size["width"]

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        return prepare_image_inputs(
@@ -135,3 +135,11 @@ class BlipImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    @unittest.skip("BlipImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        return super().test_call_torch()
+
+    @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_pil(self):
+        pass
+
+    @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
@@ -17,7 +17,7 @@
import unittest

from transformers.testing_utils import require_torch, require_vision
-from transformers.utils import is_vision_available
+from transformers.utils import is_torch_available, is_vision_available

from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -26,6 +26,10 @@ if is_vision_available():
    from transformers import ChineseCLIPImageProcessor


+if is_torch_available():
+    pass
+
+
class ChineseCLIPImageProcessingTester(unittest.TestCase):
    def __init__(
        self,
@@ -120,6 +124,10 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        self.assertEqual(image_processor.size, {"shortest_edge": 42})
        self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})

+    @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
+

@require_torch
@require_vision
@@ -152,3 +160,7 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    @unittest.skip("ChineseCLIPImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        return super().test_call_torch()
+
+    @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
@@ -337,6 +337,11 @@ class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    def test_call_numpy(self):
        self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True})

+    def test_call_numpy_4_channels(self):
+        self.image_processing_class.num_channels = 4
+        self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True})
+        self.image_processing_class.num_channels = 3
+
    def test_call_pytorch(self):
        self._test_call_framework(torch.Tensor, prepare_kwargs={"torchify": True})
...
@@ -144,3 +144,18 @@ class GLPNImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
+
+    def test_call_numpy_4_channels(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random numpy tensors
+        self.image_processing_class.num_channels = 4
+        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+        for image in image_inputs:
+            self.assertIsInstance(image, np.ndarray)
+
+        # Test not batched input (GLPNImageProcessor doesn't support batching)
+        encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
+        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
+        self.assertTrue(tuple(encoded_images.shape) == (1, *expected_output_image_shape))
+        self.image_processing_class.num_channels = 3
@@ -198,6 +198,10 @@ class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )

+    @unittest.skip("ImageGPT assumes clusters for 3 channels")
+    def test_call_numpy_4_channels(self):
+        pass
+
    # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input
    def test_call_pytorch(self):
        # Initialize image_processing
...
@@ -222,6 +222,40 @@ class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
                (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim),
            )

+    def test_call_numpy_4_channels(self):
+        # Initialize image_processor
+        image_processor = self.image_processing_class(**self.image_processor_dict)
+        # create random numpy tensors
+        self.image_processor_tester.num_channels = 4
+        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
+        for image in image_inputs:
+            self.assertIsInstance(image, np.ndarray)
+
+        expected_hidden_dim = (
+            (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"])
+            * self.image_processor_tester.num_channels
+        ) + 2
+
+        for max_patch in self.image_processor_tester.max_patches:
+            # Test not batched input
+            encoded_images = image_processor(
+                image_inputs[0], return_tensors="pt", max_patches=max_patch, input_data_format="channels_first"
+            ).flattened_patches
+            self.assertEqual(
+                encoded_images.shape,
+                (1, max_patch, expected_hidden_dim),
+            )
+
+            # Test batched
+            encoded_images = image_processor(
+                image_inputs, return_tensors="pt", max_patches=max_patch, input_data_format="channels_first"
+            ).flattened_patches
+            self.assertEqual(
+                encoded_images.shape,
+                (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim),
+            )
+        self.image_processor_tester.num_channels = 3
+
    def test_call_pytorch(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
@@ -318,3 +352,7 @@ class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
    @unittest.skip("Pix2StructImageProcessor does not support 4 channels yet")  # FIXME Amy
    def test_call_pytorch(self):
        return super().test_call_torch()
+
+    @unittest.skip("Pix2StructImageProcessor does treat numpy and PIL 4 channel images consistently")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        return super().test_call_torch()
...@@ -147,6 +147,24 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): ...@@ -147,6 +147,24 @@ class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

    # Swin2SRImageProcessor does not support batched input
    def test_call_numpy_4_channels(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)
        for image in image_inputs:
            self.assertIsInstance(image, np.ndarray)

        # Test not batched input
        encoded_images = image_processing(
            image_inputs[0], return_tensors="pt", input_data_format="channels_first"
        ).pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))
        self.image_processor_tester.num_channels = 3
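The explicit input_data_format="channels_first" in these 4-channel tests is the behaviour this PR adds: for 1- or 3-channel inputs the layout can normally be inferred, but a 4-channel array is ambiguous, so the caller states it. A rough sketch of the idea, assuming infer_channel_dimension_format keeps its default expected channel counts; the array size is illustrative:

import numpy as np

from transformers.image_utils import ChannelDimension, infer_channel_dimension_format

image = np.random.randint(0, 256, size=(4, 32, 32), dtype=np.uint8)  # (num_channels, height, width)

try:
    fmt = infer_channel_dimension_format(image)
except ValueError:
    # Neither axis matches the channel counts inference expects by default, so it cannot
    # decide between channels-first and channels-last; the caller supplies the layout instead.
    fmt = ChannelDimension.FIRST  # what input_data_format="channels_first" expresses to preprocess()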

    # Swin2SRImageProcessor does not support batched input
    def test_call_pytorch(self):
        # Initialize image_processing
...
...@@ -217,6 +217,47 @@ class TvltImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase):
            ),
        )

    def test_call_numpy_4_channels(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], np.ndarray)

        # Test not batched input
        encoded_videos = image_processor(
            video_inputs[0], return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
        ).pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                1,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )

        # Test batched
        encoded_videos = image_processor(
            video_inputs, return_tensors="pt", input_data_format="channels_first", image_mean=0, image_std=1
        ).pixel_values
        self.assertEqual(
            encoded_videos.shape,
            (
                self.image_processor_tester.batch_size,
                self.image_processor_tester.num_frames,
                self.image_processor_tester.num_channels,
                self.image_processor_tester.crop_size["height"],
                self.image_processor_tester.crop_size["width"],
            ),
        )
        self.image_processor_tester.num_channels = 3

    def test_call_pytorch(self):
        # Initialize image_processor
        image_processor = self.image_processing_class(**self.image_processor_dict)
...
...@@ -165,6 +165,33 @@ class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )

    def test_call_numpy_4_channels(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], np.ndarray)

        # Test not batched input
        encoded_videos = image_processing(
            video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]])
        self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))

        # Test batched
        encoded_videos = image_processing(
            video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos)
        self.assertEqual(
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )
        self.image_processor_tester.num_channels = 3
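The video processors follow the same pattern: a video is a list of channels-first frames, and pixel_values come back batched as (batch_size, num_frames, num_channels, height, width). A small usage sketch, assuming VideoMAEImageProcessor with its default configuration; the frame count and resolution are made-up values:

import numpy as np

from transformers import VideoMAEImageProcessor

processor = VideoMAEImageProcessor()  # default resize/crop settings assumed

num_frames, num_channels, height, width = 8, 4, 64, 64  # illustrative values
video = [
    np.random.randint(0, 256, size=(num_channels, height, width), dtype=np.uint8)
    for _ in range(num_frames)
]

# input_data_format resolves the 4-channel ambiguity; scalar image_mean/image_std replace
# the default 3-value normalization constants, which do not match a 4-channel image.
pixel_values = processor(
    video, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
).pixel_values
print(pixel_values.shape)  # expected: (1, num_frames, num_channels, crop_height, crop_width)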

    def test_call_pytorch(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
...
...@@ -179,6 +179,33 @@ class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )

    def test_call_numpy_4_channels(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
        for video in video_inputs:
            self.assertIsInstance(video, list)
            self.assertIsInstance(video[0], np.ndarray)

        # Test not batched input
        encoded_videos = image_processing(
            video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]])
        self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape))

        # Test batched
        encoded_videos = image_processing(
            video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
        ).pixel_values
        expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos)
        self.assertEqual(
            tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape)
        )
        self.image_processor_tester.num_channels = 3

    def test_call_pytorch(self):
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
...
...@@ -252,3 +252,36 @@ class ImageProcessingTestMixin:
            tuple(encoded_images.shape),
            (self.image_processor_tester.batch_size, *expected_output_image_shape),
        )

    def test_call_numpy_4_channels(self):
        # Test that images with an arbitrary number of channels can be processed
        # Initialize image_processing
        image_processor = self.image_processing_class(**self.image_processor_dict)

        # create random numpy tensors
        self.image_processor_tester.num_channels = 4
        image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True)

        # Test not batched input
        encoded_images = image_processor(
            image_inputs[0],
            return_tensors="pt",
            input_data_format="channels_first",
            image_mean=0,
            image_std=1,
        ).pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]])
        self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape))

        # Test batched
        encoded_images = image_processor(
            image_inputs,
            return_tensors="pt",
            input_data_format="channels_first",
            image_mean=0,
            image_std=1,
        ).pixel_values
        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs)
        self.assertEqual(
            tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
        )
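Outside the test harness, the same call pattern applies to processors that now accept input_data_format at preprocess time. A minimal sketch, assuming ViTImageProcessor with its defaults; the 4-channel image and its size are illustrative:

import numpy as np

from transformers import ViTImageProcessor

processor = ViTImageProcessor()
image = np.random.randint(0, 256, size=(4, 32, 32), dtype=np.uint8)  # 4-channel, channels-first

# The explicit layout replaces channel-dimension inference (ambiguous for 4 channels), and
# scalar image_mean/image_std stand in for the default 3-value normalization constants.
pixel_values = processor(
    image,
    return_tensors="pt",
    input_data_format="channels_first",
    image_mean=0,
    image_std=1,
).pixel_values
print(pixel_values.shape)  # e.g. torch.Size([1, 4, 224, 224]) with the default 224x224 size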