ImageProcessor - check if input pixel values between 0-255 (#25688)

* Check if pixel values between 0-255 and add doc clarification * Add missing docstrings * _is_scale_image -> is_scaled_image * Spelling is hard * Tidy up

ImageProcessor - check if input pixel values between 0-255 (#25688)
* Check if pixel values between 0-255 and add doc clarification * Add missing docstrings * _is_scale_image -> is_scaled_image * Spelling is hard * Tidy up
1b2381c4 · amyeroberts · GitHub · 7a6efe1e · 1b2381c4 · 1b2381c4
Unverified Commit 1b2381c4 authored Aug 24, 2023 by amyeroberts Committed by GitHub Aug 24, 2023
20 changed files
--- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py
@@ -27,6 +27,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -236,7 +237,8 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -314,6 +316,12 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/levit/image_processing_levit.py
+++ b/src/transformers/models/levit/image_processing_levit.py
@@ -31,6 +31,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -192,7 +193,8 @@ class LevitImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image or batch of images to preprocess.
+                Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+                from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -273,6 +275,12 @@ class LevitImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/mask2former/image_processing_mask2former.py
+++ b/src/transformers/models/mask2former/image_processing_mask2former.py
@@ -36,6 +36,7 @@ from ...image_utils import (
    get_image_size,
    infer_channel_dimension_format,
    is_batched,
+    is_scaled_image,
    to_numpy_array,
    valid_images,
 )
@@ -606,6 +607,11 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
        """Preprocesses a single image."""
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)
+        if is_scaled_image(image) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        image = self._preprocess(

--- a/src/transformers/models/maskformer/image_processing_maskformer.py
+++ b/src/transformers/models/maskformer/image_processing_maskformer.py
@@ -35,6 +35,7 @@ from ...image_utils import (
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -613,6 +614,11 @@ class MaskFormerImageProcessor(BaseImageProcessor):
        """Preprocesses a single image."""
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)
+        if is_scaled_image(image) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        image = self._preprocess(

--- a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py
+++ b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py
@@ -31,6 +31,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -176,7 +177,8 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -254,6 +256,12 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py
+++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py
@@ -31,6 +31,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -180,7 +181,8 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -258,6 +260,12 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/mobilevit/image_processing_mobilevit.py
+++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py
@@ -30,6 +30,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -189,7 +190,8 @@ class MobileViTImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -259,6 +261,12 @@ class MobileViTImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/oneformer/image_processing_oneformer.py
+++ b/src/transformers/models/oneformer/image_processing_oneformer.py
@@ -36,6 +36,7 @@ from ...image_utils import (
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -574,6 +575,11 @@ class OneFormerImageProcessor(BaseImageProcessor):
        """Preprocesses a single image."""
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)
+        if is_scaled_image(image) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        image = self._preprocess(

--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -34,6 +34,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -288,7 +289,8 @@ class OwlViTImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                The image or batch of images to be prepared.
+                The image or batch of images to be prepared. Expects a single or batch of images with pixel values
+                ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether or not to resize the input. If `True`, will resize the input to the size specified by `size`.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -368,6 +370,12 @@ class OwlViTImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/perceiver/image_processing_perceiver.py
+++ b/src/transformers/models/perceiver/image_processing_perceiver.py
@@ -28,6 +28,7 @@ from ...image_utils import (
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -228,7 +229,8 @@ class PerceiverImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
                Whether to center crop the image to `crop_size`.
            crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
@@ -304,6 +306,12 @@ class PerceiverImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/pix2struct/image_processing_pix2struct.py
+++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py
@@ -382,7 +382,7 @@ class Pix2StructImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images.
            header_text (`Union[List[str], str]`, *optional*):
                Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):

--- a/src/transformers/models/poolformer/image_processing_poolformer.py
+++ b/src/transformers/models/poolformer/image_processing_poolformer.py
@@ -31,6 +31,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -231,7 +232,8 @@ class PoolFormerImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -311,6 +313,12 @@ class PoolFormerImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/pvt/image_processing_pvt.py
+++ b/src/transformers/models/pvt/image_processing_pvt.py
@@ -27,6 +27,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -164,7 +165,8 @@ class PvtImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -230,6 +232,12 @@ class PvtImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/sam/image_processing_sam.py
+++ b/src/transformers/models/sam/image_processing_sam.py
@@ -30,6 +30,7 @@ from ...image_utils import (
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -259,7 +260,8 @@ class SamImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -344,6 +346,12 @@ class SamImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/segformer/image_processing_segformer.py
+++ b/src/transformers/models/segformer/image_processing_segformer.py
@@ -28,6 +28,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -232,6 +233,11 @@ class SegformerImageProcessor(BaseImageProcessor):
        """Preprocesses a single image."""
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)
+        if is_scaled_image(image) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        image = self._preprocess(
@@ -319,7 +325,8 @@ class SegformerImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            segmentation_maps (`ImageInput`, *optional*):
                Segmentation map to preprocess.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):

--- a/src/transformers/models/swin2sr/image_processing_swin2sr.py
+++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py
@@ -24,6 +24,7 @@ from ...image_utils import (
    ChannelDimension,
    ImageInput,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -122,7 +123,8 @@ class Swin2SRImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
@@ -170,6 +172,12 @@ class Swin2SRImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/tvlt/image_processing_tvlt.py
+++ b/src/transformers/models/tvlt/image_processing_tvlt.py
@@ -30,6 +30,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    is_valid_image,
    to_numpy_array,
    valid_images,
@@ -226,6 +227,12 @@ class TvltImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)

+        if is_scaled_image(image) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

@@ -269,7 +276,8 @@ class TvltImageProcessor(BaseImageProcessor):

        Args:
            videos (`ImageInput`):
-                Images or videos to preprocess.
+                Images or videos to preprocess. Expects a single or batch of frames with pixel values ranging from 0 to
+                255. If passing in frames with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):

--- a/src/transformers/models/videomae/image_processing_videomae.py
+++ b/src/transformers/models/videomae/image_processing_videomae.py
@@ -31,6 +31,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    is_valid_image,
    to_numpy_array,
    valid_images,
@@ -205,6 +206,12 @@ class VideoMAEImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        image = to_numpy_array(image)

+        if is_scaled_image(image) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

@@ -246,7 +253,8 @@ class VideoMAEImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):

--- a/src/transformers/models/vilt/image_processing_vilt.py
+++ b/src/transformers/models/vilt/image_processing_vilt.py
@@ -28,6 +28,7 @@ from ...image_utils import (
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -357,7 +358,8 @@ class ViltImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -433,6 +435,12 @@ class ViltImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

--- a/src/transformers/models/vit/image_processing_vit.py
+++ b/src/transformers/models/vit/image_processing_vit.py
@@ -27,6 +27,7 @@ from ...image_utils import (
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
+    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
@@ -163,7 +164,8 @@ class ViTImageProcessor(BaseImageProcessor):

        Args:
            images (`ImageInput`):
-                Image to preprocess.
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -229,6 +231,12 @@ class ViTImageProcessor(BaseImageProcessor):
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])