Unverified Commit 30409af6 authored by amyeroberts, committed by GitHub

Update InstructBLIP & Align values after rescale update (#25209)

* Update InstructBLIP values
Note: the tests are not independent. Running the test independently produces different logits compared to running all the integration tests.

* Update test values after rescale update

* Remove leftover commented-out code

* Revert to previous rescaling logic

* Update rescale tests
parent 15082a9d
@@ -155,10 +155,11 @@ class EfficientNetImageProcessor(BaseImageProcessor):
         """
         Rescale an image by a scale factor.

-        If offset is True, the image is rescaled between [-1, 1].
-            image = image * scale * 2 - 1
+        If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
+        1/127.5, the image is rescaled between [-1, 1].
+            image = image * scale - 1

-        If offset is False, the image is rescaled between [0, 1].
+        If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
             image = image * scale

         Args:
@@ -171,7 +172,6 @@ class EfficientNetImageProcessor(BaseImageProcessor):
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
-        scale = scale * 2 if offset else scale
         rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)

         if offset:
...
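For context, here is a minimal standalone sketch of the behaviour the updated docstring describes, assuming plain NumPy (`rescale_with_offset` is an illustrative helper, not part of the transformers API); the same change is applied to the ViViT processor below:

```python
import numpy as np


def rescale_with_offset(image: np.ndarray, scale: float, offset: bool = True) -> np.ndarray:
    # Scale the pixel values; when offset=True, shift them down by 1 afterwards.
    # With scale=1/127.5 and offset=True, uint8 images land in roughly [-1, 1];
    # with scale=1/255 and offset=False, they land in [0, 1].
    rescaled = image.astype(np.float32) * scale
    return rescaled - 1 if offset else rescaled


image = np.arange(256, dtype=np.uint8).reshape(16, 16)
out = rescale_with_offset(image, scale=1 / 127.5, offset=True)
print(out.min(), out.max())  # approximately -1.0 and 1.0
```

The PR keeps this logic but removes the internal `scale = scale * 2` doubling, so callers now pass the full factor (e.g. `1/127.5`) themselves.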
@@ -179,10 +179,11 @@ class VivitImageProcessor(BaseImageProcessor):
         """
         Rescale an image by a scale factor.

-        If offset is True, the image is rescaled between [-1, 1].
-            image = image * scale * 2 - 1
+        If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is
+        1/127.5, the image is rescaled between [-1, 1].
+            image = image * scale - 1

-        If offset is False, the image is rescaled between [0, 1].
+        If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1].
             image = image * scale

         Args:
@@ -195,7 +196,6 @@ class VivitImageProcessor(BaseImageProcessor):
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
-        scale = scale * 2 if offset else scale
         rescaled_image = rescale(image, scale=scale, data_format=data_format, **kwargs)

         if offset:
...
@@ -200,8 +200,8 @@ class EfficientNetImageProcessorTest(ImageProcessingSavingTestMixin, unittest.TestCase):
         image_processor = self.image_processing_class(**self.image_processor_dict)

-        rescaled_image = image_processor.rescale(image, scale=1 / 255)
-        expected_image = (image * (2 / 255.0)).astype(np.float32) - 1
+        rescaled_image = image_processor.rescale(image, scale=1 / 127.5)
+        expected_image = (image * (1 / 127.5)).astype(np.float32) - 1
         self.assertTrue(np.allclose(rescaled_image, expected_image))

         rescaled_image = image_processor.rescale(image, scale=1 / 255, offset=False)
...
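The test update is numerically equivalent to what the old code computed: the removed `scale = scale * 2` doubling turned `1/255` into `2/255`, which is the same factor as `1/127.5`. A quick standalone check (plain NumPy, not part of the test suite; the identical change is made to the ViViT test further down):

```python
import numpy as np

# 2/255 and 1/127.5 are the same factor, so the old expectation
# (image * (2 / 255.0) - 1) and the new one (image * (1 / 127.5) - 1) agree.
assert np.isclose(2 / 255, 1 / 127.5)

image = np.random.randint(0, 256, size=(3, 8, 8), dtype=np.uint8)
old_expected = (image * (2 / 255.0)).astype(np.float32) - 1
new_expected = (image * (1 / 127.5)).astype(np.float32) - 1
assert np.allclose(old_expected, new_expected)
```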
@@ -538,7 +538,7 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
         logits = model(**inputs).logits

         expected_slice = torch.tensor(
-            [[-3.5020, -12.3281, 8.4453], [-5.1406, -11.9609, 7.8711], [-4.0430, -13.4375, 9.1172]],
+            [[-3.4727, -11.8203, 8.3828], [-5.1172, -11.3438, 7.7656], [-4.0742, -13.4688, 9.1953]],
             device=torch_device,
         )
         self.assertTrue(torch.allclose(logits[0, :3, :3].float(), expected_slice, atol=1e-3))
@@ -548,12 +548,12 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
         generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

         # fmt: off
-        expected_outputs = [ 2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 1623, 263, 19587, 4272, 11952, 29889]
+        expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889]
         # fmt: on
         self.assertEqual(outputs[0].tolist(), expected_outputs)
         self.assertEqual(
             generated_text,
-            "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving down a busy city street.",
+            "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.",
         )

     def test_inference_flant5_xl(self):
...
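The integration-test comparison itself is unchanged; only the hard-coded expected values move after the rescale update. As a generic illustration of the tolerance check (hypothetical stand-in tensors, not the real model outputs):

```python
import torch

# Stand-in values; the real test compares logits[0, :3, :3] from InstructBLIP
# against a hard-coded expected slice.
logits_slice = torch.tensor([[-3.4727, -11.8203, 8.3828]])
expected = torch.tensor([[-3.4730, -11.8200, 8.3830]])

# atol=1e-3 absorbs small numerical drift across hardware and kernel versions.
assert torch.allclose(logits_slice, expected, atol=1e-3)
```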
@@ -219,8 +219,8 @@ class VivitImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase):
         image_processor = self.image_processing_class(**self.image_processor_dict)

-        rescaled_image = image_processor.rescale(image, scale=1 / 255)
-        expected_image = (image * (2 / 255.0)).astype(np.float32) - 1
+        rescaled_image = image_processor.rescale(image, scale=1 / 127.5)
+        expected_image = (image * (1 / 127.5)).astype(np.float32) - 1
         self.assertTrue(np.allclose(rescaled_image, expected_image))

         rescaled_image = image_processor.rescale(image, scale=1 / 255, offset=False)
...