Unverified Commit 250ae9f7 authored by Pablo Montalvo, committed by GitHub

Paligemma - fix slow tests, add bf16 and f16 slow tests (#30851)

* fix slow tests, add bf16 and f16 slow tests

* few fixes

* [run-slow]paligemma

* add gate decorator

* [run-slow]paligemma

* add missing gating

* [run-slow]paligemma

* [run-slow]paligemma
parent ada86f97
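
At a glance, the change repoints the integration tests at the released, gated google/paligemma-3b-pt-224 checkpoint (gated access is enforced with @require_read_token instead of @require_bitsandbytes) and adds batched bf16 and f16 slow tests that load the checkpoint from its dtype-specific revision. Below is a minimal standalone sketch of the bf16 path the new tests exercise, assuming access to the gated checkpoint and a CUDA device; the f16 variant only swaps in revision="float16" and torch.float16. This is illustrative only, not part of the diff.

# Sketch only: mirrors test_small_model_integration_test_paligemma_batched_bf16 in the diff,
# assuming a Hugging Face read token for the gated checkpoint and an available CUDA device.
import requests
import torch
from PIL import Image

from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

model_id = "google/paligemma-3b-pt-224"
processor = PaliGemmaProcessor.from_pretrained(model_id)
# bf16 weights are pulled from the repo's dedicated "bfloat16" revision.
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id, revision="bfloat16", torch_dtype=torch.bfloat16
).to("cuda")

url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
image = Image.open(requests.get(url, stream=True).raw)

# Cast the floating-point inputs (pixel values) to the model dtype and move them to the GPU,
# as the tests do with .to(torch.bfloat16).to(torch_device).
inputs = (
    processor(text="answer en Where is the cow standing?", images=image, return_tensors="pt")
    .to(torch.bfloat16)
    .to("cuda")
)
output = model.generate(**inputs, max_new_tokens=20)
print(processor.decode(output[0], skip_special_tokens=True))  # the tests expect the answer "beach"
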
@@ -28,7 +28,7 @@ from transformers import (
     is_vision_available,
 )
 from transformers.testing_utils import (
-    require_bitsandbytes,
+    require_read_token,
     require_torch,
     require_torch_sdpa,
     slow,
@@ -260,60 +260,32 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
 @slow
 @require_torch
+@require_read_token
 class PaliGemmaForConditionalGenerationIntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = PaliGemmaProcessor.from_pretrained("gv-hf/PaliGemma-test-224px-hf")
+        self.processor = PaliGemmaProcessor.from_pretrained("google/paligemma-3b-pt-224")
 
     def tearDown(self):
         gc.collect()
         torch.cuda.empty_cache()
 
     @slow
-    @require_bitsandbytes
+    @require_read_token
     def test_small_model_integration_test(self):
         # Let' s make sure we test the preprocessing to replace what is used
-        model = PaliGemmaForConditionalGeneration.from_pretrained("gv-hf/PaliGemma-test-224px-hf")
+        model_id = "google/paligemma-3b-pt-224"
+        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
         prompt = ""
         image_file = (
             "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
         )
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
         inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt")
-        # fmt: off
-        EXPECTED_INPUT_IDS = torch.tensor([[256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000, 256000,
-            256000, 256000, 256000, 256000, 2, 108]])
-        # fmt: on
+        EXPECTED_INPUT_IDS = torch.tensor([[257152] * 256 + [2, 108]])
         self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
         output = model.generate(**inputs, max_new_tokens=20)
-        EXPECTED_DECODED_TEXT = "\ncow standing on the beach"  # fmt: skip
+        EXPECTED_DECODED_TEXT = "\ncow on the beach"  # fmt: skip
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
@@ -321,37 +293,55 @@ class PaliGemmaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
     @slow
-    @require_bitsandbytes
-    def test_small_model_integration_test_paligemma(self):
+    @require_read_token
+    def test_small_model_integration_test_paligemma_VQA(self):
         # Let' s make sure we test the preprocessing to replace what is used
-        model_id = "gv-hf/PaliGemma-test-224px-hf"
-        model = PaliGemmaForConditionalGeneration.from_pretrained("gv-hf/PaliGemma-test-224px-hf")
-        processor = PaliGemmaProcessor.from_pretrained(model_id)
+        model_id = "google/paligemma-3b-pt-224"
+        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
         prompt = "answer en Where is the cow standing?"
         image_file = (
             "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
         )
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = processor(text=prompt, images=raw_image, return_tensors="pt").to(torch.float16)
+        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(torch.float16)
         output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
         EXPECTED_DECODED_TEXT = "answer en Where is the cow standing?\nbeach"  # fmt: skip
         self.assertEqual(
-            processor.decode(output[0], skip_special_tokens=True),
+            self.processor.decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )
+
+    @slow
+    @require_read_token
+    def test_small_model_integration_test_paligemma_empty_prompt(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+        model_id = "google/paligemma-3b-pt-224"
+        model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
+        prompt = ""
+        image_file = (
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
+        )
+        raw_image = Image.open(requests.get(image_file, stream=True).raw)
+        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(torch.float16)
+        output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
+        EXPECTED_DECODED_TEXT = "\ncow on the beach"  # fmt: skip
+        self.assertEqual(
+            self.processor.decode(output[0], skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,
         )
 
     @slow
-    @require_bitsandbytes
+    @require_read_token
     def test_small_model_integration_test_paligemma_batched(self):
         # Let' s make sure we test the preprocessing to replace what is used
-        model_id = "gv-hf/PaliGemma-test-224px-hf"
+        model_id = "google/paligemma-3b-pt-224"
         model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
-        processor = PaliGemmaProcessor.from_pretrained(model_id)
 
         prompts = [
             "answer en Where is the cow standing?",
@@ -365,19 +355,23 @@ class PaliGemmaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
         image2 = image1
-        inputs = processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
+        inputs = self.processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
         output = model.generate(**inputs, max_new_tokens=20)
-        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow standing on the beach"]  # fmt: skip
+        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow on the beach"]  # fmt: skip
-        self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
+        self.assertEqual(self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
 
     @slow
-    @require_bitsandbytes
-    def test_small_model_integration_test_batch(self):
+    @require_torch
+    @require_read_token
+    def test_small_model_integration_test_paligemma_batched_bf16(self):
         # Let' s make sure we test the preprocessing to replace what is used
-        model = PaliGemmaForConditionalGeneration.from_pretrained("gv-hf/PaliGemma-test-224px-hf")
+        model_id = "google/paligemma-3b-pt-224"
+        model = PaliGemmaForConditionalGeneration.from_pretrained(
+            model_id, revision="bfloat16", torch_dtype=torch.bfloat16
+        ).to(torch_device)
         # The first batch is longer in terms of text, the second will be padded.
         prompts = [
             "answer en Where is the cow standing?",
@@ -391,24 +385,58 @@ class PaliGemmaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
         image2 = image1
-        inputs = self.processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
+        inputs = (
+            self.processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
+            .to(torch.bfloat16)
+            .to(torch_device)
+        )
+        output = model.generate(**inputs, max_new_tokens=20)
+        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow on the beach"]  # fmt: skip
+        self.assertEqual(self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
+
+    @slow
+    @require_torch
+    @require_read_token
+    def test_small_model_integration_test_paligemma_batched_f16(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+        model_id = "google/paligemma-3b-pt-224"
+        model = PaliGemmaForConditionalGeneration.from_pretrained(
+            model_id, revision="float16", torch_dtype=torch.float16
+        ).to(torch_device)
+        # The first batch is longer in terms of text, the second will be padded.
+        prompts = [
+            "answer en Where is the cow standing?",
+            "",
+        ]
+        image1 = Image.open(
+            requests.get(
+                "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
+                stream=True,
+            ).raw
+        )
+        image2 = image1
+        inputs = (
+            self.processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
+            .to(torch.float16)
+            .to(torch_device)
+        )
         output = model.generate(**inputs, max_new_tokens=20)
-        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow standing on the beach"]  # fmt: skip
+        EXPECTED_DECODED_TEXT = ["answer en Where is the cow standing?\nbeach", "\ncow on the beach"]  # fmt: skip
         self.assertEqual(self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
 
     @slow
-    @require_bitsandbytes
+    @require_read_token
    def test_paligemma_index_error_bug(self):
         # This is a reproducer of https://github.com/huggingface/transformers/pull/28032 and makes sure it does not happen anymore
         # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043 for
         # more details
-        model_id = "gv-hf/PaliGemma-test-224px-hf"
+        model_id = "google/paligemma-3b-pt-224"
         model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
-        processor = PaliGemmaProcessor.from_pretrained(model_id)
         # Simulate a super long prompt
         prompt = "\n" * 200
         image_file = (
@@ -416,7 +444,7 @@ class PaliGemmaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = processor(
+        inputs = self.processor(
             text=prompt,
             images=raw_image,
             return_tensors="pt",