Unverified Commit 8e64ba28 authored by Raushan Turganbay, committed by GitHub

Add tests for batching support (#29297)



* add tests for batching support

* Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update tests/test_modeling_common.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update tests/test_modeling_common.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update tests/test_modeling_common.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* fixes and comments

* use cosine distance for conv models

* skip mra model testing

* Update tests/models/vilt/test_modeling_vilt.py
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* finalize and make style

* check model type by input names

* Update tests/models/vilt/test_modeling_vilt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* fixed batch size for all testers

* Revert "fixed batch size for all testers"

This reverts commit 525f3a0a058f069fbda00352cf202b728d40df99.

* add batch_size for all testers

* dict from model output

* do not skip layoutlm

* bring back some code from git revert

* Update tests/test_modeling_common.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/test_modeling_common.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* clean-up

* where did minus go in tolerance

* make whisper happy

* deal with consequences of losing minus

* deal with consequences of losing minus

* maskformer needs its own test for happiness

* fix more models

* tag flaky CV models from Amy's approval

* make codestyle

---------
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 11163fff
tests/models/univnet/test_modeling_univnet.py

@@ -66,13 +66,13 @@ class UnivNetModelTester:
     def prepare_noise_sequence(self):
         generator = torch.manual_seed(self.seed)
-        noise_shape = (self.seq_length, self.in_channels)
+        noise_shape = (self.batch_size, self.seq_length, self.in_channels)
         # Create noise on CPU for reproducibility
         noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float)
         return noise_sequence

     def prepare_config_and_inputs(self):
-        spectrogram = floats_tensor([self.seq_length, self.num_mel_bins], scale=1.0)
+        spectrogram = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins], scale=1.0)
         noise_sequence = self.prepare_noise_sequence()
         noise_sequence = noise_sequence.to(spectrogram.device)
         config = self.get_config()

@@ -89,7 +89,7 @@ class UnivNetModelTester:
     def create_and_check_model(self, config, spectrogram, noise_sequence):
         model = UnivNetModel(config=config).to(torch_device).eval()
         result = model(spectrogram, noise_sequence)[0]
-        self.parent.assertEqual(result.shape, (1, self.seq_length * 256))
+        self.parent.assertEqual(result.shape, (self.batch_size, self.seq_length * 256))

     def prepare_config_and_inputs_for_common(self):
         config, spectrogram, noise_sequence = self.prepare_config_and_inputs()

@@ -182,8 +182,8 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
             model.to(torch_device)
             model.eval()

-            batched_spectrogram = inputs["input_features"].unsqueeze(0).repeat(2, 1, 1)
-            batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0).repeat(2, 1, 1)
+            batched_spectrogram = inputs["input_features"]
+            batched_noise_sequence = inputs["noise_sequence"]
             with torch.no_grad():
                 batched_outputs = model(
                     batched_spectrogram.to(torch_device),

@@ -205,36 +205,10 @@ class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
             model.eval()

             with torch.no_grad():
-                outputs = model(inputs["input_features"].to(torch_device), inputs["noise_sequence"].to(torch_device))[
-                    0
-                ]
+                outputs = model(
+                    inputs["input_features"][:1].to(torch_device), inputs["noise_sequence"][:1].to(torch_device)
+                )[0]

             self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1")

-    def test_unbatched_batched_outputs_consistency(self):
-        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            model.to(torch_device)
-            model.eval()
-
-            unbatched_spectrogram = inputs["input_features"].detach().clone()
-            unbatched_noise_sequence = inputs["noise_sequence"].detach().clone()
-            batched_spectrogram = inputs["input_features"].unsqueeze(0)
-            batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0)
-
-            with torch.no_grad():
-                unbatched_outputs = model(
-                    unbatched_spectrogram.to(torch_device),
-                    unbatched_noise_sequence.to(torch_device),
-                )[0]
-
-                batched_outputs = model(
-                    batched_spectrogram.to(torch_device),
-                    batched_noise_sequence.to(torch_device),
-                )[0]
-
-            torch.testing.assert_close(unbatched_outputs, batched_outputs)
-
     @require_torch_gpu
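A note on the `[:1]` change above: slicing keeps the batch dimension, which is exactly what the assertion on bsz = 1 relies on, whereas integer indexing with `[0]` would drop it. A minimal standalone sketch (the shapes are illustrative, mirroring UnivNet's (batch, seq_length, num_mel_bins) inputs):

    import torch

    inputs = torch.randn(3, 60, 64)  # (batch, seq_length, num_mel_bins)

    single_row = inputs[:1]  # slicing keeps the batch dim -> shape (1, 60, 64)
    dropped = inputs[0]      # integer indexing drops it   -> shape (60, 64)

    assert single_row.shape == (1, 60, 64)
    assert dropped.shape == (60, 64)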
tests/models/vilt/test_modeling_vilt.py

@@ -345,6 +345,12 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_determinism(self):
         pass

+    @unittest.skip(
+        "VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states"
+    )
+    def test_batching_equivalence(self):
+        pass
+
     @unittest.skip(
         reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic
                hidden states"""
tests/models/vit_hybrid/test_modeling_vit_hybrid.py

@@ -18,7 +18,7 @@
 import unittest

 from transformers import ViTHybridConfig
-from transformers.testing_utils import require_accelerate, require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_flaky, require_accelerate, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available

 from ...test_configuration_common import ConfigTester

@@ -221,6 +221,10 @@ class ViTHybridModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         model = ViTHybridModel.from_pretrained(model_name)
         self.assertIsNotNone(model)

+    @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+

 # We will verify our results on an image of cute cats
 def prepare_img():
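Unlike the VilT skip above, `is_flaky` reruns the test a few times instead of disabling it. As a rough sketch of the idea (a simplified stand-in, not the actual transformers.testing_utils implementation):

    import functools

    def retry_flaky(max_attempts=5, description=None):
        # simplified stand-in for transformers.testing_utils.is_flaky:
        # rerun a flaky test up to max_attempts times before reporting failure
        def decorator(test_func):
            @functools.wraps(test_func)
            def wrapper(*args, **kwargs):
                for attempt in range(max_attempts):
                    try:
                        return test_func(*args, **kwargs)
                    except Exception:
                        if attempt == max_attempts - 1:
                            raise  # out of retries: surface the real failure
            return wrapper
        return decorator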
tests/models/vit_mae/test_modeling_vit_mae.py

@@ -270,6 +270,10 @@ class ViTMAEModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_model_outputs_equivalence(self):
         pass

+    @unittest.skip(reason="ViTMAE returns a random mask + ids_restore in each forward pass")
+    def test_batching_equivalence(self):
+        pass
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
tests/models/vits/test_modeling_vits.py

@@ -216,6 +216,10 @@ class VitsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     def test_determinism(self):
         pass

+    @unittest.skip("VITS is not deterministic")
+    def test_batching_equivalence(self):
+        pass
+
     @is_flaky(
         max_attempts=3,
         description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range",
tests/models/whisper/test_modeling_whisper.py

@@ -190,7 +190,7 @@ class WhisperModelTester:
     def __init__(
         self,
         parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
         seq_length=60,
         is_training=True,
         use_labels=False,

@@ -1446,6 +1446,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
         model = WhisperForConditionalGeneration(config).eval().to(torch_device)
         input_features = input_dict["input_features"].to(torch_device)
+        input_features = input_features[:2]

         # len = 250 with num_input_frames = 60
         long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)

@@ -2626,7 +2627,7 @@ class WhisperEncoderModelTester:
     def __init__(
         self,
         parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
         seq_length=60,
         is_training=True,
         use_labels=True,

@@ -2997,7 +2998,7 @@ class WhisperStandaloneDecoderModelTester:
     def __init__(
         self,
         parent,
-        batch_size=2,
+        batch_size=3,  # need batch_size != num_hidden_layers
         is_training=True,
         use_labels=False,
         vocab_size=200,
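The "need batch_size != num_hidden_layers" comment reflects a testing pitfall: when the two values coincide, a check that only compares tensor shapes cannot distinguish a swapped batch/layer axis from the correct layout. An illustrative sketch (the variable names are hypothetical, not from the test suite):

    import torch

    batch_size, num_layers, hidden = 2, 2, 8
    states = torch.randn(batch_size, num_layers, hidden)
    buggy = states.transpose(0, 1)  # batch and layer axes accidentally swapped

    # with batch_size == num_layers the bug is invisible to a shape assertion...
    assert buggy.shape == states.shape

    # ...but with batch_size=3 != num_layers=2 it would be caught:
    states = torch.randn(3, num_layers, hidden)
    assert states.transpose(0, 1).shape != states.shape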
tests/models/x_clip/test_modeling_x_clip.py

@@ -479,6 +479,7 @@ class XCLIPModelTester:
         self.mit_hidden_size = mit_hidden_size
         self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs)
         self.vision_model_tester = XCLIPVisionModelTester(parent, **vision_kwargs)
+        self.batch_size = self.text_model_tester.batch_size  # need bs for batching_equivalence test
         self.is_training = is_training

     def prepare_config_and_inputs(self):
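Composite testers like XCLIPModelTester build their inputs from sub-testers, so the common test has no batch_size to read unless it is surfaced on the top-level tester, which is what the added line does. A hypothetical minimal form of the pattern (class and parameter names below are illustrative):

    class CompositeModelTester:
        # illustrative composite tester: ModelTesterMixin.test_batching_equivalence
        # reads self.model_tester.batch_size to slice one row out of each input
        def __init__(self, parent, text_tester, vision_tester):
            self.parent = parent
            self.text_model_tester = text_tester
            self.vision_model_tester = vision_tester
            self.batch_size = self.text_model_tester.batch_size  # mirror the sub-tester's batch size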
tests/test_modeling_common.py

@@ -99,6 +99,7 @@ if is_accelerate_available():

 if is_torch_available():
     import torch
+    import torch.nn.functional as F

     from safetensors.torch import load_file as safe_load_file
     from safetensors.torch import save_file as safe_save_file
     from torch import nn

@@ -693,6 +694,99 @@ class ModelTesterMixin:
             expected_arg_names = [model.main_input_name]
             self.assertListEqual(arg_names[:1], expected_arg_names)

+    def test_batching_equivalence(self):
+        """
+        Tests that the model supports batching and that the output is nearly the same for the same input in
+        different batch sizes.
+
+        (Why "nearly the same" not "exactly the same"? Batching uses different matmul shapes, which often leads to
+        different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535)
+        """
+
+        def get_tensor_equivalence_function(batched_input):
+            # models operating on continuous spaces have higher abs difference than LMs
+            # instead, we can rely on cos distance for image/speech models, similar to `diffusers`
+            if "input_ids" not in batched_input:
+                return lambda tensor1, tensor2: (
+                    1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38)
+                )
+            return lambda tensor1, tensor2: torch.max(torch.abs(tensor1 - tensor2))
+
+        def recursive_check(batched_object, single_row_object, model_name, key):
+            if isinstance(batched_object, (list, tuple)):
+                for batched_object_value, single_row_object_value in zip(batched_object, single_row_object):
+                    recursive_check(batched_object_value, single_row_object_value, model_name, key)
+            elif isinstance(batched_object, dict):
+                for batched_object_value, single_row_object_value in zip(
+                    batched_object.values(), single_row_object.values()
+                ):
+                    recursive_check(batched_object_value, single_row_object_value, model_name, key)
+            # do not compare returned loss (0-dim tensor) or codebook ids (int)
+            elif batched_object is None or isinstance(batched_object, int):
+                return
+            elif batched_object.dim() == 0:
+                return
+            else:
+                # indexing the first element does not always work
+                # e.g. models that output similarity scores of size (N, M) would need to index [0, 0]
+                slice_ids = [slice(0, index) for index in single_row_object.shape]
+                batched_row = batched_object[slice_ids]
+                self.assertFalse(
+                    torch.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}"
+                )
+                self.assertFalse(
+                    torch.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}"
+                )
+                self.assertFalse(
+                    torch.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}"
+                )
+                self.assertFalse(
+                    torch.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}"
+                )
+                self.assertTrue(
+                    (equivalence(batched_row, single_row_object)) <= 1e-03,
+                    msg=(
+                        f"Batched and Single row outputs are not equal in {model_name} for key={key}. "
+                        f"Difference={equivalence(batched_row, single_row_object)}."
+                    ),
+                )
+
+        config, batched_input = self.model_tester.prepare_config_and_inputs_for_common()
+        equivalence = get_tensor_equivalence_function(batched_input)
+
+        for model_class in self.all_model_classes:
+            config.output_hidden_states = True
+
+            model_name = model_class.__name__
+            if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"):
+                config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class)
+            batched_input_prepared = self._prepare_for_class(batched_input, model_class)
+            model = model_class(config).to(torch_device).eval()
+
+            batch_size = self.model_tester.batch_size
+            single_row_input = {}
+            for key, value in batched_input_prepared.items():
+                if isinstance(value, torch.Tensor) and value.shape[0] % batch_size == 0:
+                    # e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size
+                    single_batch_shape = value.shape[0] // batch_size
+                    single_row_input[key] = value[:single_batch_shape]
+                else:
+                    single_row_input[key] = value
+
+            with torch.no_grad():
+                model_batched_output = model(**batched_input_prepared)
+                model_row_output = model(**single_row_input)
+
+            if isinstance(model_batched_output, torch.Tensor):
+                model_batched_output = {"model_output": model_batched_output}
+                model_row_output = {"model_output": model_row_output}
+
+            for key in model_batched_output:
+                # DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan`
+                if hasattr(self, "zero_init_hidden_state") and "decoder_hidden_states" in key:
+                    model_batched_output[key] = model_batched_output[key][1:]
+                    model_row_output[key] = model_row_output[key][1:]
+                recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
+
     def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
         if not self.model_tester.is_training:
             return
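The core comparison added above can be exercised in isolation. A minimal sketch with a toy model (the model and shapes are illustrative; the two metrics mirror get_tensor_equivalence_function and the 1e-03 tolerance mirrors the test's):

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    model = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.GELU(), torch.nn.Linear(32, 16)).eval()

    batched = torch.randn(4, 16)
    with torch.no_grad():
        batched_out = model(batched)  # full batch
        row_out = model(batched[:1])  # same first row, batch size 1

    # LM-style metric: max absolute difference on the first row
    max_abs = torch.max(torch.abs(batched_out[:1] - row_out))

    # continuous-output metric: cosine distance on the flattened tensors
    cos_dist = 1.0 - F.cosine_similarity(
        batched_out[:1].float().flatten(), row_out.float().flatten(), dim=0, eps=1e-38
    )

    assert max_abs.item() <= 1e-3   # batching changes matmul shapes, so differences are tiny but nonzero
    assert cos_dist.item() <= 1e-3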