"vscode:/vscode.git/clone" did not exist on "1815d1865e61df3be1cbe37ee8b78f6017b0b1e2"
Unverified Commit bd90cda9 authored by Yih-Dar, committed by GitHub

CI with `num_hidden_layers=2` 🚀🚀🚀 (#25266)



* CI with layers=2

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent b28ebb26
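The hunks below all make the same kind of change: each model tester's default config is shrunk so CI instantiates a 2-layer model instead of a 3-to-6-layer one. As a rough, hypothetical sketch of the pattern (ToyModelTester and the BertConfig values here are illustrative only, not one of the files touched by this commit), a tester just wires such keyword defaults into a tiny config, builds a randomly initialized model, and checks its output shape:

```python
# Hypothetical sketch of the model-tester pattern used in these test files.
# Cutting num_hidden_layers shrinks the randomly initialized test model, so each
# CI test builds and runs faster while still exercising a multi-layer stack.
import torch
from transformers import BertConfig, BertModel


class ToyModelTester:
    def __init__(
        self,
        batch_size=2,
        seq_length=7,
        vocab_size=99,
        hidden_size=32,
        num_hidden_layers=2,  # the value this commit standardizes on
        num_attention_heads=4,
        intermediate_size=37,
    ):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size

    def get_config(self):
        # Tiny config built from the tester's defaults.
        return BertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
        )

    def create_and_check_model(self):
        config = self.get_config()
        model = BertModel(config).eval()
        input_ids = torch.randint(0, self.vocab_size, (self.batch_size, self.seq_length))
        with torch.no_grad():
            outputs = model(input_ids)
        assert outputs.last_hidden_state.shape == (
            self.batch_size,
            self.seq_length,
            self.hidden_size,
        )


if __name__ == "__main__":
    ToyModelTester().create_and_check_model()
```

Fewer hidden layers means fewer transformer blocks to initialize and run per test, which is where the CI speedup comes from.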
@@ -45,7 +45,7 @@ class GPTSanJapaneseTester:
         is_training=True,
         hidden_size=32,
         ext_size=42,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_ext_layers=2,
         num_attention_heads=4,
         num_experts=2,
...
@@ -356,7 +356,7 @@ class GroupViTTextModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -553,6 +553,10 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
     def test_model_common_attributes(self):
         pass
 
+    # overwritten from parent as this equivalent test needs a specific `seed` and hard to get a good one!
+    def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-5, name="outputs", attributes=None):
+        super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes)
+
     @is_pt_tf_cross_test
     def test_pt_tf_model_equivalence(self):
         import tensorflow as tf
...
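The GroupViT hunk above also adds a check_pt_tf_outputs override that pins the PT/TF equivalence tolerance to 2e-5. As a simplified, hypothetical sketch (the real helper in the transformers test mixin is recursive over nested model outputs; check_outputs_close below is an illustrative name, not the library's API), the comparison amounts to an elementwise max-difference check between the two frameworks' tensors:

```python
# Simplified sketch of a tolerance-based PT/TF output comparison; only the
# plain-tensor case is covered here.
import numpy as np
import tensorflow as tf
import torch


def check_outputs_close(tf_output, pt_output, tol=2e-5, name="outputs"):
    # Move both outputs to NumPy and compare the largest absolute difference
    # against the tolerance.
    tf_np = tf_output.numpy()
    pt_np = pt_output.detach().cpu().numpy()
    max_diff = np.max(np.abs(tf_np - pt_np))
    assert max_diff <= tol, f"{name}: max difference {max_diff} exceeds tolerance {tol}"


# Usage sketch with dummy tensors standing in for model outputs:
check_outputs_close(tf.constant([[0.1, 0.2]]), torch.tensor([[0.1, 0.2]]))
```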
@@ -71,7 +71,7 @@ class HubertModelTester:
         conv_bias=False,
         num_conv_pos_embeddings=16,
         num_conv_pos_embedding_groups=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         hidden_dropout_prob=0.1,  # this is most likely not correctly set yet
         intermediate_size=20,
...
@@ -62,7 +62,7 @@ class IBertModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -65,7 +65,7 @@ class ImageGPTModelTester:
         use_mc_token_ids=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -64,7 +64,7 @@ class InstructBlipVisionModelTester:
         is_training=True,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -219,7 +219,7 @@ class InstructBlipQFormerModelTester:
         vocab_size=99,
         hidden_size=32,
         projection_dim=32,
-        num_hidden_layers=6,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         dropout=0.1,
@@ -295,7 +295,7 @@ class InstructBlipTextModelDecoderOnlyTester:
         use_labels=False,
         vocab_size=99,
         hidden_size=16,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=4,
         hidden_act="gelu",
...
@@ -48,7 +48,7 @@ class LayoutLMModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -55,7 +55,7 @@ class LayoutLMv2ModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=36,
-        num_hidden_layers=3,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -63,7 +63,7 @@ class LayoutLMv3ModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=36,
-        num_hidden_layers=3,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -46,7 +46,7 @@ class LlamaModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -50,7 +50,7 @@ class LongformerModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -71,7 +71,7 @@ class FlaxLongT5ModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
...
@@ -59,7 +59,7 @@ class LongT5ModelTester:
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
@@ -916,7 +916,7 @@ class LongT5EncoderOnlyModelTester:
         # For common tests
         use_attention_mask=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=8,
...
@@ -61,7 +61,7 @@ class LukeModelTester:
         entity_vocab_size=10,
         entity_emb_size=6,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -661,7 +661,7 @@ class MarianStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
...
@@ -53,7 +53,7 @@ class MarkupLMModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -491,7 +491,7 @@ class MBartStandaloneDecoderModelTester:
         use_labels=True,
         decoder_start_token_id=2,
         decoder_ffn_dim=32,
-        decoder_layers=4,
+        decoder_layers=2,
         encoder_attention_heads=4,
         decoder_attention_heads=4,
         max_position_embeddings=30,
...
@@ -51,7 +51,7 @@ class MegaModelTester:
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         intermediate_size=37,
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
...
@@ -58,7 +58,7 @@ class MegatronBertModelTester:
         vocab_size=99,
         hidden_size=64,
         embedding_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
...
@@ -55,7 +55,7 @@ class MgpstrModelTester:
         num_bpe_labels=99,
         num_wordpiece_labels=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         mlp_ratio=4.0,
         patch_embeds_hidden_size=257,
...