Unverified Commit bd90cda9 authored by Yih-Dar, committed by GitHub

CI with `num_hidden_layers=2` 🚀🚀🚀 (#25266)



* CI with layers=2

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent b28ebb26
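
Note (added for context, not part of the commit): every tester hunk below applies the same one-line change, shrinking the test models from 5 hidden layers to 2 so CI builds smaller models and runs faster. A minimal sketch of the kind of tiny model these testers construct, using `BertConfig`/`BertModel` only as a stand-in and the default values visible in the hunks:

```python
# Sketch only: the real testers instantiate their own model classes, not BERT.
import torch
from transformers import BertConfig, BertModel

config = BertConfig(
    vocab_size=99,
    hidden_size=32,
    num_hidden_layers=2,   # was 5 before this commit
    num_attention_heads=4,
    intermediate_size=37,
)
model = BertModel(config)

# Fewer layers -> fewer parameters and faster forward passes in the tests.
print(sum(p.numel() for p in model.parameters()))
```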
......@@ -57,7 +57,7 @@ class XLMModelTester:
vocab_size=99,
n_special=0,
hidden_size=32,
-num_hidden_layers=5,
+num_hidden_layers=2,
num_attention_heads=4,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
......
......@@ -55,7 +55,7 @@ class XLMRobertaXLModelTester:
use_labels=True,
vocab_size=99,
hidden_size=32,
-num_hidden_layers=5,
+num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
......
......@@ -56,7 +56,7 @@ class XLNetModelTester:
hidden_size=32,
num_attention_heads=4,
d_inner=128,
-num_hidden_layers=5,
+num_hidden_layers=2,
type_sequence_label_size=2,
untie_r=True,
bi_data=False,
......
......@@ -51,7 +51,7 @@ class XmodModelTester:
use_labels=True,
vocab_size=99,
hidden_size=32,
-num_hidden_layers=5,
+num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
......
......@@ -52,7 +52,7 @@ class YolosModelTester:
is_training=True,
use_labels=True,
hidden_size=32,
-num_hidden_layers=5,
+num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
......
......@@ -51,7 +51,7 @@ class YosoModelTester:
use_labels=True,
vocab_size=99,
hidden_size=32,
-num_hidden_layers=5,
+num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
......
......@@ -1017,7 +1017,8 @@ class ModelTesterMixin:
attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], 1)
-self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+# TODO: To have this check, we will need at least 3 layers. Do we really need it?
+# self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
def test_head_pruning_save_load_from_pretrained(self):
......@@ -1053,7 +1054,8 @@ class ModelTesterMixin:
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], 1)
-self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+# TODO: To have this check, we will need at least 3 layers. Do we really need it?
+# self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
def test_head_pruning_save_load_from_config_init(self):
......@@ -1087,7 +1089,8 @@ class ModelTesterMixin:
attentions = outputs[-1]
self.assertEqual(attentions[0].shape[-3], 1)
-self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+# TODO: To have this check, we will need at least 3 layers. Do we really need it?
+# self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
def test_head_pruning_integration(self):
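
Note (added for context): the commented-out assertion above checked that a layer whose heads were not pruned still exposes all `num_attention_heads` heads. With only 2 layers, index 1 is also the last layer, which does have a pruned head, so there is no untouched middle layer left to assert on. A sketch of that situation, again with `BertModel` as a stand-in for the model classes exercised by the mixin:

```python
# Sketch only: roughly mirrors the test setup, not the exact tester code.
import torch
from transformers import BertConfig, BertModel

config = BertConfig(
    vocab_size=99, hidden_size=32, num_hidden_layers=2,
    num_attention_heads=4, intermediate_size=37,
)
model = BertModel(config)
model.eval()

# Keep a single head in layer 0, drop head 0 in the last layer.
model.prune_heads({0: [1, 2, 3], 1: [0]})

with torch.no_grad():
    outputs = model(torch.randint(0, 99, (1, 7)), output_attentions=True)

attentions = outputs.attentions
print(attentions[0].shape[-3])   # 1  -> only one head left in layer 0
print(attentions[-1].shape[-3])  # 3  -> num_attention_heads - 1 in the last layer
# With 2 layers, attentions[1] *is* attentions[-1], so the old check on an
# unpruned middle layer would need num_hidden_layers >= 3 to be meaningful.
```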
......@@ -1106,7 +1109,7 @@ class ModelTesterMixin:
inputs_dict["output_attentions"] = True
config.output_hidden_states = False
-heads_to_prune = {0: [0], 1: [1, 2]}
+heads_to_prune = {1: [1, 2]}
config.pruned_heads = heads_to_prune
model = model_class(config=config)
......@@ -1117,10 +1120,8 @@ class ModelTesterMixin:
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs[-1]
-self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0)
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
with tempfile.TemporaryDirectory() as temp_dir_name:
model.save_pretrained(temp_dir_name)
......@@ -1131,12 +1132,10 @@ class ModelTesterMixin:
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs[-1]
-self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0)
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
-self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-heads_to_prune = {0: [0], 2: [1, 2]}
+heads_to_prune = {0: [0], 1: [1, 2]}
model.prune_heads(heads_to_prune)
with torch.no_grad():
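
Note (added for context): the hunk above re-checks the same attention shapes after a `save_pretrained` / `from_pretrained` round trip. Pruned heads are recorded in `config.pruned_heads`, so they are re-applied when the model is reloaded. A sketch with the same stand-in model:

```python
# Sketch only: pruning recorded in the config survives save/load.
import tempfile
import torch
from transformers import BertConfig, BertModel

config = BertConfig(
    vocab_size=99, hidden_size=32, num_hidden_layers=2,
    num_attention_heads=4, intermediate_size=37,
)
config.pruned_heads = {1: [1, 2]}   # pruned while the model is built
model = BertModel(config)

with tempfile.TemporaryDirectory() as tmp:
    model.save_pretrained(tmp)
    reloaded = BertModel.from_pretrained(tmp)

with torch.no_grad():
    outputs = reloaded(torch.randint(0, 99, (1, 7)), output_attentions=True)

print(outputs.attentions[0].shape[-3])  # 4 -> num_attention_heads - 0
print(outputs.attentions[1].shape[-3])  # 2 -> num_attention_heads - 2
```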
......@@ -1145,10 +1144,8 @@ class ModelTesterMixin:
self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
-self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
-self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
-self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
+self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2]})
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
......
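
Note (added for context): the last two hunks exercise cumulative pruning: heads pruned at construction time via `config.pruned_heads` and heads pruned later with `model.prune_heads(...)` are merged per layer, and already-pruned heads are simply ignored. With 2 layers, layers 2 and 3 no longer exist, hence the dropped assertions. A sketch of the final state, with the same stand-in model:

```python
# Sketch only: cumulative pruning across construction time and prune_heads().
import torch
from transformers import BertConfig, BertModel

config = BertConfig(
    vocab_size=99, hidden_size=32, num_hidden_layers=2,
    num_attention_heads=4, intermediate_size=37,
)
config.pruned_heads = {1: [1, 2]}          # applied while building the model
model = BertModel(config)
model.prune_heads({0: [0], 1: [1, 2]})     # layer-1 heads are already gone; only layer 0 changes

with torch.no_grad():
    outputs = model(torch.randint(0, 99, (1, 7)), output_attentions=True)

print(outputs.attentions[0].shape[-3])  # 3 -> num_attention_heads - 1
print(outputs.attentions[1].shape[-3])  # 2 -> num_attention_heads - 2
print(model.config.pruned_heads)        # {0: [0], 1: [1, 2]} (key order may differ)
```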