Allow saved_model export of TFCLIPModel in save_pretrained (#16886)

* CLIP Serving * Add type hints per code review * Use black, flake8, and isort * Update src/transformers/models/clip/modeling_tf_clip.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Rollback serving_output and add TODO * Remove irrelevant portions of failing tests * Revert "Rollback serving_output and add TODO" This reverts commit a4abfa6ba3b7875a13538dbc2ddc4eb17dfcca8d. * Rollback to original test/serving_output * Fix unused var * Apply suggestions from code review * Update formatting with black * Fix style again from rebase * Update tests/models/clip/test_modeling_tf_clip.py Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> Co-authored-by: Sean Moriarity <sean.l.moriarity.mil@army.mil> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>

Allow saved_model export of TFCLIPModel in save_pretrained (#16886)
* CLIP Serving * Add type hints per code review * Use black, flake8, and isort * Update src/transformers/models/clip/modeling_tf_clip.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Rollback serving_output and add TODO * Remove irrelevant portions of failing tests * Revert "Rollback serving_output and add TODO" This reverts commit a4abfa6ba3b7875a13538dbc2ddc4eb17dfcca8d. * Rollback to original test/serving_output * Fix unused var * Apply suggestions from code review * Update formatting with black * Fix style again from rebase * Update tests/models/clip/test_modeling_tf_clip.py Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> Co-authored-by: Sean Moriarity <sean.l.moriarity.mil@army.mil> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
279bc584 · Sean Moriarity · GitHub · ef203902 · 279bc584 · 279bc584
Unverified Commit 279bc584 authored May 04, 2022 by Sean Moriarity Committed by GitHub May 04, 2022
Showing with 132 additions and 5 deletions

src/transformers/models/clip/modeling_tf_clip.py src/transformers/models/clip/modeling_tf_clip.py +23 -5

tests/models/clip/test_modeling_tf_clip.py tests/models/clip/test_modeling_tf_clip.py +109 -0

No files found.
--- a/src/transformers/models/clip/modeling_tf_clip.py
+++ b/src/transformers/models/clip/modeling_tf_clip.py
@@ -551,11 +551,14 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
        )

    def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
-
-        diag = tf.constant(0.0, shape=(seq_length,), dtype=dtype)
+        # It is possible with an unspecified sequence length for seq_length to be
+        # a runtime value, which is unsupported by tf.constant. Per the TensorFlow
+        # docs, tf.fill can handle runtime dynamic shapes:
+        # https://www.tensorflow.org/api_docs/python/tf/fill
+        diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)

        # set an additive 2D attention mask with all places being masked
-        to_mask = tf.constant(-10000.0, shape=(seq_length, seq_length), dtype=dtype)
+        to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)

        # set diagonal & lower triangular parts to 0 (i.e. the places not to be masked)
        # TIP: think the 2D matrix as the space of (query_seq, key_seq)
@@ -1082,6 +1085,18 @@ class TFCLIPTextModel(TFCLIPPreTrainedModel):

        return outputs

+    @tf.function(
+        input_signature=[
+            {
+                "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
+                "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
+            }
+        ]
+    )
+    def serving(self, inputs: Dict[str, tf.Tensor]) -> TFBaseModelOutputWithPooling:
+        output = self.call(inputs)
+        return self.serving_output(output)
+
    def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
@@ -1123,7 +1138,7 @@ class TFCLIPVisionModel(TFCLIPPreTrainedModel):
            }
        ]
    )
-    def serving(self, inputs):
+    def serving(self, inputs: Dict[str, tf.Tensor]) -> TFBaseModelOutputWithPooling:
        """
        Method used for serving the model.

@@ -1226,7 +1241,7 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
            }
        ]
    )
-    def serving(self, inputs):
+    def serving(self, inputs: Dict[str, tf.Tensor]) -> TFCLIPOutput:
        """
        Method used for serving the model.

@@ -1375,4 +1390,7 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
        return outputs

    def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
+        # TODO: As is this currently fails with saved_model=True, because
+        # TensorFlow cannot trace through nested dataclasses. Reference:
+        # https://github.com/huggingface/transformers/pull/16886
        return output
--- a/tests/models/clip/test_modeling_tf_clip.py
+++ b/tests/models/clip/test_modeling_tf_clip.py
@@ -256,6 +256,62 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
            model = TFCLIPVisionModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

+    @slow
+    def test_saved_model_creation_extended(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        if hasattr(config, "use_cache"):
+            config.use_cache = True
+
+        # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
+        image_size = (self.model_tester.image_size, self.model_tester.image_size)
+        patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        seq_len = num_patches + 1
+
+        for model_class in self.all_model_classes:
+            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+            model = model_class(config)
+            num_out = len(model(class_inputs_dict))
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname, saved_model=True)
+                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
+                model = tf.keras.models.load_model(saved_model_dir)
+                outputs = model(class_inputs_dict)
+                output_hidden_states = outputs["hidden_states"]
+                output_attentions = outputs["attentions"]
+
+                # Check num outputs
+                self.assertEqual(len(outputs), num_out)
+
+                # Check num layers
+                expected_num_layers = getattr(
+                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+                )
+
+                self.assertEqual(len(output_hidden_states), expected_num_layers)
+                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
+
+                # Check attention outputs
+                image_size = (self.model_tester.image_size, self.model_tester.image_size)
+                patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
+                num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+                seq_len = num_patches + 1
+
+                self.assertListEqual(
+                    list(output_attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, seq_len, seq_len],
+                )
+
+                # Check hidden states
+                self.assertListEqual(
+                    list(output_hidden_states[0].shape[-2:]),
+                    [seq_len, self.model_tester.hidden_size],
+                )
+

 class TFCLIPTextModelTester:
    def __init__(
@@ -367,6 +423,54 @@ class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
            model = TFCLIPTextModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

+    @slow
+    def test_saved_model_creation_extended(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.output_hidden_states = True
+        config.output_attentions = True
+
+        if hasattr(config, "use_cache"):
+            config.use_cache = True
+
+        for model_class in self.all_model_classes:
+            class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+            model = model_class(config)
+            num_out = len(model(class_inputs_dict))
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname, saved_model=True)
+                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
+                model = tf.keras.models.load_model(saved_model_dir)
+                outputs = model(class_inputs_dict)
+                output_hidden_states = outputs["hidden_states"]
+                output_attentions = outputs["attentions"]
+
+                # Check number of outputs
+                self.assertEqual(len(outputs), num_out)
+
+                # Check number of layers
+                expected_num_layers = getattr(
+                    self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+                )
+
+                # Check hidden states
+                self.assertEqual(len(output_hidden_states), expected_num_layers)
+                self.assertListEqual(
+                    list(output_hidden_states[0].shape[-2:]),
+                    [self.model_tester.seq_length, self.model_tester.hidden_size],
+                )
+
+                # Check attention outputs
+                self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
+
+                seq_length = self.model_tester.seq_length
+                key_length = getattr(self.model_tester, "key_length", seq_length)
+
+                self.assertListEqual(
+                    list(output_attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, seq_length, key_length],
+                )
+

 class TFCLIPModelTester:
    def __init__(self, parent, is_training=True):
@@ -502,6 +606,11 @@ class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase):
            model = TFCLIPModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

+    @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
+    @slow
+    def test_saved_model_creation_extended(self):
+        pass
+

 # We will verify our results on an image of cute cats
 def prepare_img():