Unverified Commit 1b5ab39c authored by Joao Gante, committed by GitHub

TF: check embeddings range (#19102)

parent cf6308ef
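
Why this change: tf.gather, which backs every TF embedding layer, does not validate positive out-of-range indices on GPU and silently returns zeros for them, so a too-large token id used to produce garbage embeddings with no error. Below is a minimal standalone sketch of the failure mode and of the guard this commit adds to each model (the toy vocab_size, weight table, and input_ids are illustrative, not taken from the commit):

    import tensorflow as tf

    # Toy embedding table: one row per token id. Values are illustrative.
    vocab_size, hidden_size = 10, 4
    weight = tf.random.normal((vocab_size, hidden_size))

    # Token id 12 is out of range for a 10-row table. On CPU, tf.gather raises;
    # on GPU it silently returns an all-zeros row for that position.
    input_ids = tf.constant([[1, 5, 12]])

    # The guard added by this commit: fail loudly before the gather runs.
    tf.debugging.assert_less(
        input_ids,
        tf.cast(vocab_size, dtype=input_ids.dtype),
        message=(
            "input_ids must be smaller than the embedding layer's input dimension (got"
            f" {tf.math.reduce_max(input_ids)} >= {vocab_size})"
        ),
    )
    inputs_embeds = tf.gather(params=weight, indices=input_ids)  # not reached for bad ids
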
@@ -227,6 +227,16 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer):
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]
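
A detail worth noting about the tf.cast in this pattern (an observation, not part of the commit): tf.debugging.assert_less needs both operands to share a dtype, and vocab_size is a plain Python int that would otherwise become an int32 tensor; casting it to input_ids.dtype keeps the check valid whether the ids arrive as int32 or int64. Reduced to its essentials:

    # input_ids may be int32 or int64 depending on how the inputs were built,
    # so the bound is cast to the same dtype before comparing.
    bound = tf.cast(vocab_size, dtype=input_ids.dtype)
    tf.debugging.assert_less(input_ids, bound)
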
@@ -772,6 +772,16 @@ class TFMarianEncoder(tf.keras.layers.Layer):
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input_shape)
@@ -967,6 +977,16 @@ class TFMarianDecoder(tf.keras.layers.Layer):
            positions = self.embed_positions(input_shape, position_ids=position_ids)

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        hidden_states = inputs_embeds
@@ -757,6 +757,16 @@ class TFMBartEncoder(tf.keras.layers.Layer):
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input_shape)
@@ -959,6 +969,16 @@ class TFMBartDecoder(tf.keras.layers.Layer):
            positions = self.embed_positions(input_shape, position_ids=position_ids)

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        hidden_states = inputs_embeds
@@ -214,6 +214,16 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]
@@ -145,6 +145,16 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer):
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]
@@ -298,10 +298,30 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
        position_embeds = tf.gather(self.positions_embed, position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                token_type_ids,
                tf.cast(self.vocab_size, dtype=token_type_ids.dtype),
                message=(
                    "token_type_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(token_type_ids)} >= {self.vocab_size})"
                ),
            )
            token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
        else:
            token_type_embeds = 0
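
TFOpenAIGPTMainLayer is the only model in this commit that also bounds-checks token_type_ids: in this architecture the token type embeddings are looked up in the same tokens_embed table as the word embeddings, so the vocab_size bound applies to both id tensors:

    # Both lookups share one embedding matrix, hence the same upper bound.
    inputs_embeds = self.tokens_embed(input_ids, mode="embedding")
    token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding")
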
@@ -632,6 +632,16 @@ class TFOPTDecoder(tf.keras.layers.Layer):
        past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids)

        if attention_mask is None:
@@ -775,6 +775,16 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input_shape)
@@ -973,6 +983,16 @@ class TFPegasusDecoder(tf.keras.layers.Layer):
            positions = self.embed_positions(input_shape, position_ids=position_ids)

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        hidden_states = inputs_embeds
@@ -124,6 +124,16 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer):
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]
@@ -146,6 +146,16 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]
@@ -177,6 +177,16 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer):
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]
@@ -1022,6 +1022,16 @@ class TFSpeech2TextDecoder(tf.keras.layers.Layer):
        past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        else:
            inputs_embeds = inputs_embeds
@@ -681,6 +681,16 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape
@@ -533,6 +533,16 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length)
@@ -440,6 +440,16 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
        # embeddings
        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embeddings.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embeddings.vocab_size})"
                ),
            )
            inputs_embeds = self.embeddings(input_ids)

        tensor = inputs_embeds + tf.gather(self.position_embeddings, position_ids)
@@ -680,6 +680,16 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        if inputs_embeds is not None:
            word_emb_k = inputs_embeds
        else:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.word_embedding.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.word_embedding.vocab_size})"
                ),
            )
            word_emb_k = self.word_embedding(input_ids)
        output_h = self.dropout(word_emb_k, training=training)
        if target_mapping is not None:
@@ -127,6 +127,16 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
        assert not (input_ids is None and inputs_embeds is None)

        if input_ids is not None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
                ),
            )
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]
@@ -2305,6 +2315,16 @@ class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        embed_pos = self.embed_positions(input_shape)
@@ -2494,6 +2514,16 @@ class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer):
        positions = self.embed_positions(input_shape, past_key_values_length)

        if inputs_embeds is None:
            # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
            # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
            tf.debugging.assert_less(
                input_ids,
                tf.cast(self.embed_tokens.vocab_size, dtype=input_ids.dtype),
                message=(
                    "input_ids must be smaller than the embedding layer's input dimension (got"
                    f" {tf.math.reduce_max(input_ids)} >= {self.embed_tokens.vocab_size})"
                ),
            )
            inputs_embeds = self.embed_tokens(input_ids)

        hidden_states = inputs_embeds
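
Net effect for users: with these checks in place, an out-of-range token id fails fast with an InvalidArgumentError carrying the message defined above, instead of silently embedding zeros on GPU. A small usage sketch (TFRobertaModel picked arbitrarily among the models touched here; roberta-base has vocab_size 50265):

    import tensorflow as tf
    from transformers import TFRobertaModel

    model = TFRobertaModel.from_pretrained("roberta-base")

    # 50265 is one past the largest valid token id for roberta-base.
    bad_ids = tf.constant([[0, 50265]])
    try:
        model(bad_ids)
    except tf.errors.InvalidArgumentError as err:
        print(err.message)  # input_ids must be smaller than the embedding layer's ...

In eager mode the assert raises immediately; inside a tf.function it is recorded as a graph op and still fires when the graph runs.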