"tests/vscode:/vscode.git/clone" did not exist on "05f2290114def72a3e20643fd4359c4c2d3abafe"
Unverified Commit e03966e4 authored by Joao Gante, committed by GitHub

TF: XLA stable softmax (#16892)


Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 8246caf3
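
The commit is a mechanical substitution: every `tf.nn.softmax` call in the touched TF model files is replaced with `stable_softmax`, a thin wrapper exposed from `transformers.tf_utils`, and each file gains the corresponding import. The commit title points at an XLA-related numerical problem with `tf.nn.softmax` on heavily masked inputs; a common workaround is to perturb the logits by a negligible constant before the softmax. The snippet below is only a minimal sketch of such a wrapper, not a quote of the merged `tf_utils.py`; the exact signature and offset value are assumptions.

```python
import tensorflow as tf


def stable_softmax(logits: tf.Tensor, axis=None, name=None) -> tf.Tensor:
    """Drop-in replacement for tf.nn.softmax that behaves well under XLA.

    Adding a tiny constant to the logits is numerically a no-op after
    normalization, but steers XLA away from the code path that misbehaves
    on rows filled with large negative masking values.
    """
    # Hypothetical offset; the real wrapper may use a different (still tiny) value.
    return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name)
```
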
@@ -33,7 +33,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -271,7 +271,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
             ),
             lambda: attn_scores,
         )
-        attn_probs = tf.nn.softmax(attn_scores, axis=-1)
+        attn_probs = stable_softmax(attn_scores, axis=-1)
         # softmax sometimes inserts NaN if all positions are masked, replace them with 0
         # Make sure to create a mask with the proper shape:
@@ -886,7 +886,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
         )
         # compute global attn probs
-        global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)
+        global_attn_probs_float = stable_softmax(global_attn_scores, axis=-1)
         # apply layer head masking
         if layer_head_mask is not None:
@@ -1085,7 +1085,7 @@ class TFLEDDecoderAttention(tf.keras.layers.Layer):
                )
            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+        attn_weights = stable_softmax(attn_weights, axis=-1)
         if layer_head_mask is not None:
             if tf.executing_eagerly():
......
@@ -34,7 +34,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     MULTIPLE_CHOICE_DUMMY_INPUTS,
     ModelOutput,
@@ -800,7 +800,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
             ),
             lambda: attn_scores,
         )
-        attn_probs = tf.nn.softmax(attn_scores, axis=-1)
+        attn_probs = stable_softmax(attn_scores, axis=-1)
         # softmax sometimes inserts NaN if all positions are masked, replace them with 0
         # Make sure to create a mask with the proper shape:
@@ -1415,7 +1415,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
         )
         # compute global attn probs
-        global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)
+        global_attn_probs_float = stable_softmax(global_attn_scores, axis=-1)
         # apply layer head masking
         if layer_head_mask is not None:
......
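
The context lines kept in the LED and Longformer hunks ("softmax sometimes inserts NaN if all positions are masked, replace them with 0") describe the pattern surrounding the changed call. A compressed, hypothetical illustration of that pattern (simplified mask test and shapes, not the files' actual logic):

```python
import tensorflow as tf

# All key positions masked with a large negative score, as the attention code does.
attn_scores = tf.fill((1, 2, 4), -10000.0)

attn_probs = tf.nn.softmax(attn_scores, axis=-1)  # the call the diff swaps for stable_softmax

# Rows that were fully masked carry no information, so they are overwritten with zeros.
fully_masked = tf.reduce_all(attn_scores <= -10000.0, axis=-1, keepdims=True)
attn_probs = tf.where(fully_masked, tf.zeros_like(attn_probs), attn_probs)
```
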
@@ -22,6 +22,8 @@ from typing import Dict, Optional, Tuple
 import tensorflow as tf
 
+from transformers.tf_utils import stable_softmax
+
 from ...activations_tf import get_tf_activation
 from ...modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list, unpack_inputs
 from ...utils import (
@@ -302,7 +304,7 @@ class TFLxmertAttention(tf.keras.layers.Layer):
             attention_scores = attention_scores + attention_mask
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        attention_probs = stable_softmax(attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
......
@@ -39,7 +39,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
@@ -284,7 +284,7 @@ class TFMarianAttention(tf.keras.layers.Layer):
             attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
             attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+        attn_weights = stable_softmax(attn_weights, axis=-1)
         if layer_head_mask is not None:
             # The tf.debugging asserts are not compliant with XLA then they
......
@@ -39,7 +39,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
@@ -246,7 +246,7 @@ class TFMBartAttention(tf.keras.layers.Layer):
             attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
             attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+        attn_weights = stable_softmax(attn_weights, axis=-1)
         if layer_head_mask is not None:
             # The tf.debugging asserts are not compliant with XLA then they
......
@@ -46,7 +46,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     MULTIPLE_CHOICE_DUMMY_INPUTS,
     ModelOutput,
@@ -278,7 +278,7 @@ class TFMobileBertSelfAttention(tf.keras.layers.Layer):
             attention_scores = attention_scores + attention_mask
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        attention_probs = stable_softmax(attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
......
@@ -42,7 +42,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     MULTIPLE_CHOICE_DUMMY_INPUTS,
     add_code_sample_docstrings,
@@ -241,7 +241,7 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer):
         if attention_mask is not None:
             attention_scores = attention_scores + attention_mask
-        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
+        attention_probs = stable_softmax(attention_scores, axis=-1)
         attention_probs = self.dropout(attention_probs, training=training)
......
@@ -35,7 +35,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -111,7 +111,7 @@ class TFAttention(tf.keras.layers.Layer):
             attention_mask = tf.cast(attention_mask, dtype=w.dtype)
             w = w + attention_mask
-        w = tf.nn.softmax(w, axis=-1)
+        w = stable_softmax(w, axis=-1)
         w = self.attn_dropout(w, training=training)
         # Mask heads if we want to
......
@@ -39,7 +39,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
@@ -285,7 +285,7 @@ class TFPegasusAttention(tf.keras.layers.Layer):
             attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
             attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+        attn_weights = stable_softmax(attn_weights, axis=-1)
         if layer_head_mask is not None:
             # The tf.debugging asserts are not compliant with XLA then they
......
@@ -45,7 +45,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     DUMMY_INPUTS,
     MULTIPLE_CHOICE_DUMMY_INPUTS,
@@ -241,7 +241,7 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer):
             attention_scores = tf.add(attention_scores, attention_mask)
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
......
@@ -46,7 +46,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     DUMMY_INPUTS,
     MULTIPLE_CHOICE_DUMMY_INPUTS,
@@ -290,7 +290,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
             attention_scores = tf.add(attention_scores, attention_mask)
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
......
@@ -46,7 +46,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     MULTIPLE_CHOICE_DUMMY_INPUTS,
     add_code_sample_docstrings,
@@ -262,7 +262,7 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer):
             attention_scores = tf.add(attention_scores, attention_mask)
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
......
@@ -36,7 +36,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -348,7 +348,7 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer):
             attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
             attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+        attn_weights = stable_softmax(attn_weights, axis=-1)
         if layer_head_mask is not None:
             # The tf.debugging asserts are not compliant with XLA then they
......
@@ -41,7 +41,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     DUMMY_INPUTS,
     DUMMY_MASK,
@@ -398,7 +398,7 @@ class TFT5Attention(tf.keras.layers.Layer):
             position_bias = position_bias + mask  # (batch_size, n_heads, query_length, key_length)
         scores += position_bias
-        weights = tf.nn.softmax(scores, axis=-1)  # (batch_size, n_heads, query_length, key_length)
+        weights = stable_softmax(scores, axis=-1)  # (batch_size, n_heads, query_length, key_length)
         weights = self.dropout(weights, training=training)  # (batch_size, n_heads, query_length, key_length)
         # Mask heads if we want to
......
@@ -38,7 +38,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     ModelOutput,
     add_start_docstrings,
@@ -346,7 +346,7 @@ class TFTapasSelfAttention(tf.keras.layers.Layer):
             attention_scores = tf.add(attention_scores, attention_mask)
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -2216,7 +2216,7 @@ def _calculate_expected_result(
         aggregation_op_only_probs = gumbel_dist.sample()
     else:
         # <float32>[batch_size, num_aggregation_labels - 1]
-        aggregation_op_only_probs = tf.nn.softmax(logits_aggregation[:, 1:] / config.aggregation_temperature, axis=-1)
+        aggregation_op_only_probs = stable_softmax(logits_aggregation[:, 1:] / config.aggregation_temperature, axis=-1)
     all_results = tf.concat(
         [
             tf.expand_dims(sum_result, axis=1),
......
@@ -31,7 +31,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -236,7 +236,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
             attn_score = attn_score * (1.0 - attn_mask_t) - 1e30 * attn_mask_t
         # [qlen x klen x bsz x n_head]
-        attn_prob = tf.nn.softmax(attn_score, axis=1)
+        attn_prob = stable_softmax(attn_score, axis=1)
         attn_prob = self.dropatt(attn_prob, training=training)
         # Mask heads if we want to
......
@@ -32,7 +32,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_vit import ViTConfig
@@ -260,7 +260,7 @@ class TFViTSelfAttention(tf.keras.layers.Layer):
         attention_scores = tf.divide(attention_scores, dk)
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
......
@@ -38,7 +38,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import logging
 from .configuration_vit_mae import ViTMAEConfig
@@ -407,7 +407,7 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer):
         attention_scores = tf.divide(attention_scores, dk)
         # Normalize the attention scores to probabilities.
-        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
......
@@ -25,7 +25,7 @@ import tensorflow as tf
 from ...activations_tf import get_tf_activation
 from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
 from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...tokenization_utils_base import BatchEncoding
 from ...utils import (
     ModelOutput,
@@ -855,7 +855,7 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer):
             attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
             attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
-        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
+        attn_weights = stable_softmax(attn_weights, axis=-1)
         if layer_head_mask is not None:
             # The tf.debugging asserts are not compliant with XLA then they
......
@@ -44,7 +44,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list
+from ...tf_utils import shape_list, stable_softmax
 from ...utils import (
     MULTIPLE_CHOICE_DUMMY_INPUTS,
     ModelOutput,
@@ -187,7 +187,7 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer):
         # scores.masked_fill_(mask, -float('inf'))  # (bs, n_heads, qlen, klen)
         mask = tf.cast(mask, dtype=scores.dtype)
         scores = scores - 1e30 * (1.0 - mask)
-        weights = tf.nn.softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
+        weights = stable_softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
         weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
         # Mask heads if we want to
......
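
Since the change is a pure call-site substitution, a quick sanity check is to confirm that `stable_softmax` matches `tf.nn.softmax` and still traces under XLA. The values below are hypothetical test inputs, and the check assumes the wrapper behaves as sketched after the commit header; the import path is the one shown in the diff above.

```python
import numpy as np
import tensorflow as tf
from transformers.tf_utils import stable_softmax  # import path as it appears in the diff

# Attention-like scores where half of the key positions are masked out.
scores = tf.random.normal((2, 4, 8)) + tf.concat(
    [tf.zeros((2, 4, 4)), tf.fill((2, 4, 4), -1e9)], axis=-1
)

eager = stable_softmax(scores, axis=-1)
compiled = tf.function(lambda x: stable_softmax(x, axis=-1), jit_compile=True)(scores)

# Agreement with the plain op (within float32 tolerance) and with the XLA-compiled path.
print(np.allclose(eager.numpy(), tf.nn.softmax(scores, axis=-1).numpy(), atol=1e-6))
print(np.allclose(eager.numpy(), compiled.numpy(), atol=1e-6))
```
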