Unverified commit 00112c35, authored by TFUsers and committed by GitHub

Replace swish with silu (#8166)



* Replace swish with silu

* revert nn.silu to nn.swish due to older version

* simplify optimized silu conditional and fix format

* Update activations.py

* Update activations_tf.py

* Update modeling_flax_utils.py

* Update modeling_openai.py

* add swish testcase

* add pytorch swish testcase

* Add more robust python version check

* more formatting fixes
Co-authored-by: TFUsers <TFUsers@gmail.com>
parent cdc48ce9
@@ -2,6 +2,7 @@ import math

 import torch
 import torch.nn.functional as F
+from packaging import version

 from .utils import logging

@@ -9,29 +10,25 @@ from .utils import logging
 logger = logging.get_logger(__name__)


-def swish(x):
-    return x * torch.sigmoid(x)
-
-
 def _gelu_python(x):
     """
-    Original Implementation of the gelu activation function in Google Bert repo when initially created. For
-    information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
+    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
+    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
     torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
-    torch.nn.functional Also see https://arxiv.org/abs/1606.08415
+    torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


 def gelu_new(x):
     """
-    Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). Also see
-    https://arxiv.org/abs/1606.08415
+    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


-if torch.__version__ < "1.4.0":
+if version.parse(torch.__version__) < version.parse("1.4"):
     gelu = _gelu_python
 else:
     gelu = F.gelu

@@ -41,6 +38,23 @@ def gelu_fast(x):
     return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))


+def _silu_python(x):
+    """
+    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
+    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
+    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
+    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
+    later.
+    """
+    return x * torch.sigmoid(x)
+
+
+if version.parse(torch.__version__) < version.parse("1.7"):
+    silu = _silu_python
+else:
+    silu = F.silu
+
+
 def mish(x):
     return x * torch.tanh(torch.nn.functional.softplus(x))

@@ -51,7 +65,8 @@ def linear_act(x):
 ACT2FN = {
     "relu": F.relu,
-    "swish": swish,
+    "silu": silu,
+    "swish": silu,
     "gelu": gelu,
     "tanh": torch.tanh,
     "gelu_new": gelu_new,
...
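A minimal sketch (not the repository's own test) of the behaviour established above: "silu" resolves to the optimized torch.nn.functional.silu on PyTorch 1.7+ and to the pure-Python fallback otherwise, while the old "swish" key is kept as a backwards-compatible alias.

import torch

from transformers.activations import ACT2FN

x = torch.randn(8)

# "silu" and the legacy "swish" alias should both compute x * sigmoid(x),
# whichever branch of the version check was taken at import time.
assert torch.allclose(ACT2FN["silu"](x), x * torch.sigmoid(x), atol=1e-6)
assert torch.allclose(ACT2FN["silu"](x), ACT2FN["swish"](x))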
@@ -52,6 +52,7 @@ ACT2FN = {
     "gelu": tf.keras.layers.Activation(gelu),
     "relu": tf.keras.activations.relu,
     "swish": tf.keras.activations.swish,
+    "silu": tf.keras.activations.swish,
     "gelu_new": tf.keras.layers.Activation(gelu_new),
     "mish": tf.keras.layers.Activation(mish),
     "tanh": tf.keras.activations.tanh,
...
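On the TensorFlow side the new key is a plain alias: "silu" points at the same tf.keras.activations.swish callable as "swish". A short sketch, assuming a transformers install that contains this change:

import tensorflow as tf

from transformers.activations_tf import ACT2FN

x = tf.constant([-1.0, 0.0, 1.0])

# Both keys reference the same Keras activation, i.e. x * sigmoid(x).
assert ACT2FN["silu"] is ACT2FN["swish"]
tf.debugging.assert_near(ACT2FN["silu"](x), x * tf.sigmoid(x))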
@@ -61,7 +61,7 @@ class AlbertConfig(PretrainedConfig):
             The number of inner repetition of attention and ffn.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0):
...
@@ -59,7 +59,7 @@ class BartConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
...
@@ -74,7 +74,7 @@ class BertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
...
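These config hunks are documentation-only: each docstring now lists "silu" where it used to list "swish". As a hedged illustration (assuming a transformers release that includes this PR), an encoder config can request the activation by its new name:

from transformers import BertConfig, BertModel

# hidden_act="silu" now selects SiLU for the feed-forward layers;
# "swish" still works through the alias added in activations.py.
config = BertConfig(hidden_act="silu")
model = BertModel(config)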
@@ -40,7 +40,7 @@ class BertGenerationConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -56,7 +56,7 @@ class BlenderbotConfig(BartConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
...
@@ -52,7 +52,7 @@ class DebertaConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
             :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
...
@@ -66,7 +66,7 @@ class DistilBertConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         qa_dropout (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -55,7 +55,7 @@ class DPRConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -60,7 +60,7 @@ class ElectraConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -71,7 +71,7 @@ class FSMTConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
...
@@ -66,7 +66,7 @@ class FunnelConfig(PretrainedConfig):
             Inner dimension in the feed-forward blocks.
         hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -60,7 +60,7 @@ class GPT2Config(PretrainedConfig):
         n_inner (:obj:`int`, `optional`, defaults to None):
             Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
         activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
-            Activation function, to be selected in the list :obj:`["relu", "swish", "gelu", "tanh", "gelu_new"]`.
+            Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
...
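Decoder-style configs expose the same choice under the name activation_function; a brief sketch under the same assumption:

from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(activation_function="silu")  # previously documented as "swish"
model = GPT2LMHeadModel(config)                  # the MLP blocks look this string up in ACT2FN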
@@ -52,7 +52,7 @@ class LayoutLMConfig(BertConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -55,7 +55,7 @@ class LxmertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -50,7 +50,7 @@ class MarianConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
...
@@ -55,7 +55,7 @@ class MBartConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
...
@@ -48,7 +48,7 @@ class MobileBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
...
@@ -54,7 +54,7 @@ class OpenAIGPTConfig(PretrainedConfig):
             Number of attention heads for each attention layer in the Transformer encoder.
         afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
...
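OpenAI GPT names the argument afn; assuming modeling_openai.py received the matching update mentioned in the commit messages above, the new spelling is used the same way:

from transformers import OpenAIGPTConfig, OpenAIGPTModel

config = OpenAIGPTConfig(afn="silu")  # "silu" is now the documented string here too
model = OpenAIGPTModel(config)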