Unverified commit 00112c35, authored by TFUsers and committed by GitHub

Replace swish with silu (#8166)



* Replace swish with silu

* revert nn.silu to nn.swish due to older version

* simplify optimized silu conditional and fix format

* Update activations.py

* Update activations_tf.py

* Update modeling_flax_utils.py

* Update modeling_openai.py

* add swish testcase

* add pytorch swish testcase

* Add more robust python version check

* more formatting fixes
Co-authored-by: default avatarTFUsers <TFUsers@gmail.com>
parent cdc48ce9
@@ -94,7 +94,7 @@ class PegasusConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -39,7 +39,7 @@ class ProphetNetConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
             the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.

@@ -80,7 +80,7 @@ class ReformerConfig(PretrainedConfig):
             :obj:`None` to ensure fully random rotations in local sensitive hashing scheme.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the feed forward layer in the residual attention
-            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         hidden_size (:obj:`int`, `optional`, defaults to 256):

@@ -49,7 +49,7 @@ class RetriBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):

@@ -50,7 +50,7 @@ class SqueezeBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):

@@ -54,7 +54,7 @@ class XLNetConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the If string, :obj:`"gelu"`, :obj:`"relu"`,
-            :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to untie relative position biases
         attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`):

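Taken together, the docstring hunks above only update the list of supported activation strings. As a quick illustration (hypothetical values, not part of this commit), a config can now be built with the new name, while "swish" is kept as an alias for backward compatibility:

    from transformers import PegasusConfig

    # "silu" is now an accepted activation string; "swish" keeps working as an alias.
    config = PegasusConfig(activation_function="silu")
    print(config.activation_function)  # -> "silu"
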
@@ -53,6 +53,7 @@ def gelu(x):
 ACT2FN = {
     "gelu": nn.gelu,
     "relu": nn.relu,
+    "silu": nn.swish,
     "swish": nn.swish,
     "gelu_new": gelu,
 }

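Per the "revert nn.silu to nn.swish due to older version" commit above, the Flax mapping points the new "silu" key at nn.swish rather than nn.silu, since older Flax releases only ship the swish name. Mathematically the two are the same function; a minimal standalone sketch in plain JAX (an illustration, not the library code):

    import jax.numpy as jnp

    def silu(x):
        # SiLU / swish: x * sigmoid(x)
        return x * (1.0 / (1.0 + jnp.exp(-x)))
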
@@ -27,7 +27,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .activations import gelu_new, swish
+from .activations import gelu_new, silu
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import (
     ModelOutput,

@@ -139,7 +139,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     return model
 
 
-ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new}
+ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}
 
 
 class Attention(nn.Module):

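The new import assumes that activations.py now defines silu. Based on the "simplify optimized silu conditional" and "Add more robust python version check" commits above, that definition plausibly prefers the native torch implementation when available and falls back to a pure-Python version otherwise. A hedged sketch (not the exact file contents; the 1.7 threshold is an assumption):

    import torch
    from packaging import version


    def _silu_python(x):
        # SiLU / swish: x * sigmoid(x)
        return x * torch.sigmoid(x)


    # torch.nn.functional.silu is assumed to exist only from torch 1.7.0 onward.
    if version.parse(torch.__version__) >= version.parse("1.7"):
        silu = torch.nn.functional.silu
    else:
        silu = _silu_python
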
@@ -20,6 +20,7 @@ class TestActivations(unittest.TestCase):
     def test_get_activation(self):
         get_activation("swish")
+        get_activation("silu")
         get_activation("relu")
         get_activation("tanh")
         get_activation("gelu_new")

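A quick way to sanity-check the aliasing outside the test suite (illustrative snippet, assuming both names resolve through the same activation registry):

    import torch
    from transformers.activations import get_activation

    x = torch.randn(8)
    assert torch.allclose(get_activation("silu")(x), get_activation("swish")(x))
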
@@ -12,6 +12,7 @@ if is_tf_available():
 class TestTFActivations(unittest.TestCase):
     def test_get_activation(self):
         get_tf_activation("swish")
+        get_tf_activation("silu")
         get_tf_activation("gelu")
         get_tf_activation("relu")
         get_tf_activation("tanh")

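And the TensorFlow-side equivalent (illustrative, assuming get_tf_activation keeps "swish" as an alias for the same function):

    import tensorflow as tf
    from transformers.activations_tf import get_tf_activation

    x = tf.constant([-2.0, 0.0, 2.0])
    tf.debugging.assert_near(get_tf_activation("silu")(x), get_tf_activation("swish")(x))
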