Unverified commit 00112c35, authored by TFUsers and committed by GitHub

Replace swish with silu (#8166)



* Replace swish with silu

* revert nn.silu to nn.swish due to older version

* simplify optimized silu conditional and fix format

* Update activations.py

* Update activations_tf.py

* Update modeling_flax_utils.py

* Update modeling_openai.py

* add swish testcase

* add pytorch swish testcase

* Add more robust python version check

* more formatting fixes
Co-authored-by: default avatarTFUsers <TFUsers@gmail.com>
parent cdc48ce9
@@ -94,7 +94,7 @@ class PegasusConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -39,7 +39,7 @@ class ProphetNetConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
             the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.

@@ -80,7 +80,7 @@ class ReformerConfig(PretrainedConfig):
             :obj:`None` to ensure fully random rotations in local sensitive hashing scheme.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the feed forward layer in the residual attention
-            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         hidden_size (:obj:`int`, `optional`, defaults to 256):

@@ -49,7 +49,7 @@ class RetriBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):

@@ -50,7 +50,7 @@ class SqueezeBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):

@@ -54,7 +54,7 @@ class XLNetConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the If string, :obj:`"gelu"`, :obj:`"relu"`,
-            :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to untie relative position biases
         attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`):

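Taken together, the docstring hunks above only update the list of supported activation strings. As a quick illustration (hypothetical values, not part of this commit), a config can now be built with the new name, while "swish" is kept as an alias for backward compatibility:

    from transformers import PegasusConfig

    # "silu" is now an accepted activation string; "swish" keeps working as an alias.
    config = PegasusConfig(activation_function="silu")
    print(config.activation_function)  # -> "silu"
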
@@ -53,6 +53,7 @@ def gelu(x):
 ACT2FN = {
     "gelu": nn.gelu,
     "relu": nn.relu,
+    "silu": nn.swish,
     "swish": nn.swish,
     "gelu_new": gelu,
 }

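Per the "revert nn.silu to nn.swish due to older version" commit above, the Flax mapping points the new "silu" key at nn.swish rather than nn.silu, since older Flax releases only ship the swish name. Mathematically the two are the same function; a minimal standalone sketch in plain JAX (an illustration, not the library code):

    import jax.numpy as jnp

    def silu(x):
        # SiLU / swish: x * sigmoid(x)
        return x * (1.0 / (1.0 + jnp.exp(-x)))
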
@@ -27,7 +27,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .activations import gelu_new, swish
+from .activations import gelu_new, silu
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import (
     ModelOutput,

@@ -139,7 +139,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     return model
 
 
-ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new}
+ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}
 
 
 class Attention(nn.Module):

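The new import assumes that activations.py now defines silu. Based on the "simplify optimized silu conditional" and "Add more robust python version check" commits above, that definition plausibly prefers the native torch implementation when available and falls back to a pure-Python version otherwise. A hedged sketch (not the exact file contents; the 1.7 threshold is an assumption):

    import torch
    from packaging import version


    def _silu_python(x):
        # SiLU / swish: x * sigmoid(x)
        return x * torch.sigmoid(x)


    # torch.nn.functional.silu is assumed to exist only from torch 1.7.0 onward.
    if version.parse(torch.__version__) >= version.parse("1.7"):
        silu = torch.nn.functional.silu
    else:
        silu = _silu_python
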
@@ -20,6 +20,7 @@ class TestActivations(unittest.TestCase):
     def test_get_activation(self):
         get_activation("swish")
+        get_activation("silu")
         get_activation("relu")
         get_activation("tanh")
         get_activation("gelu_new")

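A quick way to sanity-check the aliasing outside the test suite (illustrative snippet, assuming both names resolve through the same activation registry):

    import torch
    from transformers.activations import get_activation

    x = torch.randn(8)
    assert torch.allclose(get_activation("silu")(x), get_activation("swish")(x))
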
@@ -12,6 +12,7 @@ if is_tf_available():
 class TestTFActivations(unittest.TestCase):
     def test_get_activation(self):
         get_tf_activation("swish")
+        get_tf_activation("silu")
         get_tf_activation("gelu")
         get_tf_activation("relu")
         get_tf_activation("tanh")

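And the TensorFlow-side equivalent (illustrative, assuming get_tf_activation keeps "swish" as an alias for the same function):

    import tensorflow as tf
    from transformers.activations_tf import get_tf_activation

    x = tf.constant([-2.0, 0.0, 2.0])
    tf.debugging.assert_near(get_tf_activation("silu")(x), get_tf_activation("swish")(x))
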