"comfy/vscode:/vscode.git/clone" did not exist on "8ea165dd1ef877f58f3710f31ce43f27e0f739ab"
Unverified commit 00112c35, authored by TFUsers, committed by GitHub

Replace swish with silu (#8166)



* Replace swish with silu

* revert nn.silu to nn.swish due to older version

* simplify optimized silu conditional and fix format

* Update activations.py

* Update activations_tf.py

* Update modeling_flax_utils.py

* Update modeling_openai.py

* add swish testcase

* add pytorch swish testcase

* Add more robust python version check

* more formatting fixes
Co-authored-by: TFUsers <TFUsers@gmail.com>
parent cdc48ce9
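Taken together, the bullets above describe one substantive change: `silu` becomes the canonical activation name, `swish` is kept as an alias, and the optimized `torch.nn.functional.silu` is used only where available. A minimal sketch of the version-gated conditional the commit message alludes to (helper names here are illustrative, not copied from the repository):

```python
import torch
from packaging import version


def _silu_python(x):
    # pure-Python fallback: silu(x) = x * sigmoid(x), a.k.a. "swish"
    return x * torch.sigmoid(x)


# torch.nn.functional.silu only exists from PyTorch 1.7 on, so older
# installs fall back to the Python implementation
if version.parse(torch.__version__) >= version.parse("1.7"):
    silu = torch.nn.functional.silu
else:
    silu = _silu_python

swish = silu  # legacy name, same function
```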
@@ -94,7 +94,7 @@ class PegasusConfig(BartConfig):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
@@ -39,7 +39,7 @@ class ProphetNetConfig(PretrainedConfig):
The dropout ratio for activations inside the fully connected layer.
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
-:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
vocab_size (:obj:`int`, `optional`, defaults to 30522):
Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
@@ -80,7 +80,7 @@ class ReformerConfig(PretrainedConfig):
:obj:`None` to ensure fully random rotations in local sensitive hashing scheme.
hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
The non-linear activation function (function or string) in the feed forward layer in the residual attention
-block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
hidden_size (:obj:`int`, `optional`, defaults to 256):
@@ -49,7 +49,7 @@ class RetriBertConfig(PretrainedConfig):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
-:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
@@ -50,7 +50,7 @@ class SqueezeBertConfig(PretrainedConfig):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string,
-:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
@@ -54,7 +54,7 @@ class XLNetConfig(PretrainedConfig):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the If string, :obj:`"gelu"`, :obj:`"relu"`,
:obj:`"swish"` and :obj:`"gelu_new"` are supported.
:obj:`"silu"` and :obj:`"gelu_new"` are supported.
untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to untie relative position biases
attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`):
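Every hunk above makes the same documentation change: the advertised activation string switches from `"swish"` to `"silu"`. The user-facing effect is a rename only; a hedged usage sketch (assuming the legacy string remains accepted, as the alias tables below suggest):

```python
from transformers import PegasusConfig

# "silu" is the documented spelling from this commit on; "swish" should
# still resolve to the same function via the aliases kept in ACT2FN
config = PegasusConfig(activation_function="silu")
legacy_config = PegasusConfig(activation_function="swish")
```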
@@ -53,6 +53,7 @@ def gelu(x):
ACT2FN = {
    "gelu": nn.gelu,
    "relu": nn.relu,
+   "silu": nn.swish,
    "swish": nn.swish,
    "gelu_new": gelu,
}
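The Flax table maps the new `"silu"` key to `nn.swish` rather than `nn.silu` because, per the commit message, older flax releases only ship `swish`; the two names denote the same function. A standalone sketch of the pattern, with the activation written out explicitly:

```python
import jax.numpy as jnp


def swish(x):
    # silu/swish are two names for x * sigmoid(x) = x / (1 + exp(-x))
    return x / (1.0 + jnp.exp(-x))


# point both keys at the same callable so configs written with either
# spelling resolve correctly
ACT2FN = {"silu": swish, "swish": swish}

assert ACT2FN["silu"] is ACT2FN["swish"]
```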
@@ -27,7 +27,7 @@ import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
-from .activations import gelu_new, swish
+from .activations import gelu_new, silu
from .configuration_openai import OpenAIGPTConfig
from .file_utils import (
ModelOutput,
@@ -139,7 +139,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
return model
ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new}
ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}
class Attention(nn.Module):
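On the PyTorch side, the import and the `ACT_FNS` table switch to `silu`, while `"swish"` stays in the table as an alias for the same object, so configs saved with the old string keep loading. A self-contained sketch of that aliasing (the `gelu_new` constant is the usual tanh approximation, not copied from the diff):

```python
import torch
import torch.nn as nn


def silu(x):
    # x * sigmoid(x); previously exported under the name `swish`
    return x * torch.sigmoid(x)


def gelu_new(x):
    # GPT-style tanh approximation of GELU
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608 * (x + 0.044715 * x**3)))


ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}

# both spellings resolve to the very same object
assert ACT_FNS["swish"] is ACT_FNS["silu"]
```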
@@ -20,6 +20,7 @@ class TestActivations(unittest.TestCase):
    def test_get_activation(self):
        get_activation("swish")
+       get_activation("silu")
        get_activation("relu")
        get_activation("tanh")
        get_activation("gelu_new")
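The new test only checks that `get_activation("silu")` resolves without raising. A slightly stronger check one could add is numerical parity between the two names (a sketch, not part of the commit):

```python
import torch
from transformers.activations import get_activation

x = torch.randn(8)
assert torch.allclose(get_activation("silu")(x), get_activation("swish")(x))
```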
@@ -12,6 +12,7 @@ if is_tf_available():
class TestTFActivations(unittest.TestCase):
    def test_get_activation(self):
        get_tf_activation("swish")
+       get_tf_activation("silu")
        get_tf_activation("gelu")
        get_tf_activation("relu")
        get_tf_activation("tanh")
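And the TensorFlow counterpart, mirroring the hunk above (again a sketch; the `transformers.activations_tf` import path is assumed from the test file's name):

```python
import tensorflow as tf
from transformers.activations_tf import get_tf_activation

x = tf.constant([-2.0, -1.0, 0.0, 1.0, 2.0])
tf.debugging.assert_near(get_tf_activation("silu")(x), get_tf_activation("swish")(x))
```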