Commit 0490e860 authored by xinliupitt

attention initializer

parent c60aa809
@@ -12,21 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Gaussian error linear unit."""
+"""Attention Layer Initializer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math

 import tensorflow as tf


 @tf.keras.utils.register_keras_serializable(package='Text')
-def gelu(x):
-  """Gaussian Error Linear Unit.
-
-  This is a smoother version of the RELU.
-  Original paper: https://arxiv.org/abs/1606.08415
-
-  Args:
-    x: float Tensor to perform activation.
-
-  Returns:
-    `x` with the GELU activation applied.
-  """
-  return tf.keras.activations.gelu(x, approximate=True)
+def attention_initializer(hidden_size):
+  """Weight Initializer of Attention Layer in Seq2Seq Transformer.
+
+  Args:
+    hidden_size: hidden size of input tensor
+
+  Returns:
+    Initialized weights based on hidden size
+  """
+  limit = math.sqrt(6.0 / (hidden_size + hidden_size))
+  return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
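
For context, a minimal usage sketch (not part of this commit): it shows how the new attention_initializer could be passed to a projection layer via kernel_initializer. The hidden size of 512 and the layer name "query_projection" are hypothetical; the function body mirrors the one added above. Note that sqrt(6.0 / (hidden_size + hidden_size)) is the Glorot-uniform limit with fan-in and fan-out both equal to hidden_size.

    # Usage sketch (assumes TF 2.x); mirrors the function added in this commit.
    import math

    import tensorflow as tf


    def attention_initializer(hidden_size):
      # Glorot-uniform limit with fan_in == fan_out == hidden_size.
      limit = math.sqrt(6.0 / (hidden_size + hidden_size))
      return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)


    hidden_size = 512  # hypothetical model dimension
    # Hypothetical attention projection layer initialized with the new scheme.
    query_projection = tf.keras.layers.Dense(
        hidden_size,
        use_bias=False,
        kernel_initializer=attention_initializer(hidden_size),
        name="query_projection")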