Commit 6d52239a authored by xinliupitt

initializer func in model

parent 0b395f65
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Attention Layer Initializer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package='Text')
def attention_initializer(hidden_size):
  """Weight initializer for the attention layer in the Seq2Seq Transformer.

  Args:
    hidden_size: hidden size of the input tensor.

  Returns:
    A RandomUniform initializer whose bounds are derived from the hidden size.
  """
  limit = math.sqrt(6.0 / (hidden_size + hidden_size))
  return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
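
For context only (not part of the commit): with fan_in == fan_out == hidden_size, the limit above is exactly the Glorot/Xavier uniform bound sqrt(6 / (fan_in + fan_out)). A minimal sketch, assuming TF 2.x in eager mode; hidden_size is a hypothetical value chosen for illustration:

import math
import tensorflow as tf

hidden_size = 512  # hypothetical value, for illustration only

# Same bound the initializer computes: sqrt(6 / (fan_in + fan_out)) with
# fan_in == fan_out == hidden_size, i.e. the Glorot/Xavier uniform bound.
limit = math.sqrt(6.0 / (hidden_size + hidden_size))

init = tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
weights = init(shape=(hidden_size, hidden_size))

# Every sampled value lies inside [-limit, limit].
assert float(tf.reduce_max(tf.abs(weights))) <= limit
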
@@ -21,9 +21,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.activations import attention_initializer
from official.nlp.modeling import layers
from official.nlp.modeling.layers import position_embedding
from official.nlp.modeling.layers import transformer
@@ -408,7 +409,7 @@ class TransformerEncoder(tf.keras.layers.Layer):
          norm_first=self._norm_first,
          norm_epsilon=self._norm_epsilon,
          intermediate_dropout=self._intermediate_dropout,
          attention_initializer=attention_initializer.attention_initializer(
          attention_initializer=attention_initializer(
              input_shape[2]),
          name=("layer_%d" % i)))
    self.output_normalization = tf.keras.layers.LayerNormalization(
@@ -522,7 +523,7 @@ class TransformerDecoder(tf.keras.layers.Layer):
          norm_first=self._norm_first,
          norm_epsilon=self._norm_epsilon,
          intermediate_dropout=self._intermediate_dropout,
          attention_initializer=attention_initializer.attention_initializer(
          attention_initializer=attention_initializer(
              input_shape[2]),
          name=("layer_%d" % i)))
    self.output_normalization = tf.keras.layers.LayerNormalization(
@@ -613,3 +614,7 @@ def embedding_linear(embedding_matrix, x):
  logits = tf.matmul(x, embedding_matrix, transpose_b=True)
  return tf.reshape(logits, [batch_size, length, vocab_size])


def attention_initializer(hidden_size):
  limit = math.sqrt(6.0 / (hidden_size + hidden_size))
  return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
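
For context only (not part of the commit): the returned object is an ordinary Keras initializer, so it can be passed to any layer that accepts a kernel_initializer. A minimal sketch, assuming TF 2.x; the Dense layer and hidden_size below are hypothetical stand-ins for the attention projections configured via the attention_initializer argument above:

import math
import tensorflow as tf


def attention_initializer(hidden_size):
  limit = math.sqrt(6.0 / (hidden_size + hidden_size))
  return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)


hidden_size = 256  # hypothetical value, for illustration only
# A plain Dense layer stands in for a query/key/value projection that would
# receive this initializer inside the Transformer blocks.
projection = tf.keras.layers.Dense(
    hidden_size, kernel_initializer=attention_initializer(hidden_size))
outputs = projection(tf.zeros([2, 8, hidden_size]))  # builds and applies the layer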