Commit 6d52239a authored by xinliupitt

initializer func in model

parent 0b395f65
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Attention Layer Initializer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package='Text')
def attention_initializer(hidden_size):
  """Weight initializer for the attention layer in the Seq2Seq Transformer.

  Args:
    hidden_size: hidden size of the input tensor.

  Returns:
    A RandomUniform initializer whose bounds are derived from the hidden size.
  """
  limit = math.sqrt(6.0 / (hidden_size + hidden_size))
  return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
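
For context only (not part of the commit): with fan_in == fan_out == hidden_size, the limit above is exactly the Glorot/Xavier uniform bound sqrt(6 / (fan_in + fan_out)). A minimal sketch, assuming TF 2.x in eager mode; hidden_size is a hypothetical value chosen for illustration:

import math
import tensorflow as tf

hidden_size = 512  # hypothetical value, for illustration only

# Same bound the initializer computes: sqrt(6 / (fan_in + fan_out)) with
# fan_in == fan_out == hidden_size, i.e. the Glorot/Xavier uniform bound.
limit = math.sqrt(6.0 / (hidden_size + hidden_size))

init = tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
weights = init(shape=(hidden_size, hidden_size))

# Every sampled value lies inside [-limit, limit].
assert float(tf.reduce_max(tf.abs(weights))) <= limit
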
@@ -21,9 +21,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.activations import attention_initializer
from official.nlp.modeling import layers
from official.nlp.modeling.layers import position_embedding
from official.nlp.modeling.layers import transformer
@@ -408,7 +409,7 @@ class TransformerEncoder(tf.keras.layers.Layer):
          norm_first=self._norm_first,
          norm_epsilon=self._norm_epsilon,
          intermediate_dropout=self._intermediate_dropout,
          attention_initializer=attention_initializer.attention_initializer(
          attention_initializer=attention_initializer(
              input_shape[2]),
          name=("layer_%d" % i)))
    self.output_normalization = tf.keras.layers.LayerNormalization(
@@ -522,7 +523,7 @@ class TransformerDecoder(tf.keras.layers.Layer):
          norm_first=self._norm_first,
          norm_epsilon=self._norm_epsilon,
          intermediate_dropout=self._intermediate_dropout,
          attention_initializer=attention_initializer.attention_initializer(
          attention_initializer=attention_initializer(
              input_shape[2]),
          name=("layer_%d" % i)))
    self.output_normalization = tf.keras.layers.LayerNormalization(
@@ -613,3 +614,7 @@ def embedding_linear(embedding_matrix, x):
  logits = tf.matmul(x, embedding_matrix, transpose_b=True)
  return tf.reshape(logits, [batch_size, length, vocab_size])


def attention_initializer(hidden_size):
  limit = math.sqrt(6.0 / (hidden_size + hidden_size))
  return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
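
For context only (not part of the commit): the returned object is an ordinary Keras initializer, so it can be passed to any layer that accepts a kernel_initializer. A minimal sketch, assuming TF 2.x; the Dense layer and hidden_size below are hypothetical stand-ins for the attention projections configured via the attention_initializer argument above:

import math
import tensorflow as tf


def attention_initializer(hidden_size):
  limit = math.sqrt(6.0 / (hidden_size + hidden_size))
  return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)


hidden_size = 256  # hypothetical value, for illustration only
# A plain Dense layer stands in for a query/key/value projection that would
# receive this initializer inside the Transformer blocks.
projection = tf.keras.layers.Dense(
    hidden_size, kernel_initializer=attention_initializer(hidden_size))
outputs = projection(tf.zeros([2, 8, hidden_size]))  # builds and applies the layer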