Commit 37e7a323 authored by xinliupitt

docstring

parent 18402b08
@@ -107,7 +107,6 @@ class Seq2SeqTransformer(tf.keras.Model):
         First item, inputs: int tensor with shape [batch_size, input_length].
         Second item (optional), targets: None or int tensor with shape
           [batch_size, target_length].
-      training: boolean, whether in training mode or not.
 
     Returns:
       If targets is defined, then return logits for each word in the target
@@ -362,13 +361,26 @@ class Seq2SeqTransformer(tf.keras.Model):
     return symbols_to_logits_fn
 
 class TransformerEncoder(tf.keras.layers.Layer):
-  """Transformer decoder stack.
-  Like the encoder stack, the decoder stack is made up of N identical layers.
-  Each layer is composed of the sublayers:
+  """Transformer encoder.
+  Transformer encoder is made up of N identical layers. Each layer is composed
+  of the sublayers:
     1. Self-attention layer
-    2. Multi-headed attention layer combining encoder outputs with results from
-       the previous self-attention layer.
-    3. Feedforward network (2 fully-connected layers)
+    2. Feedforward network (which is 2 fully-connected layers)
+
+  Arguments:
+    num_layers: Number of layers.
+    num_attention_heads: Number of attention heads.
+    intermediate_size: Size of the intermediate (Feedforward) layer.
+    activation: Activation for the intermediate layer.
+    dropout_rate: Dropout probability.
+    attention_dropout_rate: Dropout probability for attention layers.
+    use_bias: Whether to enable use_bias in attention layer. If set False,
+      use_bias in attention layer is disabled.
+    norm_first: Whether to normalize inputs to attention and intermediate dense
+      layers. If set False, output of attention and intermediate dense layers is
+      normalized.
+    norm_epsilon: Epsilon value to initialize normalization layers.
+    intermediate_dropout: Dropout probability for intermediate_dropout_layer.
   """
 
   def __init__(self,
@@ -443,28 +455,14 @@ class TransformerEncoder(tf.keras.layers.Layer):
   def call(self,
            encoder_inputs,
            attention_mask=None):
-    """Return the output of the decoder layer stacks.
+    """Return the output of the encoder.
 
     Args:
-      decoder_inputs: A tensor with shape
-        [batch_size, target_length, hidden_size].
-      encoder_outputs: A tensor with shape
-        [batch_size, input_length, hidden_size]
-      decoder_self_attention_bias: A tensor with shape
-        [1, 1, target_len, target_length], the bias for decoder self-attention
-        layer.
-      attention_bias: A tensor with shape [batch_size, 1, 1, input_length],
-        the bias for encoder-decoder attention layer.
-      training: A bool, whether in training mode or not.
-      cache: (Used for fast decoding) A nested dictionary storing previous
-        decoder self-attention values. The items are:
-        {layer_n: {"k": A tensor with shape [batch_size, i, key_channels],
-                   "v": A tensor with shape [batch_size, i, value_channels]},
-                   ...}
-      decode_loop_step: An integer, the step number of the decoding loop. Used
-        only for autoregressive inference on TPU.
+      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
+      attention_mask: mask for the encoder self-attention layer. [batch_size,
+        input_length, input_length]
 
     Returns:
-      Output of decoder layer stack.
-      float32 tensor with shape [batch_size, target_length, hidden_size]
+      Output of encoder.
+      float32 tensor with shape [batch_size, input_length, hidden_size]
     """
     for layer_idx in range(self._num_layers):
       encoder_inputs = self.encoder_layers[layer_idx](
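For reference, a minimal usage sketch of `TransformerEncoder` built only from the arguments and call signature documented above. It is not part of the commit; the import path (`official.nlp.modeling.models.seq2seq_transformer`) and the hyperparameter values are assumptions.

```python
# Hedged usage sketch for TransformerEncoder. Only the argument and parameter
# names come from the docstrings in this commit; the import path and the
# concrete hyperparameter values are assumptions.
import tensorflow as tf
from official.nlp.modeling.models import seq2seq_transformer

encoder = seq2seq_transformer.TransformerEncoder(
    num_layers=6,
    num_attention_heads=8,
    intermediate_size=2048,
    activation="relu",
    dropout_rate=0.1,
    attention_dropout_rate=0.1,
    use_bias=False,
    norm_first=True,
    norm_epsilon=1e-6,
    intermediate_dropout=0.1)

batch_size, input_length, hidden_size = 2, 16, 512
encoder_inputs = tf.random.uniform([batch_size, input_length, hidden_size])
# 1.0 means "may attend"; shape [batch_size, input_length, input_length]
# as described in the call docstring.
attention_mask = tf.ones([batch_size, input_length, input_length])

outputs = encoder(encoder_inputs, attention_mask=attention_mask)
# outputs: float32 tensor with shape [batch_size, input_length, hidden_size].
```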
@@ -476,13 +474,28 @@ class TransformerEncoder(tf.keras.layers.Layer):
     return output_tensor
 
 class TransformerDecoder(tf.keras.layers.Layer):
-  """Transformer decoder stack.
-  Like the encoder stack, the decoder stack is made up of N identical layers.
+  """Transformer decoder.
+  Like the encoder, the decoder is made up of N identical layers.
   Each layer is composed of the sublayers:
     1. Self-attention layer
     2. Multi-headed attention layer combining encoder outputs with results from
        the previous self-attention layer.
     3. Feedforward network (2 fully-connected layers)
+
+  Arguments:
+    num_layers: Number of layers.
+    num_attention_heads: Number of attention heads.
+    intermediate_size: Size of the intermediate (Feedforward) layer.
+    activation: Activation for the intermediate layer.
+    dropout_rate: Dropout probability.
+    attention_dropout_rate: Dropout probability for attention layers.
+    use_bias: Whether to enable use_bias in attention layer. If set False,
+      use_bias in attention layer is disabled.
+    norm_first: Whether to normalize inputs to attention and intermediate dense
+      layers. If set False, output of attention and intermediate dense layers is
+      normalized.
+    norm_epsilon: Epsilon value to initialize normalization layers.
+    intermediate_dropout: Dropout probability for intermediate_dropout_layer.
   """
 
   def __init__(self,
@@ -563,16 +576,15 @@ class TransformerDecoder(tf.keras.layers.Layer):
            decode_loop_step=None):
     """Return the output of the decoder layer stacks.
 
     Args:
-      decoder_inputs: A tensor with shape
+      target: A tensor with shape
         [batch_size, target_length, hidden_size].
-      encoder_outputs: A tensor with shape
+      memory: A tensor with shape
         [batch_size, input_length, hidden_size]
-      decoder_self_attention_bias: A tensor with shape
-        [1, 1, target_len, target_length], the bias for decoder self-attention
-        layer.
-      attention_bias: A tensor with shape [batch_size, 1, 1, input_length],
-        the bias for encoder-decoder attention layer.
-      training: A bool, whether in training mode or not.
+      memory_mask: A tensor with shape
+        [batch_size, target_len, target_length], the mask for decoder
+        self-attention layer.
+      target_mask: A tensor with shape [batch_size, target_length, input_length]
+        which is the mask for encoder-decoder attention layer.
       cache: (Used for fast decoding) A nested dictionary storing previous
         decoder self-attention values. The items are:
         {layer_n: {"k": A tensor with shape [batch_size, i, key_channels],
@@ -581,7 +593,7 @@ class TransformerDecoder(tf.keras.layers.Layer):
       decode_loop_step: An integer, the step number of the decoding loop. Used
         only for autoregressive inference on TPU.
 
     Returns:
-      Output of decoder layer stack.
+      Output of decoder.
       float32 tensor with shape [batch_size, target_length, hidden_size]
     """
@@ -616,5 +628,6 @@ def embedding_linear(embedding_matrix, x):
   return tf.reshape(logits, [batch_size, length, vocab_size])
 
 def attention_initializer(hidden_size):
+  """Initializer for attention layers in Seq2SeqTransformer."""
   limit = math.sqrt(6.0 / (hidden_size + hidden_size))
   return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
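A short check, not part of the commit, of what `attention_initializer` computes: with fan_in = fan_out = hidden_size, the limit sqrt(6 / (hidden_size + hidden_size)) is exactly the Glorot/Xavier uniform bound, so the helper behaves like `tf.keras.initializers.GlorotUniform` for square [hidden_size, hidden_size] kernels.

```python
# Sketch verifying the bound used by attention_initializer.
import math
import tensorflow as tf

hidden_size = 512
limit = math.sqrt(6.0 / (hidden_size + hidden_size))

# attention_initializer(hidden_size) is equivalent to this RandomUniform; for
# a square [hidden_size, hidden_size] kernel it uses the same bound as
# Glorot/Xavier uniform, sqrt(6 / (fan_in + fan_out)).
custom = tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
glorot = tf.keras.initializers.GlorotUniform()

sample = custom(shape=(hidden_size, hidden_size))
print(float(tf.reduce_max(tf.abs(sample))) <= limit)  # True
```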