ModelZoo / ResNet50_tensorflow / Commits / 9bb04e60

Commit 9bb04e60, authored Aug 11, 2020 by xinliupitt
Parent: befa593d

    seq2seq test case

Showing 3 changed files with 582 additions and 6 deletions (+582 / -6):
  official/nlp/modeling/models/seq2seq_transformer.py       +0    -1
  official/nlp/modeling/models/seq2seq_transformer_test.py   +75   -0
  official/nlp/transformer/transformer.py                     +507  -5
official/nlp/modeling/models/seq2seq_transformer.py
@@ -29,7 +29,6 @@ from official.nlp.modeling import layers
 from official.nlp.modeling.layers import position_embedding
 from official.nlp.modeling.layers import transformer
 from official.nlp.modeling.ops import beam_search
-from official.nlp.transformer import metrics
 from official.nlp.transformer import model_utils
 from official.nlp.transformer.utils.tokenizer import EOS_ID
official/nlp/modeling/models/seq2seq_transformer_test.py
(new file, mode 100644)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test Transformer model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from official.nlp.modeling.models import seq2seq_transformer
from official.nlp.transformer import model_params


class TransformerV2Test(tf.test.TestCase):

  def setUp(self):
    self.params = params = model_params.TINY_PARAMS
    params["batch_size"] = params["default_batch_size"] = 16
    params["use_synthetic_data"] = True
    params["hidden_size"] = 12
    params["num_hidden_layers"] = 2
    params["filter_size"] = 14
    params["num_heads"] = 2
    params["vocab_size"] = 41
    params["extra_decode_length"] = 2
    params["beam_size"] = 3
    params["dtype"] = tf.float32

  def test_create_model_train(self):
    inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
    targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
    internal_model = seq2seq_transformer.Seq2SeqTransformer(self.params)
    logits = internal_model([inputs, targets], training=True)
    model = tf.keras.Model([inputs, targets], logits)
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 2)
    self.assertEqual(len(outputs), 1)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(inputs[1].shape.as_list(), [None, None])
    self.assertEqual(inputs[1].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None, 41])
    self.assertEqual(outputs[0].dtype, tf.float32)

  def test_create_model_not_train(self):
    inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
    internal_model = seq2seq_transformer.Seq2SeqTransformer(self.params)
    ret = internal_model([inputs], training=False)
    outputs, scores = ret["outputs"], ret["scores"]
    model = tf.keras.Model(inputs, [outputs, scores])
    inputs, outputs = model.inputs, model.outputs
    self.assertEqual(len(inputs), 1)
    self.assertEqual(len(outputs), 2)
    self.assertEqual(inputs[0].shape.as_list(), [None, None])
    self.assertEqual(inputs[0].dtype, tf.int64)
    self.assertEqual(outputs[0].shape.as_list(), [None, None])
    self.assertEqual(outputs[0].dtype, tf.int32)
    self.assertEqual(outputs[1].shape.as_list(), [None])
    self.assertEqual(outputs[1].dtype, tf.float32)


if __name__ == "__main__":
  tf.test.main()
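For reference, the construction pattern that the new test exercises can be reproduced outside tf.test roughly as follows. This is a minimal sketch, not part of the commit: it assumes the same TINY_PARAMS overrides as setUp and that Seq2SeqTransformer accepts concrete int tensors the same way it accepts symbolic Keras inputs; shapes and parameter names are taken from the test, not re-verified against the library.

# Minimal sketch (see assumptions above): build the model the way the new
# test does, then run one training-mode forward pass on synthetic token IDs.
import tensorflow as tf

from official.nlp.modeling.models import seq2seq_transformer
from official.nlp.transformer import model_params

params = model_params.TINY_PARAMS
params["batch_size"] = params["default_batch_size"] = 16
params["hidden_size"] = 12
params["num_hidden_layers"] = 2
params["filter_size"] = 14
params["num_heads"] = 2
params["vocab_size"] = 41
params["extra_decode_length"] = 2
params["beam_size"] = 3
params["dtype"] = tf.float32

model = seq2seq_transformer.Seq2SeqTransformer(params)

# Synthetic batch: token IDs in [1, vocab_size); shapes are [batch, length].
fake_inputs = tf.random.uniform((16, 10), minval=1, maxval=41, dtype=tf.int64)
fake_targets = tf.random.uniform((16, 8), minval=1, maxval=41, dtype=tf.int64)

# Training-mode call, as in test_create_model_train: returns per-token logits
# with shape [batch, target_length, vocab_size], here [16, 8, 41].
logits = model([fake_inputs, fake_targets], training=True)
print(logits.shape)

The inference-mode call in test_create_model_not_train is analogous: a single-element input list with training=False, returning a dict with "outputs" and "scores".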
official/nlp/transformer/transformer.py
@@ -22,8 +22,14 @@ from __future__ import division
 from __future__ import print_function

 import tensorflow as tf

-from official.nlp.modeling.models import seq2seq_transformer
+from official.nlp.modeling.layers import position_embedding
+from official.nlp.modeling.ops import beam_search
+from official.nlp.transformer import attention_layer
+from official.nlp.transformer import embedding_layer
+from official.nlp.transformer import ffn_layer
 from official.nlp.transformer import metrics
+from official.nlp.transformer import model_utils
+from official.nlp.transformer.utils.tokenizer import EOS_ID

 # Disable the not-callable lint error, since it claims many objects are not
@@ -37,8 +43,7 @@ def create_model(params, is_train):
   if is_train:
     inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
     targets = tf.keras.layers.Input((None,), dtype="int64", name="targets")
-    internal_model = seq2seq_transformer.Seq2SeqTransformer(
-        params, name="transformer_v2")
+    internal_model = Transformer(params, name="transformer_v2")
     logits = internal_model([inputs, targets], training=is_train)
     vocab_size = params["vocab_size"]
     label_smoothing = params["label_smoothing"]
@@ -54,8 +59,505 @@ def create_model(params, is_train):
   else:
     inputs = tf.keras.layers.Input((None,), dtype="int64", name="inputs")
-    internal_model = seq2seq_transformer.Seq2SeqTransformer(
-        params, name="transformer_v2")
+    internal_model = Transformer(params, name="transformer_v2")
     ret = internal_model([inputs], training=is_train)
     outputs, scores = ret["outputs"], ret["scores"]
     return tf.keras.Model(inputs, [outputs, scores])

The remainder of this hunk consists entirely of added lines, introducing the following class definitions:
class Transformer(tf.keras.Model):
  """Transformer model with Keras.

  Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf

  The Transformer model consists of an encoder and decoder. The input is an int
  sequence (or a batch of sequences). The encoder produces a continuous
  representation, and the decoder uses the encoder output to generate
  probabilities for the output sequence.
  """

  def __init__(self, params, name=None):
    """Initialize layers to build Transformer model.

    Args:
      params: hyperparameter object defining layer sizes, dropout values, etc.
      name: name of the model.
    """
    super(Transformer, self).__init__(name=name)
    self.params = params
    self.embedding_softmax_layer = embedding_layer.EmbeddingSharedWeights(
        params["vocab_size"], params["hidden_size"])
    self.encoder_stack = EncoderStack(params)
    self.decoder_stack = DecoderStack(params)
    self.position_embedding = position_embedding.RelativePositionEmbedding(
        hidden_size=self.params["hidden_size"])

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self, inputs, training):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: input tensor list of size 1 or 2.
        First item, inputs: int tensor with shape [batch_size, input_length].
        Second item (optional), targets: None or int tensor with shape
          [batch_size, target_length].
      training: boolean, whether in training mode or not.

    Returns:
      If targets is defined, then return logits for each word in the target
      sequence. float tensor with shape [batch_size, target_length, vocab_size]
      If target is none, then generate output sequence one token at a time.
        returns a dictionary {
          outputs: [batch_size, decoded length]
          scores: [batch_size, float]}
      Even when float16 is used, the output tensor(s) are always float32.

    Raises:
      NotImplementedError: If try to use padded decode method on CPU/GPUs.
    """
    if len(inputs) == 2:
      inputs, targets = inputs[0], inputs[1]
    else:
      # Decoding path.
      inputs, targets = inputs[0], None
      if self.params["padded_decode"]:
        if not self.params["num_replicas"]:
          raise NotImplementedError(
              "Padded decoding on CPU/GPUs is not supported.")
        decode_batch_size = int(self.params["decode_batch_size"] /
                                self.params["num_replicas"])
        inputs.set_shape([
            decode_batch_size, self.params["decode_max_length"]
        ])

    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    with tf.name_scope("Transformer"):
      # Calculate attention bias for encoder self-attention and decoder
      # multi-headed attention layers.
      attention_bias = model_utils.get_padding_bias(inputs)

      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      encoder_outputs = self.encode(inputs, attention_bias, training)
      # Generate output sequence if targets is None, or return logits if target
      # sequence is known.
      if targets is None:
        return self.predict(encoder_outputs, attention_bias, training)
      else:
        logits = self.decode(targets, encoder_outputs, attention_bias, training)
        return logits

  def encode(self, inputs, attention_bias, training):
    """Generate continuous representation for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length].
      training: boolean, whether in training mode or not.

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    with tf.name_scope("encode"):
      # Prepare inputs to the layer stack by adding positional encodings and
      # applying dropout.
      embedded_inputs = self.embedding_softmax_layer(inputs)
      embedded_inputs = tf.cast(embedded_inputs, self.params["dtype"])
      inputs_padding = model_utils.get_padding(inputs)
      attention_bias = tf.cast(attention_bias, self.params["dtype"])

      with tf.name_scope("add_pos_encoding"):
        pos_encoding = self.position_embedding(inputs=embedded_inputs)
        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
        encoder_inputs = embedded_inputs + pos_encoding

      if training:
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, rate=self.params["layer_postprocess_dropout"])

      return self.encoder_stack(
          encoder_inputs, attention_bias, inputs_padding, training=training)

  def decode(self, targets, encoder_outputs, attention_bias, training):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence. int tensor with shape
        [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence. float
        tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
      training: boolean, whether in training mode or not.

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(targets)
      decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
      attention_bias = tf.cast(attention_bias, self.params["dtype"])
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        decoder_inputs = tf.pad(decoder_inputs,
                                [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        pos_encoding = self.position_embedding(decoder_inputs)
        pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
        decoder_inputs += pos_encoding
      if training:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, rate=self.params["layer_postprocess_dropout"])

      # Run values
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length, dtype=self.params["dtype"])
      outputs = self.decoder_stack(
          decoder_inputs,
          encoder_outputs,
          decoder_self_attention_bias,
          attention_bias,
          training=training)
      logits = self.embedding_softmax_layer(outputs, mode="linear")
      logits = tf.cast(logits, tf.float32)
      return logits

  def _get_symbols_to_logits_fn(self, max_decode_length, training):
    """Returns a decoding function that calculates logits of the next tokens."""
    timing_signal = self.position_embedding(
        inputs=None, length=max_decode_length + 1)
    timing_signal = tf.cast(timing_signal, self.params["dtype"])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length, dtype=self.params["dtype"])

    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences. int tensor with shape [batch_size *
          beam_size, i + 1].
        i: Loop index.
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)

      if self.params["padded_decode"]:
        timing_signal_shape = timing_signal.shape.as_list()
        decoder_input += tf.slice(timing_signal, [i, 0],
                                  [1, timing_signal_shape[1]])

        bias_shape = decoder_self_attention_bias.shape.as_list()
        self_attention_bias = tf.slice(
            decoder_self_attention_bias, [0, 0, i, 0],
            [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
      else:
        decoder_input += timing_signal[i:i + 1]

        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

      decoder_outputs = self.decoder_stack(
          decoder_input,
          cache.get("encoder_outputs"),
          self_attention_bias,
          cache.get("encoder_decoder_attention_bias"),
          training=training,
          cache=cache,
          decode_loop_step=i if self.params["padded_decode"] else None)
      logits = self.embedding_softmax_layer(decoder_outputs, mode="linear")
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache

    return symbols_to_logits_fn

  def predict(self, encoder_outputs, encoder_decoder_attention_bias, training):
    """Return predicted sequence."""
    encoder_outputs = tf.cast(encoder_outputs, self.params["dtype"])
    if self.params["padded_decode"]:
      batch_size = encoder_outputs.shape.as_list()[0]
      input_length = encoder_outputs.shape.as_list()[1]
    else:
      batch_size = tf.shape(encoder_outputs)[0]
      input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params["extra_decode_length"]
    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
                                             self.params["dtype"])

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(
        max_decode_length, training)

    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Create cache storing decoder attention values for each layer.
    # pylint: disable=g-complex-comprehension
    init_decode_length = (
        max_decode_length if self.params["padded_decode"] else 0)
    num_heads = self.params["num_heads"]
    dim_per_head = self.params["hidden_size"] // num_heads
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros(
                [batch_size, init_decode_length, num_heads, dim_per_head],
                dtype=self.params["dtype"]),
            "v": tf.zeros(
                [batch_size, init_decode_length, num_heads, dim_per_head],
                dtype=self.params["dtype"])
        } for layer in range(self.params["num_hidden_layers"])
    }
    # pylint: enable=g-complex-comprehension

    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    # Use beam search to find the top beam_size sequences and scores.
    decoded_ids, scores = beam_search.sequence_beam_search(
        symbols_to_logits_fn=symbols_to_logits_fn,
        initial_ids=initial_ids,
        initial_cache=cache,
        vocab_size=self.params["vocab_size"],
        beam_size=self.params["beam_size"],
        alpha=self.params["alpha"],
        max_decode_length=max_decode_length,
        eos_id=EOS_ID,
        padded_decode=self.params["padded_decode"],
        dtype=self.params["dtype"])

    # Get the top sequence for each batch element
    top_decoded_ids = decoded_ids[:, 0, 1:]
    top_scores = scores[:, 0]

    return {"outputs": top_decoded_ids, "scores": top_scores}
class PrePostProcessingWrapper(tf.keras.layers.Layer):
  """Wrapper class that applies layer pre-processing and post-processing."""

  def __init__(self, layer, params):
    super(PrePostProcessingWrapper, self).__init__()
    self.layer = layer
    self.params = params
    self.postprocess_dropout = params["layer_postprocess_dropout"]

  def build(self, input_shape):
    # Create normalization layer
    self.layer_norm = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(PrePostProcessingWrapper, self).build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self, x, *args, **kwargs):
    """Calls wrapped layer with same parameters."""
    # Preprocessing: apply layer normalization
    training = kwargs["training"]

    y = self.layer_norm(x)

    # Get layer output
    y = self.layer(y, *args, **kwargs)

    # Postprocessing: apply dropout and residual connection
    if training:
      y = tf.nn.dropout(y, rate=self.postprocess_dropout)
    return x + y
class EncoderStack(tf.keras.layers.Layer):
  """Transformer encoder stack.

  The encoder stack is made up of N identical layers. Each layer is composed
  of the sublayers:
    1. Self-attention layer
    2. Feedforward network (which is 2 fully-connected layers)
  """

  def __init__(self, params):
    super(EncoderStack, self).__init__()
    self.params = params
    self.layers = []

  def build(self, input_shape):
    """Builds the encoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      feed_forward_network = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params),
          PrePostProcessingWrapper(feed_forward_network, params)
      ])

    # Create final layer normalization layer.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(EncoderStack, self).build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self, encoder_inputs, attention_bias, inputs_padding, training):
    """Return the output of the encoder layer stacks.

    Args:
      encoder_inputs: tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: bias for the encoder self-attention layer. [batch_size,
        1, 1, input_length]
      inputs_padding: tensor with shape [batch_size, input_length], inputs with
        zero paddings.
      training: boolean, whether in training mode or not.

    Returns:
      Output of encoder layer stack.
      float32 tensor with shape [batch_size, input_length, hidden_size]
    """
    for n, layer in enumerate(self.layers):
      # Run inputs through the sublayers.
      self_attention_layer = layer[0]
      feed_forward_network = layer[1]

      with tf.name_scope("layer_%d" % n):
        with tf.name_scope("self_attention"):
          encoder_inputs = self_attention_layer(
              encoder_inputs, attention_bias, training=training)
        with tf.name_scope("ffn"):
          encoder_inputs = feed_forward_network(
              encoder_inputs, training=training)

    return self.output_normalization(encoder_inputs)
class DecoderStack(tf.keras.layers.Layer):
  """Transformer decoder stack.

  Like the encoder stack, the decoder stack is made up of N identical layers.
  Each layer is composed of the sublayers:
    1. Self-attention layer
    2. Multi-headed attention layer combining encoder outputs with results from
       the previous self-attention layer.
    3. Feedforward network (2 fully-connected layers)
  """

  def __init__(self, params):
    super(DecoderStack, self).__init__()
    self.params = params
    self.layers = []

  def build(self, input_shape):
    """Builds the decoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      enc_dec_attention_layer = attention_layer.Attention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      feed_forward_network = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params),
          PrePostProcessingWrapper(enc_dec_attention_layer, params),
          PrePostProcessingWrapper(feed_forward_network, params)
      ])
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(DecoderStack, self).build(input_shape)

  def get_config(self):
    return {
        "params": self.params,
    }

  def call(self,
           decoder_inputs,
           encoder_outputs,
           decoder_self_attention_bias,
           attention_bias,
           training,
           cache=None,
           decode_loop_step=None):
    """Return the output of the decoder layer stacks.

    Args:
      decoder_inputs: A tensor with shape
        [batch_size, target_length, hidden_size].
      encoder_outputs: A tensor with shape
        [batch_size, input_length, hidden_size]
      decoder_self_attention_bias: A tensor with shape
        [1, 1, target_len, target_length], the bias for decoder self-attention
        layer.
      attention_bias: A tensor with shape [batch_size, 1, 1, input_length],
        the bias for encoder-decoder attention layer.
      training: A bool, whether in training mode or not.
      cache: (Used for fast decoding) A nested dictionary storing previous
        decoder self-attention values. The items are:
          {layer_n: {"k": A tensor with shape [batch_size, i, key_channels],
                     "v": A tensor with shape [batch_size, i, value_channels]},
                       ...}
      decode_loop_step: An integer, the step number of the decoding loop. Used
        only for autoregressive inference on TPU.

    Returns:
      Output of decoder layer stack.
      float32 tensor with shape [batch_size, target_length, hidden_size]
    """
    for n, layer in enumerate(self.layers):
      self_attention_layer = layer[0]
      enc_dec_attention_layer = layer[1]
      feed_forward_network = layer[2]

      # Run inputs through the sublayers.
      layer_name = "layer_%d" % n
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.name_scope(layer_name):
        with tf.name_scope("self_attention"):
          decoder_inputs = self_attention_layer(
              decoder_inputs,
              decoder_self_attention_bias,
              training=training,
              cache=layer_cache,
              decode_loop_step=decode_loop_step)
        with tf.name_scope("encdec_attention"):
          decoder_inputs = enc_dec_attention_layer(
              decoder_inputs, encoder_outputs, attention_bias,
              training=training)
        with tf.name_scope("ffn"):
          decoder_inputs = feed_forward_network(
              decoder_inputs, training=training)

    return self.output_normalization(decoder_inputs)
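The net effect of the create_model changes above is that both branches now build the locally defined Transformer instead of seq2seq_transformer.Seq2SeqTransformer, and that class (together with PrePostProcessingWrapper, EncoderStack, and DecoderStack) is defined in this file. Below is a minimal usage sketch of the class as added here, mirroring the call() contract in its docstring; the TINY_PARAMS defaults and the eager call on concrete tensors are assumptions of the sketch, not something this commit exercises.

# Minimal sketch (see assumptions above): drive the Transformer class directly.
import tensorflow as tf

from official.nlp.transformer import model_params
from official.nlp.transformer import transformer

params = model_params.TINY_PARAMS       # assumed to supply remaining
params["dtype"] = tf.float32            # hyperparameters (alpha, dropout, ...)
params["padded_decode"] = False         # plain CPU/GPU decoding path

model = transformer.Transformer(params, name="transformer_v2")

sources = tf.random.uniform((4, 10), minval=1, maxval=params["vocab_size"],
                            dtype=tf.int64)
targets = tf.random.uniform((4, 9), minval=1, maxval=params["vocab_size"],
                            dtype=tf.int64)

# Two inputs -> per-token logits, shape [batch, target_length, vocab_size].
logits = model([sources, targets], training=True)

# One input -> beam-search decode, a dict with "outputs" and "scores".
ret = model([sources], training=False)
decoded_ids, scores = ret["outputs"], ret["scores"]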