chenpangpang / transformers

Commit 71557b16, authored Nov 01, 2018 by thomwolf

    working on model

Parent: 8627a675
Showing 1 changed file with 169 additions and 883 deletions.

modeling_pytorch.py  (+169, -883)
@@ -28,6 +28,11 @@ import tensorflow as tf
 import torch
 import torch.nn as nn
 
+def gelu(x):
+    raise NotImplementedError
+    # TF BERT says: cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+
 class BertConfig(object):
     """Configuration for `BertModel`."""
@@ -106,30 +111,54 @@ class BertConfig(object):
 
 class BERTLayerNorm(nn.Module):
-    def __init__(self):
-        tf.contrib.layers.layer_norm(
-            inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
+    def __init__(self, config, variance_epsilon=1e-12):
+        "Construct a layernorm module in the TF style (epsilon inside the square root)."
+        super(BERTLayerNorm, self).__init__()
+        self.gamma = nn.Parameter(torch.ones(config.hidden_size))
+        self.beta = nn.Parameter(torch.zeros(config.hidden_size))
+        self.variance_epsilon = variance_epsilon
+
+    def forward(self, x):
+        # TODO check it's identical to TF implementation in details
+        u = x.mean(-1, keepdim=True)
+        s = (x - u).pow(2).mean(-1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+        return self.gamma * x + self.beta
+        # tf.contrib.layers.layer_norm(
+        #     inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
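Editor's note: the forward pass above re-implements layer normalization by hand and carries a TODO to check it against the TF behaviour. One quick sanity check (an illustrative sketch, not part of the commit) is to compare it against torch.nn.LayerNorm with the same epsilon, which uses ones/zeros for its affine parameters at initialization:

# Illustrative sketch, not part of the commit: compare the hand-written layer norm
# (with gamma=1, beta=0) against torch.nn.LayerNorm using the same epsilon.
import torch

hidden_size = 8
x = torch.randn(2, 4, hidden_size)

u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
manual = (x - u) / torch.sqrt(s + 1e-12)

reference = torch.nn.LayerNorm(hidden_size, eps=1e-12)(x)
print(torch.allclose(manual, reference, atol=1e-5))  # expected: True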
 class BERTEmbeddings(nn.Module):
     def __init__(self, embedding_size, vocab_size,
                  token_type_vocab_size, max_position_embeddings,
                  config):
         self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size)
+        # Position embeddings are (normally) a contiguous range so we could use a slice
-        # Since the position embedding table is a learned variable, we create it
-        # using a (long) sequence length `max_position_embeddings`. The actual
-        # sequence length might be shorter than this, for faster training of
-        # tasks that do not have long sequences.
-        #
-        # So `full_position_embeddings` is effectively an embedding table
-        # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
-        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
-        # perform a slice.
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
-        self.token_type_embeddings = nn.Embedding(config.token_type_vocab_size, config.embedding_size)
-        self.LayerNorm = BERTLayerNorm()
-        # Not snake-cased to fit with TF model variable name
-        # token_type_embeddings vocabulary is very small. TF used one-hot embeddings to speedup.
-        self.dropout = nn.dropout(config.hidden_dropout_prob)
+        self.token_type_embeddings = nn.Embedding(config.token_type_vocab_size, config.embedding_size)
+        self.initialize_weights(self, config.initializer_range)
+        self.LayerNorm = BERTLayerNorm()
+        # Not snake-cased to stick with TF model variable name
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def initialize_weights(self, initializer_range):
+        torch.truncated_normal_initializer(stddev=initializer_range)
     def forward(self, input_ids, token_type_ids=None):
         batch_size = input_ids.size(0)
         seq_length = input_ids.size(1)
+        # TODO finich that
         position_ids = torch.range().view(batch_size, seq_length)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(batch_size, seq_length)
         words_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
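Editor's note: the TODO above and the bare torch.range() call indicate the position-id tensor is still unfinished in this commit. One way to build the [batch_size, seq_length] tensor it is after (an illustrative sketch only, not the commit's code) is torch.arange expanded over the batch dimension:

# Illustrative sketch, not part of the commit: building the position id tensor.
import torch

batch_size, seq_length = 2, 5
position_ids = torch.arange(seq_length, dtype=torch.long)          # [0, 1, ..., seq_length-1]
position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_length)
print(position_ids.shape)  # torch.Size([2, 5])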
@@ -141,28 +170,6 @@ class BERTEmbeddings(nn.Module):
         return embeddings
 
-class BERTIntermediate(nn.Module):
-    def __init__(self, config):
-        super(BERTOutput, self).__init__()
-        self.dense = nn.Linear()
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        return hidden_states
-
-class BERTOutput(nn.Module):
-    def __init__(self, config):
-        super(BERTOutput, self).__init__()
-        self.dense = nn.Linear()
-        self.LayerNorm = BERTLayerNorm(config)
-
-    def forward(self, hidden_states):
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        return hidden_states
 
 class BERTSelfAttention(nn.Module):
     def __init__(self, config):
         super(BERTSelfAttention, self).__init__()
@@ -170,22 +177,84 @@ class BERTSelfAttention(nn.Module):
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
                 "heads (%d)" % (config.hidden_size, config.num_attention_heads))
-        attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        all_head_size = num_attention_heads * attention_head_size
-        self.query = nn.Linear(config.hidden_size, all_head_size)
-        self.key = nn.Linear(config.hidden_size, all_head_size)
-        self.value = nn.Linear(config.hidden_size, all_head_size)
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 
-    def transpose_for_scores(self, x, k=False):
-        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
-        if k:
-            return x.permute(0, 2, 3, 1)
-        else:
-            return x.permute(0, 2, 1, 3)
+    def transpose_for_scores(self, input_tensor, num_attention_heads, is_key_tensor=False):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        if is_key_tensor:
+            return x.permute(0, 2, 3, 1)
+        else:
+            return x.permute(0, 2, 1, 3)
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, attention_mask):
+        # Scalar dimensions referenced here:
+        # B = batch size (number of sequences)
+        # F = `from_tensor` sequence length
+        # T = `to_tensor` sequence length
+        # N = `num_attention_heads`
+        # H = `size_per_head`
+        query_layer = self.query(hidden_states)
+        key_layer = self.key(hidden_states)
+        value_layer = self.value(hidden_states)
+
+        query_layer = self.transpose_for_scores(query_layer)
+        key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True)
+        value_layer = self.transpose_for_scores(value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw
+        # attention scores.
+        # `attention_scores` = [B, N, F, T]
+        attention_scores = torch.matmul(query_layer, key_layer)
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        # TODO clean up this (precompute)
+        # MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
+        # `attention_mask` = [B, 1, F, T]
+        attention_mask = tf.expand_dims(attention_mask, axis=[1])
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        adder = (1.0 - attention_mask) * -10000.0
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        attention_scores += adder
+
+        # Normalize the attention scores to probabilities.
+        # `attention_probs` = [B, N, F, T]
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_x_shape)
+        return context_layer
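Editor's note: the TODO above says the additive mask should be precomputed, and the tf.expand_dims call is still a TensorFlow placeholder at this point in the conversion. An illustrative sketch (not the commit's code) of building the same additive mask in PyTorch, assuming `input_mask` is a [batch_size, seq_length] tensor of 1s (attend) and 0s (padding):

# Illustrative sketch, not part of the commit: precomputing the additive attention mask.
import torch

input_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
extended_mask = input_mask[:, None, None, :].float()    # [B, 1, 1, T], broadcasts over heads and query positions
adder = (1.0 - extended_mask) * -10000.0                # 0.0 where we attend, -10000.0 where masked
# attention_scores = attention_scores + adder           # broadcast-add before the softmax
print(adder.shape)  # torch.Size([2, 1, 1, 4])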
+class BERTSelfOutput(nn.Module):
+    def __init__(self, config):
+        super(BERTSelfOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = BERTLayerNorm(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(input_tensor)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
@@ -193,11 +262,37 @@ class BERTAttention(nn.Module):
     def __init__(self, config):
         super(BERTAttention, self).__init__()
         self.self = BERTSelfAttention(config)
-        self.output = BERTOutput(config)
+        self.output = BERTSelfOutput(config)
 
-    def forward(self, hidden_states):
-        hidden_states = self.self(hidden_states)
-        hidden_states = self.output(hidden_states)
-        return hidden_states
+    def forward(self, input_tensor, attention_mask):
+        attention_output = self.self(input_tensor, attention_mask)
+        attention_output = self.output(attention_output, input_tensor)
+        return attention_output
+
+class BERTIntermediate(nn.Module):
+    def __init__(self, config):
+        super(BERTOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.intermediate_act_fn = gelu
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+class BERTOutput(nn.Module):
+    def __init__(self, config):
+        super(BERTOutput, self).__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = BERTLayerNorm(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(input_tensor)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
@@ -208,10 +303,10 @@ class BERTLayer(nn.Module):
         self.intermediate = BERTIntermediate(config)
         self.output = BERTOutput(config)
 
-    def forward(self, hidden_states):
-        hidden_states = self.attention(hidden_states)
-        hidden_states = self.intermediate(hidden_states)
-        hidden_states = self.output(hidden_states)
+    def forward(self, hidden_states, attention_mask):
+        attention_output = self.attention(hidden_states, attention_mask)
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
         return hidden_states
@@ -221,7 +316,25 @@ class BERTEncoder(nn.Module):
         layer = BERTLayer(n_ctx, cfg, scale=True)
         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, attention_mask):
+        """
+        Args:
+            hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size]
+        Return:
+            float Tensor of shape [batch_size, seq_length, hidden_size]
+        """
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, attention_mask)
+        return hidden_states
+
+class BERTPooler(nn.Module):
+    def __init__(self, config):
+        super(BERTPooler, self).__init__()
+        layer = BERTLayer(n_ctx, cfg, scale=True)
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+
+    def forward(self, hidden_states, attention_mask):
         """
         Args:
             hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size]
@@ -229,7 +342,7 @@ class BERTEncoder(nn.Module):
             float Tensor of shape [batch_size, seq_length, hidden_size]
         """
         for layer_module in self.layer:
-            hidden_states = layer_module(hidden_states)
+            hidden_states = layer_module(hidden_states, attention_mask)
         return hidden_states
@@ -278,836 +391,9 @@ class BertModel(nn.Module):
         self.embeddings = BERTEmbeddings(config)
         self.encoder = BERTEncoder(config)
+        self.pooler = BERTPooler(config)
 
+    def forward(self, input_ids, token_type_ids, attention_mask):
+        embedding_output = self.embeddings(input_ids, token_type_ids)
+        all_encoder_layers = self.encoder(embedding_output, attention_mask)
+        return all_encoder_layers
-    def forward(self, input_ids, token_type_ids=None, input_mask=None):
-        if input_mask is None:
-            input_mask = torch.ones((batch_size, seq_length), dtype=torch.long)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros((batch_size, seq_length), dtype=torch.long)
-        hidden_states = self.embeddings(input_ids, token_type_ids, input_mask)
-        hidden_states = self.encoder(hidden_states)
-
-        # Perform embedding lookup on the word ids.
-        (self.embedding_output, self.embedding_table) = embedding_lookup(
-            input_ids=input_ids,
-            vocab_size=config.vocab_size,
-            embedding_size=config.hidden_size,
-            initializer_range=config.initializer_range,
-            word_embedding_name="word_embeddings",
-            use_one_hot_embeddings=use_one_hot_embeddings)
-
-        # Add positional embeddings and token type embeddings, then layer
-        # normalize and perform dropout.
-        self.embedding_output = embedding_postprocessor(
-            input_tensor=self.embedding_output,
-            use_token_type=True,
-            token_type_ids=token_type_ids,
-            token_type_vocab_size=config.type_vocab_size,
-            token_type_embedding_name="token_type_embeddings",
-            use_position_embeddings=True,
-            position_embedding_name="position_embeddings",
-            initializer_range=config.initializer_range,
-            max_position_embeddings=config.max_position_embeddings,
-            dropout_prob=config.hidden_dropout_prob)
-
-        with tf.variable_scope("encoder"):
-            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
-            # mask of shape [batch_size, seq_length, seq_length] which is used
-            # for the attention scores.
-            attention_mask = create_attention_mask_from_input_mask(input_ids, input_mask)
-
-            # Run the stacked transformer.
-            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
-            self.all_encoder_layers = transformer_model(
-                input_tensor=self.embedding_output,
-                attention_mask=attention_mask,
-                hidden_size=config.hidden_size,
-                num_hidden_layers=config.num_hidden_layers,
-                num_attention_heads=config.num_attention_heads,
-                intermediate_size=config.intermediate_size,
-                intermediate_act_fn=get_activation(config.hidden_act),
-                hidden_dropout_prob=config.hidden_dropout_prob,
-                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
-                initializer_range=config.initializer_range,
-                do_return_all_layers=True)
-
-            self.sequence_output = self.all_encoder_layers[-1]
-        # The "pooler" converts the encoded sequence tensor of shape
-        # [batch_size, seq_length, hidden_size] to a tensor of shape
-        # [batch_size, hidden_size]. This is necessary for segment-level
-        # (or segment-pair-level) classification tasks where we need a fixed
-        # dimensional representation of the segment.
-        with tf.variable_scope("pooler"):
-            # We "pool" the model by simply taking the hidden state corresponding
-            # to the first token. We assume that this has been pre-trained
-            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
-            self.pooled_output = tf.layers.dense(
-                first_token_tensor,
-                config.hidden_size,
-                activation=tf.tanh,
-                kernel_initializer=create_initializer(config.initializer_range))
-
-    def get_pooled_output(self):
-        return self.pooled_output
-
-    def get_sequence_output(self):
-        """Gets final hidden layer of encoder.
-
-        Returns:
-          float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
-          to the final hidden of the transformer encoder.
-        """
-        return self.sequence_output
-
-    def get_all_encoder_layers(self):
-        return self.all_encoder_layers
-
-    def get_embedding_output(self):
-        """Gets output of the embedding lookup (i.e., input to the transformer).
-
-        Returns:
-          float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
-          to the output of the embedding layer, after summing the word
-          embeddings with the positional embeddings and the token type embeddings,
-          then performing layer normalization. This is the input to the transformer.
-        """
-        return self.embedding_output
-
-    def get_embedding_table(self):
-        return self.embedding_table
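Editor's note: the removed TF comment above describes "pooling" as taking the first token's hidden state and running it through a dense layer with tanh. An illustrative PyTorch sketch of that same operation (assumed names, not the commit's code):

# Illustrative sketch, not part of the commit: first-token pooling in PyTorch.
import torch
import torch.nn as nn

hidden_size = 8
sequence_output = torch.randn(2, 5, hidden_size)        # [batch, seq, hidden]
dense = nn.Linear(hidden_size, hidden_size)
first_token_tensor = sequence_output[:, 0]              # [batch, hidden]
pooled_output = torch.tanh(dense(first_token_tensor))
print(pooled_output.shape)  # torch.Size([2, 8])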
-
-def gelu(input_tensor):
-    """Gaussian Error Linear Unit.
-
-    This is a smoother version of the RELU.
-    Original paper: https://arxiv.org/abs/1606.08415
-
-    Args:
-      input_tensor: float Tensor to perform activation.
-
-    Returns:
-      `input_tensor` with the GELU activation applied.
-    """
-    cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
-    return input_tensor * cdf
-
-
-def get_activation(activation_string):
-    """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.
-
-    Args:
-      activation_string: String name of the activation function.
-
-    Returns:
-      A Python function corresponding to the activation function. If
-      `activation_string` is None, empty, or "linear", this will return None.
-      If `activation_string` is not a string, it will return `activation_string`.
-
-    Raises:
-      ValueError: The `activation_string` does not correspond to a known
-        activation.
-    """
-    # We assume that anything that's not a string is already an activation
-    # function, so we just return it.
-    if not isinstance(activation_string, six.string_types):
-        return activation_string
-
-    if not activation_string:
-        return None
-
-    act = activation_string.lower()
-    if act == "linear":
-        return None
-    elif act == "relu":
-        return tf.nn.relu
-    elif act == "gelu":
-        return gelu
-    elif act == "tanh":
-        return tf.tanh
-    else:
-        raise ValueError("Unsupported activation: %s" % act)
-
-
-def get_assigment_map_from_checkpoint(tvars, init_checkpoint):
-    """Compute the union of the current variables and checkpoint variables."""
-    assignment_map = {}
-    initialized_variable_names = {}
-
-    name_to_variable = collections.OrderedDict()
-    for var in tvars:
-        name = var.name
-        m = re.match("^(.*):\\d+$", name)
-        if m is not None:
-            name = m.group(1)
-        name_to_variable[name] = var
-
-    init_vars = tf.train.list_variables(init_checkpoint)
-
-    assignment_map = collections.OrderedDict()
-    for x in init_vars:
-        (name, var) = (x[0], x[1])
-        if name not in name_to_variable:
-            continue
-        assignment_map[name] = name
-        initialized_variable_names[name] = 1
-        initialized_variable_names[name + ":0"] = 1
-
-    return (assignment_map, initialized_variable_names)
-
-
-def dropout(input_tensor, dropout_prob):
-    """Perform dropout.
-
-    Args:
-      input_tensor: float Tensor.
-      dropout_prob: Python float. The probabiltiy of dropping out a value (NOT of
-        *keeping* a dimension as in `tf.nn.dropout`).
-
-    Returns:
-      A version of `input_tensor` with dropout applied.
-    """
-    if dropout_prob is None or dropout_prob == 0.0:
-        return input_tensor
-
-    output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
-    return output
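Editor's note: the removed docstring above stresses that the TF helper takes a drop probability while tf.nn.dropout takes a keep probability. On the PyTorch side no conversion is needed, as this illustrative sketch (not part of the commit) shows:

# Illustrative sketch, not part of the commit: torch.nn.Dropout already takes the *drop* probability.
import torch

drop = torch.nn.Dropout(p=0.1)     # drops values with probability 0.1
out = drop(torch.randn(2, 4))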
-
-def layer_norm(input_tensor, name=None):
-    """Run layer normalization on the last dimension of the tensor."""
-    return tf.contrib.layers.layer_norm(
-        inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
-
-
-def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
-    """Runs layer normalization followed by dropout."""
-    output_tensor = layer_norm(input_tensor, name)
-    output_tensor = dropout(output_tensor, dropout_prob)
-    return output_tensor
-
-
-def create_initializer(initializer_range=0.02):
-    """Creates a `truncated_normal_initializer` with the given range."""
-    return tf.truncated_normal_initializer(stddev=initializer_range)
-
-
-def embedding_lookup(input_ids,
-                     vocab_size,
-                     embedding_size=128,
-                     initializer_range=0.02,
-                     word_embedding_name="word_embeddings",
-                     use_one_hot_embeddings=False):
-    """Looks up words embeddings for id tensor.
-
-    Args:
-      input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
-        ids.
-      vocab_size: int. Size of the embedding vocabulary.
-      embedding_size: int. Width of the word embeddings.
-      initializer_range: float. Embedding initialization range.
-      word_embedding_name: string. Name of the embedding table.
-      use_one_hot_embeddings: bool. If True, use one-hot method for word
-        embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
-        for TPUs.
-
-    Returns:
-      float Tensor of shape [batch_size, seq_length, embedding_size].
-    """
-    # This function assumes that the input is of shape [batch_size, seq_length,
-    # num_inputs].
-    #
-    # If the input is a 2D tensor of shape [batch_size, seq_length], we
-    # reshape to [batch_size, seq_length, 1].
-    if input_ids.shape.ndims == 2:
-        input_ids = tf.expand_dims(input_ids, axis=[-1])
-
-    embedding_table = tf.get_variable(
-        name=word_embedding_name,
-        shape=[vocab_size, embedding_size],
-        initializer=create_initializer(initializer_range))
-
-    if use_one_hot_embeddings:
-        flat_input_ids = tf.reshape(input_ids, [-1])
-        one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
-        output = tf.matmul(one_hot_input_ids, embedding_table)
-    else:
-        output = tf.nn.embedding_lookup(embedding_table, input_ids)
-
-    input_shape = get_shape_list(input_ids)
-
-    output = tf.reshape(output,
-                        input_shape[0:-1] + [input_shape[-1] * embedding_size])
-    return (output, embedding_table)
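Editor's note: the removed function above points out that a one-hot matmul and a plain embedding lookup produce the same result. An illustrative PyTorch sketch of that equivalence (assumed toy values, not part of the commit):

# Illustrative sketch, not part of the commit: one-hot matmul vs. direct table indexing.
import torch
import torch.nn.functional as F

vocab_size, embedding_size = 6, 4
table = torch.randn(vocab_size, embedding_size)
ids = torch.tensor([[1, 3, 5], [0, 2, 2]])

gather = table[ids]                                                   # like tf.nn.embedding_lookup
one_hot = F.one_hot(ids.reshape(-1), num_classes=vocab_size).float()
matmul = one_hot.matmul(table).reshape(2, 3, embedding_size)
print(torch.allclose(gather, matmul))  # expected: True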
-
-def embedding_postprocessor(input_tensor,
-                            use_token_type=False,
-                            token_type_ids=None,
-                            token_type_vocab_size=16,
-                            token_type_embedding_name="token_type_embeddings",
-                            use_position_embeddings=True,
-                            position_embedding_name="position_embeddings",
-                            initializer_range=0.02,
-                            max_position_embeddings=512,
-                            dropout_prob=0.1):
-    """Performs various post-processing on a word embedding tensor.
-
-    Args:
-      input_tensor: float Tensor of shape [batch_size, seq_length,
-        embedding_size].
-      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
-      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
-        Must be specified if `use_token_type` is True.
-      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
-      token_type_embedding_name: string. The name of the embedding table variable
-        for token type ids.
-      use_position_embeddings: bool. Whether to add position embeddings for the
-        position of each token in the sequence.
-      position_embedding_name: string. The name of the embedding table variable
-        for positional embeddings.
-      initializer_range: float. Range of the weight initialization.
-      max_position_embeddings: int. Maximum sequence length that might ever be
-        used with this model. This can be longer than the sequence length of
-        input_tensor, but cannot be shorter.
-      dropout_prob: float. Dropout probability applied to the final output tensor.
-
-    Returns:
-      float tensor with same shape as `input_tensor`.
-
-    Raises:
-      ValueError: One of the tensor shapes or input values is invalid.
-    """
-    input_shape = get_shape_list(input_tensor, expected_rank=3)
-    batch_size = input_shape[0]
-    seq_length = input_shape[1]
-    width = input_shape[2]
-
-    if seq_length > max_position_embeddings:
-        raise ValueError("The seq length (%d) cannot be greater than "
-                         "`max_position_embeddings` (%d)" %
-                         (seq_length, max_position_embeddings))
-
-    output = input_tensor
-
-    if use_token_type:
-        if token_type_ids is None:
-            raise ValueError("`token_type_ids` must be specified if"
-                             "`use_token_type` is True.")
-        token_type_table = tf.get_variable(
-            name=token_type_embedding_name,
-            shape=[token_type_vocab_size, width],
-            initializer=create_initializer(initializer_range))
-        # This vocab will be small so we always do one-hot here, since it is always
-        # faster for a small vocabulary.
-        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
-        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
-        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
-        token_type_embeddings = tf.reshape(token_type_embeddings,
-                                           [batch_size, seq_length, width])
-        output += token_type_embeddings
-
-    if use_position_embeddings:
-        full_position_embeddings = tf.get_variable(
-            name=position_embedding_name,
-            shape=[max_position_embeddings, width],
-            initializer=create_initializer(initializer_range))
-        # Since the position embedding table is a learned variable, we create it
-        # using a (long) sequence length `max_position_embeddings`. The actual
-        # sequence length might be shorter than this, for faster training of
-        # tasks that do not have long sequences.
-        #
-        # So `full_position_embeddings` is effectively an embedding table
-        # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
-        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
-        # perform a slice.
-        if seq_length < max_position_embeddings:
-            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
-                                           [seq_length, -1])
-        else:
-            position_embeddings = full_position_embeddings
-
-        num_dims = len(output.shape.as_list())
-
-        # Only the last two dimensions are relevant (`seq_length` and `width`), so
-        # we broadcast among the first dimensions, which is typically just
-        # the batch size.
-        position_broadcast_shape = []
-        for _ in range(num_dims - 2):
-            position_broadcast_shape.append(1)
-        position_broadcast_shape.extend([seq_length, width])
-        position_embeddings = tf.reshape(position_embeddings,
-                                         position_broadcast_shape)
-        output += position_embeddings
-
-    output = layer_norm_and_dropout(output, dropout_prob)
-    return output
-
-
-def create_attention_mask_from_input_mask(from_tensor, to_mask):
-    """Create 3D attention mask from a 2D tensor mask.
-
-    Args:
-      from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
-      to_mask: int32 Tensor of shape [batch_size, to_seq_length].
-
-    Returns:
-      float Tensor of shape [batch_size, from_seq_length, to_seq_length].
-    """
-    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
-    batch_size = from_shape[0]
-    from_seq_length = from_shape[1]
-
-    to_shape = get_shape_list(to_mask, expected_rank=2)
-    to_seq_length = to_shape[1]
-
-    to_mask = tf.cast(
-        tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
-
-    # We don't assume that `from_tensor` is a mask (although it could be). We
-    # don't actually care if we attend *from* padding tokens (only *to* padding)
-    # tokens so we create a tensor of all ones.
-    #
-    # `broadcast_ones` = [batch_size, from_seq_length, 1]
-    broadcast_ones = tf.ones(
-        shape=[batch_size, from_seq_length, 1], dtype=tf.float32)
-
-    # Here we broadcast along two dimensions to create the mask.
-    mask = broadcast_ones * to_mask
-
-    return mask
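Editor's note: the removed helper above builds a [batch, from_seq, to_seq] mask by broadcasting a column of ones against the reshaped 2D mask. The same broadcasting trick in PyTorch, as an illustrative sketch with toy values (not part of the commit):

# Illustrative sketch, not part of the commit: 2D padding mask -> 3D attention mask.
import torch

to_mask = torch.tensor([[1, 1, 0], [1, 0, 0]]).float()           # [batch, to_seq]
batch_size, from_seq, to_seq = 2, 3, 3
broadcast_ones = torch.ones(batch_size, from_seq, 1)
mask = broadcast_ones * to_mask.view(batch_size, 1, to_seq)      # [batch, from_seq, to_seq]
print(mask.shape)  # torch.Size([2, 3, 3])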
-
-def attention_layer(from_tensor,
-                    to_tensor,
-                    attention_mask=None,
-                    num_attention_heads=1,
-                    size_per_head=512,
-                    query_act=None,
-                    key_act=None,
-                    value_act=None,
-                    attention_probs_dropout_prob=0.0,
-                    initializer_range=0.02,
-                    do_return_2d_tensor=False,
-                    batch_size=None,
-                    from_seq_length=None,
-                    to_seq_length=None):
-    """Performs multi-headed attention from `from_tensor` to `to_tensor`.
-
-    This is an implementation of multi-headed attention based on "Attention
-    is all you Need". If `from_tensor` and `to_tensor` are the same, then
-    this is self-attention. Each timestep in `from_tensor` attends to the
-    corresponding sequence in `to_tensor`, and returns a fixed-with vector.
-
-    This function first projects `from_tensor` into a "query" tensor and
-    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
-    of tensors of length `num_attention_heads`, where each tensor is of shape
-    [batch_size, seq_length, size_per_head].
-
-    Then, the query and key tensors are dot-producted and scaled. These are
-    softmaxed to obtain attention probabilities. The value tensors are then
-    interpolated by these probabilities, then concatenated back to a single
-    tensor and returned.
-
-    In practice, the multi-headed attention are done with transposes and
-    reshapes rather than actual separate tensors.
-
-    Args:
-      from_tensor: float Tensor of shape [batch_size, from_seq_length,
-        from_width].
-      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
-      attention_mask: (optional) int32 Tensor of shape [batch_size,
-        from_seq_length, to_seq_length]. The values should be 1 or 0. The
-        attention scores will effectively be set to -infinity for any positions in
-        the mask that are 0, and will be unchaged for positions that are 1.
-      num_attention_heads: int. Number of attention heads.
-      size_per_head: int. Size of each attention head.
-      query_act: (optional) Activation function for the query transform.
-      key_act: (optional) Activation function for the key transform.
-      value_act: (optional) Activation function for the value transform.
-      attention_probs_dropout_prob:
-      initializer_range: float. Range of the weight initializer.
-      do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
-        * from_seq_length, num_attention_heads * size_per_head]. If False, the
-        output will be of shape [batch_size, from_seq_length, num_attention_heads
-        * size_per_head].
-      batch_size: (Optional) int. If the input is 2D, this might be the batch size
-        of the 3D version of the `from_tensor` and `to_tensor`.
-      from_seq_length: (Optional) If the input is 2D, this might be the seq length
-        of the 3D version of the `from_tensor`.
-      to_seq_length: (Optional) If the input is 2D, this might be the seq length
-        of the 3D version of the `to_tensor`.
-
-    Returns:
-      float Tensor of shape [batch_size, from_seq_length,
-        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
-        true, this will be of shape [batch_size * from_seq_length,
-        num_attention_heads * size_per_head]).
-
-    Raises:
-      ValueError: Any of the arguments or tensor shapes are invalid.
-    """
-
-    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
-                             seq_length, width):
-        output_tensor = tf.reshape(
-            input_tensor, [batch_size, seq_length, num_attention_heads, width])
-        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
-        return output_tensor
-
-    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
-    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
-
-    if len(from_shape) != len(to_shape):
-        raise ValueError(
-            "The rank of `from_tensor` must match the rank of `to_tensor`.")
-
-    if len(from_shape) == 3:
-        batch_size = from_shape[0]
-        from_seq_length = from_shape[1]
-        to_seq_length = to_shape[1]
-    elif len(from_shape) == 2:
-        if (batch_size is None or from_seq_length is None or to_seq_length is None):
-            raise ValueError(
-                "When passing in rank 2 tensors to attention_layer, the values "
-                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
-                "must all be specified.")
-
-    # Scalar dimensions referenced here:
-    # B = batch size (number of sequences)
-    # F = `from_tensor` sequence length
-    # T = `to_tensor` sequence length
-    # N = `num_attention_heads`
-    # H = `size_per_head`
-
-    from_tensor_2d = reshape_to_matrix(from_tensor)
-    to_tensor_2d = reshape_to_matrix(to_tensor)
-
-    # `query_layer` = [B*F, N*H]
-    query_layer = tf.layers.dense(
-        from_tensor_2d,
-        num_attention_heads * size_per_head,
-        activation=query_act,
-        name="query",
-        kernel_initializer=create_initializer(initializer_range))
-
-    # `key_layer` = [B*T, N*H]
-    key_layer = tf.layers.dense(
-        to_tensor_2d,
-        num_attention_heads * size_per_head,
-        activation=key_act,
-        name="key",
-        kernel_initializer=create_initializer(initializer_range))
-
-    # `value_layer` = [B*T, N*H]
-    value_layer = tf.layers.dense(
-        to_tensor_2d,
-        num_attention_heads * size_per_head,
-        activation=value_act,
-        name="value",
-        kernel_initializer=create_initializer(initializer_range))
-
-    # `query_layer` = [B, N, F, H]
-    query_layer = transpose_for_scores(query_layer, batch_size,
-                                       num_attention_heads, from_seq_length,
-                                       size_per_head)
-
-    # `key_layer` = [B, N, T, H]
-    key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
-                                     to_seq_length, size_per_head)
-
-    # Take the dot product between "query" and "key" to get the raw
-    # attention scores.
-    # `attention_scores` = [B, N, F, T]
-    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
-    attention_scores = tf.multiply(attention_scores,
-                                   1.0 / math.sqrt(float(size_per_head)))
-
-    if attention_mask is not None:
-        # `attention_mask` = [B, 1, F, T]
-        attention_mask = tf.expand_dims(attention_mask, axis=[1])
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
-
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        attention_scores += adder
-
-    # Normalize the attention scores to probabilities.
-    # `attention_probs` = [B, N, F, T]
-    attention_probs = tf.nn.softmax(attention_scores)
-
-    # This is actually dropping out entire tokens to attend to, which might
-    # seem a bit unusual, but is taken from the original Transformer paper.
-    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
-
-    # `value_layer` = [B, T, N, H]
-    value_layer = tf.reshape(
-        value_layer,
-        [batch_size, to_seq_length, num_attention_heads, size_per_head])
-
-    # `value_layer` = [B, N, T, H]
-    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
-
-    # `context_layer` = [B, N, F, H]
-    context_layer = tf.matmul(attention_probs, value_layer)
-
-    # `context_layer` = [B, F, N, H]
-    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
-
-    if do_return_2d_tensor:
-        # `context_layer` = [B*F, N*V]
-        context_layer = tf.reshape(
-            context_layer,
-            [batch_size * from_seq_length, num_attention_heads * size_per_head])
-    else:
-        # `context_layer` = [B, F, N*V]
-        context_layer = tf.reshape(
-            context_layer,
-            [batch_size, from_seq_length, num_attention_heads * size_per_head])
-
-    return context_layer
-
-
-def transformer_model(input_tensor,
-                      attention_mask=None,
-                      hidden_size=768,
-                      num_hidden_layers=12,
-                      num_attention_heads=12,
-                      intermediate_size=3072,
-                      intermediate_act_fn=gelu,
-                      hidden_dropout_prob=0.1,
-                      attention_probs_dropout_prob=0.1,
-                      initializer_range=0.02,
-                      do_return_all_layers=False):
-    """Multi-headed, multi-layer Transformer from "Attention is All You Need".
-
-    This is almost an exact implementation of the original Transformer encoder.
-
-    See the original paper:
-    https://arxiv.org/abs/1706.03762
-
-    Also see:
-    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
-
-    Args:
-      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
-      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
-        seq_length], with 1 for positions that can be attended to and 0 in
-        positions that should not be.
-      hidden_size: int. Hidden size of the Transformer.
-      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
-      num_attention_heads: int. Number of attention heads in the Transformer.
-      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
-        forward) layer.
-      intermediate_act_fn: function. The non-linear activation function to apply
-        to the output of the intermediate/feed-forward layer.
-      hidden_dropout_prob: float. Dropout probability for the hidden layers.
-      attention_probs_dropout_prob: float. Dropout probability of the attention
-        probabilities.
-      initializer_range: float. Range of the initializer (stddev of truncated
-        normal).
-      do_return_all_layers: Whether to also return all layers or just the final
-        layer.
-
-    Returns:
-      float Tensor of shape [batch_size, seq_length, hidden_size], the final
-      hidden layer of the Transformer.
-
-    Raises:
-      ValueError: A Tensor shape or parameter is invalid.
-    """
-    if hidden_size % num_attention_heads != 0:
-        raise ValueError(
-            "The hidden size (%d) is not a multiple of the number of attention "
-            "heads (%d)" % (hidden_size, num_attention_heads))
-
-    attention_head_size = int(hidden_size / num_attention_heads)
-    input_shape = get_shape_list(input_tensor, expected_rank=3)
-    batch_size = input_shape[0]
-    seq_length = input_shape[1]
-    input_width = input_shape[2]
-
-    # The Transformer performs sum residuals on all layers so the input needs
-    # to be the same as the hidden size.
-    if input_width != hidden_size:
-        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
-                         (input_width, hidden_size))
-
-    # We keep the representation as a 2D tensor to avoid re-shaping it back and
-    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
-    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
-    # help the optimizer.
-    prev_output = reshape_to_matrix(input_tensor)
-
-    all_layer_outputs = []
-    for layer_idx in range(num_hidden_layers):
-        with tf.variable_scope("layer_%d" % layer_idx):
-            layer_input = prev_output
-
-            with tf.variable_scope("attention"):
-                attention_heads = []
-                with tf.variable_scope("self"):
-                    attention_head = attention_layer(
-                        from_tensor=layer_input,
-                        to_tensor=layer_input,
-                        attention_mask=attention_mask,
-                        num_attention_heads=num_attention_heads,
-                        size_per_head=attention_head_size,
-                        attention_probs_dropout_prob=attention_probs_dropout_prob,
-                        initializer_range=initializer_range,
-                        do_return_2d_tensor=True,
-                        batch_size=batch_size,
-                        from_seq_length=seq_length,
-                        to_seq_length=seq_length)
-                    attention_heads.append(attention_head)
-
-                attention_output = None
-                if len(attention_heads) == 1:
-                    attention_output = attention_heads[0]
-                else:
-                    # In the case where we have other sequences, we just concatenate
-                    # them to the self-attention head before the projection.
-                    attention_output = tf.concat(attention_heads, axis=-1)
-
-                # Run a linear projection of `hidden_size` then add a residual
-                # with `layer_input`.
-                with tf.variable_scope("output"):
-                    attention_output = tf.layers.dense(
-                        attention_output,
-                        hidden_size,
-                        kernel_initializer=create_initializer(initializer_range))
-                    attention_output = dropout(attention_output, hidden_dropout_prob)
-                    attention_output = layer_norm(attention_output + layer_input)
-
-            # The activation is only applied to the "intermediate" hidden layer.
-            with tf.variable_scope("intermediate"):
-                intermediate_output = tf.layers.dense(
-                    attention_output,
-                    intermediate_size,
-                    activation=intermediate_act_fn,
-                    kernel_initializer=create_initializer(initializer_range))
-
-            # Down-project back to `hidden_size` then add the residual.
-            with tf.variable_scope("output"):
-                layer_output = tf.layers.dense(
-                    intermediate_output,
-                    hidden_size,
-                    kernel_initializer=create_initializer(initializer_range))
-                layer_output = dropout(layer_output, hidden_dropout_prob)
-                layer_output = layer_norm(layer_output + attention_output)
-                prev_output = layer_output
-                all_layer_outputs.append(layer_output)
-
-    if do_return_all_layers:
-        final_outputs = []
-        for layer_output in all_layer_outputs:
-            final_output = reshape_from_matrix(layer_output, input_shape)
-            final_outputs.append(final_output)
-        return final_outputs
-    else:
-        final_output = reshape_from_matrix(prev_output, input_shape)
-        return final_output
-
-
-def get_shape_list(tensor, expected_rank=None, name=None):
-    """Returns a list of the shape of tensor, preferring static dimensions.
-
-    Args:
-      tensor: A tf.Tensor object to find the shape of.
-      expected_rank: (optional) int. The expected rank of `tensor`. If this is
-        specified and the `tensor` has a different rank, and exception will be
-        thrown.
-      name: Optional name of the tensor for the error message.
-
-    Returns:
-      A list of dimensions of the shape of tensor. All static dimensions will
-      be returned as python integers, and dynamic dimensions will be returned
-      as tf.Tensor scalars.
-    """
-    if name is None:
-        name = tensor.name
-
-    if expected_rank is not None:
-        assert_rank(tensor, expected_rank, name)
-
-    shape = tensor.shape.as_list()
-
-    non_static_indexes = []
-    for (index, dim) in enumerate(shape):
-        if dim is None:
-            non_static_indexes.append(index)
-
-    if not non_static_indexes:
-        return shape
-
-    dyn_shape = tf.shape(tensor)
-    for index in non_static_indexes:
-        shape[index] = dyn_shape[index]
-    return shape
-
-
-def reshape_to_matrix(input_tensor):
-    """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
-    ndims = input_tensor.shape.ndims
-    if ndims < 2:
-        raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
-                         (input_tensor.shape))
-    if ndims == 2:
-        return input_tensor
-
-    width = input_tensor.shape[-1]
-    output_tensor = tf.reshape(input_tensor, [-1, width])
-    return output_tensor
-
-
-def reshape_from_matrix(output_tensor, orig_shape_list):
-    """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
-    if len(orig_shape_list) == 2:
-        return output_tensor
-
-    output_shape = get_shape_list(output_tensor)
-
-    orig_dims = orig_shape_list[0:-1]
-    width = output_shape[-1]
-
-    return tf.reshape(output_tensor, orig_dims + [width])
-
-
-def assert_rank(tensor, expected_rank, name=None):
-    """Raises an exception if the tensor rank is not of the expected rank.
-
-    Args:
-      tensor: A tf.Tensor to check the rank of.
-      expected_rank: Python integer or list of integers, expected rank.
-      name: Optional name of the tensor for the error message.
-
-    Raises:
-      ValueError: If the expected shape doesn't match the actual shape.
-    """
-    if name is None:
-        name = tensor.name
-
-    expected_rank_dict = {}
-    if isinstance(expected_rank, six.integer_types):
-        expected_rank_dict[expected_rank] = True
-    else:
-        for x in expected_rank:
-            expected_rank_dict[x] = True
-
-    actual_rank = tensor.shape.ndims
-    if actual_rank not in expected_rank_dict:
-        scope_name = tf.get_variable_scope().name
-        raise ValueError(
-            "For the tensor `%s` in scope `%s`, the actual rank "
-            "`%d` (shape = %s) is not equal to the expected rank `%s`" %
-            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))