ModelZoo / ResNet50_tensorflow / Commits / b0218894

Commit b0218894, authored Sep 28, 2020 by Allen Wang, committed by A. Unique TensorFlower on Sep 28, 2020.

Internal change

PiperOrigin-RevId: 334233893
Parent: 8a670c65

Changes: 5 files, showing 264 additions and 321 deletions (+264, -321).
official/nlp/xlnet/data_utils.py           +2    -5
official/nlp/xlnet/run_classifier.py       +1    -1
official/nlp/xlnet/training_utils.py       +1    -1
official/nlp/xlnet/xlnet_modeling.py       +260  -261
official/nlp/xlnet/xlnet_modeling_test.py  +0    -53
official/nlp/xlnet/data_utils.py

@@ -95,7 +95,6 @@ def file_based_input_fn_builder(input_file, name_to_features, batch_size,
     d = d.interleave(
         tf.data.TFRecordDataset,
-        sloppy=is_training,
         cycle_length=cycle_length)
   if is_training:
@@ -495,7 +494,7 @@ def create_pretrain_dataset(file_names,
   # reshape back to fixed shape
   example["perm_mask"] = tf.reshape(perm_mask, [seq_len, seq_len])
-  example["input_k"] = tf.reshape(input_k, [seq_len])
+  example["input_ids"] = tf.reshape(input_k, [seq_len])
   example["input_q"] = tf.reshape(input_q, [seq_len])
   # Directly use raw inputs as the target
@@ -718,11 +717,9 @@ def parse_files_to_dataset(parser,
   cycle_length = min(8, len(file_paths))
   logging.info("Interleave %d files", cycle_length)
-  # `sloppy` mode means that the interleaving is not exact. This adds
-  # even more randomness to the training pipeline.
   dataset = dataset.apply(
       tf.data.experimental.parallel_interleave(
-          tf.data.TFRecordDataset, sloppy=True, cycle_length=cycle_length))
+          tf.data.TFRecordDataset, cycle_length=cycle_length))
   buffer_size = 2048
   logging.info("Perform sample-level shuffle with size %d", buffer_size)
   dataset = dataset.shuffle(buffer_size=buffer_size)
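Note: Dataset.interleave itself has no `sloppy` argument; that flag belongs to the now-deprecated tf.data.experimental.parallel_interleave, which is why it is dropped in both hunks above. As a minimal sketch (not part of this commit), the same order-relaxed behaviour can be expressed with the core interleave API in TF 2.2+, where deterministic=False plays the role sloppy=True used to play; the shard names below are placeholders:

import tensorflow as tf

file_paths = ["shard-0.tfrecord", "shard-1.tfrecord"]  # placeholder shard names
d = tf.data.Dataset.from_tensor_slices(file_paths)
# Parallel interleave with relaxed ordering; equivalent in spirit to the
# removed sloppy=True flag of tf.data.experimental.parallel_interleave.
d = d.interleave(
    tf.data.TFRecordDataset,
    cycle_length=8,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
    deterministic=False)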
official/nlp/xlnet/run_classifier.py

@@ -155,7 +155,7 @@ def main(unused_argv):
       adam_epsilon=FLAGS.adam_epsilon)
   model_config = xlnet_config.XLNetConfig(FLAGS)
   run_config = xlnet_config.create_run_config(True, False, FLAGS)
-  model_fn = functools.partial(get_classificationxlnet_model, model_config,
+  model_fn = functools.partial(modeling.classification_model, model_config,
                                run_config, FLAGS.n_class, FLAGS.summary_type)
   input_meta_data = {}
   input_meta_data["d_model"] = FLAGS.d_model
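Note: the only change here is which model constructor gets pre-bound into model_fn. functools.partial fixes the leading arguments and leaves the rest to be supplied later; a toy illustration with a stand-in builder (all names here are hypothetical):

import functools

def build_model(model_config, run_config, n_class, summary_type):
  # Stand-in for the real XLNet classification model constructor.
  return (model_config, run_config, n_class, summary_type)

model_fn = functools.partial(build_model, "model_config", "run_config")
print(model_fn(2, "last"))  # ('model_config', 'run_config', 2, 'last')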
official/nlp/xlnet/training_utils.py

@@ -213,8 +213,8 @@ def train(
   if input_meta_data["mem_len"] > 0:
     for _ in range(input_meta_data["n_layer"]):
       zeros = tf.zeros([
-          input_meta_data["mem_len"], input_meta_data["batch_size_per_core"],
+          input_meta_data["batch_size_per_core"], input_meta_data["mem_len"],
           input_meta_data["d_model"]
       ],
                        dtype=tf.float32)
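Note: the swap above changes the initial memory state for each layer from a time-major [mem_len, batch_size_per_core, d_model] tensor to a batch-major [batch_size_per_core, mem_len, d_model] one, matching the batch-major layout the refactored xlnet_modeling.py expects. A minimal sketch with toy sizes:

import tensorflow as tf

input_meta_data = {"mem_len": 4, "batch_size_per_core": 2,
                   "d_model": 8, "n_layer": 3}  # toy values

mems = []
if input_meta_data["mem_len"] > 0:
  for _ in range(input_meta_data["n_layer"]):
    # New shape order: [batch_size_per_core, mem_len, d_model].
    zeros = tf.zeros([
        input_meta_data["batch_size_per_core"], input_meta_data["mem_len"],
        input_meta_data["d_model"]
    ], dtype=tf.float32)
    mems.append(zeros)
print([m.shape.as_list() for m in mems])  # [[2, 4, 8], [2, 4, 8], [2, 4, 8]]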
official/nlp/xlnet/xlnet_modeling.py

@@ -17,6 +17,8 @@
 import copy
 import tensorflow as tf
+from official.nlp.modeling import networks
 from official.nlp.xlnet import data_utils
@@ -24,6 +26,18 @@ def gelu(x):
   return tf.keras.activations.gelu(x, approximate=True)


+def _get_initializer(flags):
+  """Get variable initializer."""
+  if flags.init_method == "uniform":
+    initializer = tf.keras.initializers.RandomUniform(
+        minval=-flags.init_range, maxval=flags.init_range)
+  elif flags.init_method == "normal":
+    initializer = tf.keras.initializers.RandomNormal(stddev=flags.init_std)
+  else:
+    raise ValueError("Initializer {} not supported".format(flags.init_method))
+  return initializer
+
+
 def rel_shift(x, klen=-1):
   """Performs relative shift to form the relative attention score."""
   x_size = tf.shape(x)
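Note: the relocated _get_initializer simply maps two flag values onto Keras initializers. A usage sketch with a hypothetical flag container (the real caller passes the XLNet run configuration object):

import types
import tensorflow as tf

# Hypothetical flag object for illustration only.
flags = types.SimpleNamespace(init_method="normal", init_range=0.1, init_std=0.02)

if flags.init_method == "uniform":
  initializer = tf.keras.initializers.RandomUniform(
      minval=-flags.init_range, maxval=flags.init_range)
elif flags.init_method == "normal":
  initializer = tf.keras.initializers.RandomNormal(stddev=flags.init_std)
else:
  raise ValueError("Initializer {} not supported".format(flags.init_method))

print(initializer(shape=(2, 2)))  # 2x2 sample drawn from a normal with stddev 0.02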
@@ -36,18 +50,6 @@ def rel_shift(x, klen=-1):
   return x


-def _get_initializer(flags):
-  """Get variable initializer."""
-  if flags.init_method == 'uniform':
-    initializer = tf.keras.initializers.RandomUniform(
-        minval=-flags.init_range, maxval=flags.init_range)
-  elif flags.init_method == 'normal':
-    initializer = tf.keras.initializers.RandomNormal(stddev=flags.init_std)
-  else:
-    raise ValueError('Initializer {} not supported'.format(flags.init_method))
-  return initializer
-
-
 def _create_mask(qlen, mlen, dtype=tf.float32, same_length=False):
   """Creates attention mask when single-side context allowed only."""
   attn_mask = tf.ones([qlen, qlen], dtype=dtype)
@@ -84,7 +86,7 @@ def is_special_none_tensor(tensor):
   return tensor.shape.ndims == 0 and tensor.dtype == tf.int32


-@tf.keras.utils.register_keras_serializable(package='Text')
+@tf.keras.utils.register_keras_serializable(package="Text")
 class RelativePositionEncoding(tf.keras.layers.Layer):
   """Creates a relative positional encoding.
@@ -121,7 +123,7 @@ class RelativePositionEncoding(tf.keras.layers.Layer):
       [len(pos_seq), batch_size, hidden_size] if batch_size is provided, else
       [len(pos_seq), 1, hidden_size].
     """
-    sinusoid_input = tf.einsum('i,d->id', pos_seq, self._inv_freq)
+    sinusoid_input = tf.einsum("i,d->id", pos_seq, self._inv_freq)
     pos_emb = tf.concat([tf.sin(sinusoid_input), tf.cos(sinusoid_input)], -1)
     pos_emb = pos_emb[:, None, :]
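Note: the einsum above builds the Transformer-XL sinusoid table, sinusoid_input[i, d] = pos_seq[i] * inv_freq[d], and the sin/cos halves are concatenated on the last axis. A numpy sketch of the same computation for the low-dimensional case exercised by the (deleted) unit test at the bottom of this page, assuming the standard inverse-frequency formula that yields inv_freq = [1., 0.01] for d_model = 4:

import numpy as np

d_model = 4
pos_seq = np.array([1.0, 0.0])
inv_freq = 1.0 / (10000.0 ** (np.arange(0, d_model, 2) / d_model))  # [1.0, 0.01]

sinusoid_input = np.einsum("i,d->id", pos_seq, inv_freq)
pos_emb = np.concatenate([np.sin(sinusoid_input), np.cos(sinusoid_input)], -1)
print(pos_emb)
# [[0.84147098 0.00999983 0.54030231 0.99995   ]
#  [0.         0.         1.         1.        ]]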
@@ -151,17 +153,17 @@ class RelativeAttention(tf.keras.layers.Layer):
     """Implements call() for the layer."""
     # content based attention score
-    ac = tf.einsum('ibnd,jbnd->ijbn', q_head + r_w_bias, k_head_h)
+    ac = tf.einsum("ibnd,jbnd->ijbn", q_head + r_w_bias, k_head_h)

     # position based attention score
-    bd = tf.einsum('ibnd,jbnd->ijbn', q_head + r_r_bias, k_head_r)
+    bd = tf.einsum("ibnd,jbnd->ijbn", q_head + r_r_bias, k_head_r)
     bd = rel_shift(bd, klen=tf.shape(ac)[1])

     # segment-based attention score
     if seg_mat is None:
       ef = 0
     else:
-      ef = tf.einsum('ibnd,snd->isbn', q_head + r_s_bias, seg_embed)
+      ef = tf.einsum("ibnd,snd->isbn", q_head + r_s_bias, seg_embed)
       tgt_shape = tf.shape(bd)
       ef = tf.where(
           tf.broadcast_to(tf.expand_dims(seg_mat, 3), tgt_shape),
@@ -178,7 +180,7 @@ class RelativeAttention(tf.keras.layers.Layer):
     attn_prob = self.attention_probs_dropout(attn_prob)

     # attention output
-    attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h)
+    attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h)

     return attn_vec
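Note: the einsum subscripts in this class follow the Transformer-XL convention: i/j index query/key positions, b the batch, n the attention heads, d the per-head depth, so "ibnd,jbnd->ijbn" is a batched query-key dot product. A shape-only sketch with toy dimensions:

import tensorflow as tf

qlen, klen, batch, heads, depth = 3, 5, 2, 4, 8
q_head = tf.random.normal([qlen, batch, heads, depth])
k_head_h = tf.random.normal([klen, batch, heads, depth])

# Content-based attention score, as computed in RelativeAttention.call().
ac = tf.einsum("ibnd,jbnd->ijbn", q_head, k_head_h)
print(ac.shape)  # (3, 5, 2, 4) == [qlen, klen, batch, heads]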
@@ -197,29 +199,29 @@ class PositionwiseFF(tf.keras.layers.Layer):
   def build(self, unused_input_shapes):
     """Implements build() for the layer."""
-    if self.activation_type == 'relu':
+    if self.activation_type == "relu":
       activation = tf.nn.relu
-    elif self.activation_type == 'gelu':
+    elif self.activation_type == "gelu":
       activation = gelu
     else:
-      raise (ValueError('Unsupported activation type {}'.format(
+      raise (ValueError("Unsupported activation type {}".format(
           self.activation_type)))
     self.inner_projection_layer = (
         tf.keras.layers.Dense(
             units=self.d_inner,
             activation=activation,
             kernel_initializer=self.kernel_initializer,
-            name='layer_1'))
+            name="layer_1"))
     self.output_projection_layer = (
         tf.keras.layers.Dense(
             units=self.d_model,
             kernel_initializer=self.kernel_initializer,
-            name='layer_2'))
+            name="layer_2"))
     self.output_dropout = tf.keras.layers.Dropout(
-        rate=self.dropout, name='drop_2')
+        rate=self.dropout, name="drop_2")
     self.output_layer_norm = (
         tf.keras.layers.LayerNormalization(
-            name='LayerNorm', axis=-1, epsilon=1e-12))
+            name="LayerNorm", axis=-1, epsilon=1e-12))
     super(PositionwiseFF, self).build(unused_input_shapes)

   def call(self, inp):
@@ -244,7 +246,7 @@ class EmbeddingLookup(tf.keras.layers.Layer):
   def build(self, unused_input_shapes):
     """Implements build() for the layer."""
     self.lookup_table = self.add_weight(
-        'lookup_table',
+        "lookup_table",
         shape=[self.n_token, self.d_embed],
         initializer=self.initializer,
         dtype=self.dtype)
@@ -273,22 +275,22 @@ class RelativeMultiheadAttention(tf.keras.layers.Layer):
     self.scale = 1.0 / (self.d_head**0.5)
     self.output_layer_norm = tf.keras.layers.LayerNormalization(
-        name='LayerNorm', axis=-1, epsilon=1e-12)
+        name="LayerNorm", axis=-1, epsilon=1e-12)
     self.kh_projection_layer = self.add_weight(
-        'k/kernel',
+        "k/kernel",
         shape=[self.d_model, self.n_head, self.d_head],
         initializer=self.initializer)
     self.vh_projection_layer = self.add_weight(
-        'v/kernel',
+        "v/kernel",
         shape=[self.d_model, self.n_head, self.d_head],
         initializer=self.initializer)
     self.kr_projection_layer = self.add_weight(
-        'r/kernel',
+        "r/kernel",
         shape=[self.d_model, self.n_head, self.d_head],
         initializer=self.initializer)
     self.qh_projection_layer = self.add_weight(
-        'q/kernel',
+        "q/kernel",
         shape=[self.d_model, self.n_head, self.d_head],
         initializer=self.initializer)
@@ -296,7 +298,7 @@ class RelativeMultiheadAttention(tf.keras.layers.Layer):
         dropout_att=self.dropout_att,
         scale=self.scale)
     self.proj_o = self.add_weight(
-        'o/kernel',
+        "o/kernel",
         shape=[self.d_model, self.n_head, self.d_head],
         initializer=self.initializer)
@@ -314,12 +316,12 @@ class RelativeMultiheadAttention(tf.keras.layers.Layer):
       cat = h

     # content heads
-    q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.qh_projection_layer)
-    k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.kh_projection_layer)
-    v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.vh_projection_layer)
+    q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.qh_projection_layer)
+    k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.kh_projection_layer)
+    v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.vh_projection_layer)

     # positional heads
-    k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.kr_projection_layer)
+    k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.kr_projection_layer)

     # core attention ops
     attn_vec_h = self.relative_attention_layer(q_head_h, k_head_h, v_head_h,
@@ -328,21 +330,21 @@ class RelativeMultiheadAttention(tf.keras.layers.Layer):
                                                attn_mask_h)

     # post processing
-    output_h = tf.einsum('ibnd,hnd->ibh', attn_vec_h, self.proj_o)
+    output_h = tf.einsum("ibnd,hnd->ibh", attn_vec_h, self.proj_o)
     output_h = self.attention_dropout(output_h)
     output_h = self.output_layer_norm(output_h + h)

     output_g = None
     if g is not None:  # enable two-stream attention
       # g-stream
-      q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.qh_projection_layer)
+      q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.qh_projection_layer)
       if target_mapping is not None:
-        q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
+        q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping)
         attn_vec_g = self.relative_attention_layer(q_head_g, k_head_h, v_head_h,
                                                    k_head_r, seg_embed, seg_mat,
                                                    r_w_bias, r_r_bias, r_s_bias,
                                                    attn_mask_g)
-        attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
+        attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping)
       else:
         attn_vec_g = self.relative_attention_layer(q_head_g, k_head_h, v_head_h,
@@ -351,7 +353,7 @@ class RelativeMultiheadAttention(tf.keras.layers.Layer):
                                                    attn_mask_g)

       # post processing
-      output_g = tf.einsum('ibnd,hnd->ibh', attn_vec_g, self.proj_o)
+      output_g = tf.einsum("ibnd,hnd->ibh", attn_vec_g, self.proj_o)
       output_g = self.attention_dropout(output_g)
       output_g = self.output_layer_norm(output_g + g)
@@ -380,7 +382,7 @@ class TransformerXLModel(tf.keras.layers.Layer):
                untie_r=False,
                use_tpu=True,
                reuse_len=None,
-               ff_activation='relu',
+               ff_activation="relu",
                use_cls_mask=False,
                **kwargs):
     """Initializes TransformerXLModel.
@@ -445,7 +447,7 @@ class TransformerXLModel(tf.keras.layers.Layer):
         d_embed=self.d_model,
         initializer=self.initializer,
         dtype=self.tf_float,
-        name='word_embedding')
+        name="word_embedding")
     self.h_dropout = tf.keras.layers.Dropout(rate=self.dropout)
     self.g_dropout = tf.keras.layers.Dropout(rate=self.dropout)
@@ -453,48 +455,48 @@ class TransformerXLModel(tf.keras.layers.Layer):
     if self.untie_r:
       self.r_w_bias = (
           self.add_weight(
-              'r_w_bias',
+              "r_w_bias",
               shape=[self.n_layer, self.n_head, self.d_head],
               dtype=self.tf_float,
               initializer=self.initializer))
       self.r_r_bias = (
           self.add_weight(
-              'r_r_bias',
+              "r_r_bias",
               shape=[self.n_layer, self.n_head, self.d_head],
               dtype=self.tf_float,
               initializer=self.initializer))
       self.r_s_bias = (
           self.add_weight(
-              'r_s_bias',
+              "r_s_bias",
               shape=[self.n_layer, self.n_head, self.d_head],
               dtype=self.tf_float,
               initializer=self.initializer))
     else:
       self.r_w_bias = (
           self.add_weight(
-              'r_w_bias',
+              "r_w_bias",
               shape=[self.n_head, self.d_head],
               dtype=self.tf_float,
               initializer=self.initializer))
       self.r_r_bias = (
           self.add_weight(
-              'r_r_bias',
+              "r_r_bias",
               shape=[self.n_head, self.d_head],
               dtype=self.tf_float,
               initializer=self.initializer))
       self.r_s_bias = (
           self.add_weight(
-              'r_s_bias', [self.n_head, self.d_head],
+              "r_s_bias", [self.n_head, self.d_head],
               dtype=self.tf_float,
               initializer=self.initializer))

     self.seg_embed = self.add_weight(
-        'seg_embed', [self.n_layer, 2, self.n_head, self.d_head],
+        "seg_embed", [self.n_layer, 2, self.n_head, self.d_head],
         dtype=self.tf_float,
         initializer=self.initializer)

     self.mask_emb = self.add_weight(
-        'mask_emb/mask_emb', shape=[1, 1, self.d_model], dtype=self.tf_float)
+        "mask_emb/mask_emb", shape=[1, 1, self.d_model], dtype=self.tf_float)

     self.emb_dropout = tf.keras.layers.Dropout(rate=self.dropout)
     self.fwd_position_embedding = RelativePositionEncoding(self.d_model)
@@ -511,7 +513,7 @@ class TransformerXLModel(tf.keras.layers.Layer):
             d_head=self.d_head,
             dropout_att=self.dropout_att,
             kernel_initializer=self.initializer,
-            name='layer_%d/rel_attn' % (i)))
+            name="layer_%d/rel_attn" % (i)))
     self.h_positionwise_ffn_layers.append(
         PositionwiseFF(
             d_model=self.d_model,
@@ -519,7 +521,7 @@ class TransformerXLModel(tf.keras.layers.Layer):
             dropout=self.dropout,
             kernel_initializer=self.initializer,
             activation_type=self.ff_activation,
-            name='layer_%d/ff' % (i)))
+            name="layer_%d/ff" % (i)))

     self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout)
@@ -537,25 +539,25 @@ class TransformerXLModel(tf.keras.layers.Layer):
     # Uses dict to feed inputs into call() in order to keep mems as a python
     # list.
     inputs = {
-        'inp_k': inp_k,
-        'seg_id': seg_id,
-        'input_mask': input_mask,
-        'mems': mems,
-        'perm_mask': perm_mask,
-        'target_mapping': target_mapping,
-        'inp_q': inp_q
+        "inp_k": inp_k,
+        "seg_id": seg_id,
+        "input_mask": input_mask,
+        "mems": mems,
+        "perm_mask": perm_mask,
+        "target_mapping": target_mapping,
+        "inp_q": inp_q
     }
     return super(TransformerXLModel, self).__call__(inputs, **kwargs)

   def call(self, inputs):
     """Implements call() for the layer."""
-    inp_k = inputs['inp_k']
-    seg_id = inputs['seg_id']
-    input_mask = inputs['input_mask']
-    mems = inputs['mems']
-    perm_mask = inputs['perm_mask']
-    target_mapping = inputs['target_mapping']
-    inp_q = inputs['inp_q']
+    inp_k = inputs["inp_k"]
+    seg_id = inputs["seg_id"]
+    input_mask = inputs["input_mask"]
+    mems = inputs["mems"]
+    perm_mask = inputs["perm_mask"]
+    target_mapping = inputs["target_mapping"]
+    inp_q = inputs["inp_q"]

     new_mems = []
@@ -568,14 +570,14 @@ class TransformerXLModel(tf.keras.layers.Layer):
     ##### Attention mask
     # causal attention mask
-    if self.attn_type == 'uni':
+    if self.attn_type == "uni":
       attn_mask = _create_mask(qlen, mlen, self.tf_float, self.same_length)
       # pylint: enable=protected-access
       attn_mask = attn_mask[:, :, None, None]
-    elif self.attn_type == 'bi':
+    elif self.attn_type == "bi":
       attn_mask = None
     else:
-      raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
+      raise ValueError("Unsupported attention type: {}".format(self.attn_type))

     # data mask: input mask & perm mask
     if input_mask is not None and perm_mask is not None:
@@ -652,12 +654,12 @@ class TransformerXLModel(tf.keras.layers.Layer):
     if dtype is not None and dtype != tf.float32:
       freq_seq = tf.cast(freq_seq, dtype=self.dtype)

-    if self.attn_type == 'bi':
+    if self.attn_type == "bi":
       beg, end = klen, -qlen
-    elif self.attn_type == 'uni':
+    elif self.attn_type == "uni":
       beg, end = klen, -1
     else:
-      raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
+      raise ValueError("Unknown `attn_type` {}.".format(self.attn_type))

     if self.bi_data:
       fwd_pos_seq = tf.range(beg, end, -1.0)
@@ -749,73 +751,69 @@ class PretrainingXLNetModel(tf.keras.Model):
     self.initializer = _get_initializer(run_config)
     self.xlnet_config = copy.deepcopy(xlnet_config)

-    self.transformerxl_model = TransformerXLModel(
-        n_token=self.xlnet_config.n_token,
+    self.xlnet_model = networks.XLNetBase(
+        vocab_size=self.xlnet_config.n_token,
         initializer=self.initializer,
-        attn_type='bi',
-        n_layer=self.xlnet_config.n_layer,
-        d_model=self.xlnet_config.d_model,
-        n_head=self.xlnet_config.n_head,
-        d_head=self.xlnet_config.d_head,
-        d_inner=self.xlnet_config.d_inner,
-        ff_activation=self.xlnet_config.ff_activation,
-        untie_r=self.xlnet_config.untie_r,
-        is_training=self.run_config.is_training,
-        use_tpu=self.run_config.use_tpu,
-        dropout=self.run_config.dropout,
-        dropout_att=self.run_config.dropout_att,
-        mem_len=self.run_config.mem_len,
-        reuse_len=self.run_config.reuse_len,
+        attention_type="bi",
+        num_layers=self.xlnet_config.n_layer,
+        hidden_size=self.xlnet_config.d_model,
+        num_attention_heads=self.xlnet_config.n_head,
+        head_size=self.xlnet_config.d_head,
+        inner_size=self.xlnet_config.d_inner,
+        two_stream=True,
+        tie_attention_biases=not self.xlnet_config.untie_r,
+        inner_activation=self.xlnet_config.ff_activation,
+        dropout_rate=self.run_config.dropout,
+        attention_dropout_rate=self.run_config.dropout_att,
+        memory_length=self.run_config.mem_len,
+        reuse_length=self.run_config.reuse_len,
         bi_data=self.run_config.bi_data,
-        clamp_len=self.run_config.clamp_len,
+        clamp_length=self.run_config.clamp_len,
         same_length=self.run_config.same_length,
         use_cls_mask=self.run_config.use_cls_mask,
-        name='transformer')
+        name="xlnet_model")

     self.lmloss_layer = LMLossLayer(
-        n_token=self.xlnet_config.n_token,
-        d_model=self.xlnet_config.d_model,
+        vocab_size=self.xlnet_config.n_token,
+        hidden_size=self.xlnet_config.d_model,
         initializer=self.initializer,
         tie_weight=True,
         bi_data=self.run_config.bi_data,
-        use_tpu=self.run_config.use_tpu,
+        use_one_hot=self.run_config.use_tpu,
         use_proj=use_proj,
-        name='lm_loss')
+        name="lm_loss")

   def call(self, features):
     """Implements call() for the layer."""

-    input_ids = tf.transpose(features['input_k'], [1, 0])
-    inp_q = tf.transpose(features['input_q'], [1, 0])
-    seg_ids = tf.transpose(features['seg_id'], [1, 0])
-
-    perm_mask = tf.transpose(features['perm_mask'], [1, 2, 0])
-    target_mapping = tf.transpose(features['target_mapping'], [1, 2, 0])
+    input_ids = features["input_ids"]
+    masked_tokens = features["input_q"]
+    seg_ids = features["seg_id"]
+    perm_mask = features["perm_mask"]
+    target_mapping = features["target_mapping"]

     # target for LM loss
-    target = tf.transpose(features['target'], [1, 0])
+    target = features["target"]

     # target mask for LM loss
-    tgt_mask = tf.transpose(features['target_mask'], [1, 0])
+    tgt_mask = features["target_mask"]

-    mems = features.get('mems', None)
+    mems = features.get("mems", None)

-    transformerxl_output, self.new_mems, self.lookup_table = self.transformerxl_model(
-        input_ids,
-        seg_id=seg_ids,
+    model_output, self.new_mems = self.xlnet_model(
+        input_ids=input_ids,
+        segment_ids=seg_ids,
         input_mask=None,
-        mems=mems,
-        perm_mask=perm_mask,
+        state=mems,
+        permutation_mask=perm_mask,
         target_mapping=target_mapping,
-        inp_q=inp_q)
+        masked_tokens=masked_tokens)
     lm_loss, _ = self.lmloss_layer(
-        hidden=transformerxl_output,
+        hidden=model_output,
         target=target,
-        lookup_table=self.transformerxl_model.embedding_lookup.lookup_table,
+        lookup_table=self.xlnet_model.get_embedding_lookup_table(),
         target_mask=tgt_mask)
     self.add_loss(lm_loss)
-    return self.new_mems, transformerxl_output
+    return self.new_mems, model_output


 class ClassificationXLNetModel(tf.keras.Model):
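Note: the hunk above swaps the local TransformerXLModel for networks.XLNetBase and renames the constructor keywords one for one. Collected from the diff itself (not from the XLNetBase documentation), the visible renames are:

# Old TransformerXLModel kwarg -> new networks.XLNetBase kwarg
KWARG_RENAMES = {
    "n_token": "vocab_size",
    "attn_type": "attention_type",
    "n_layer": "num_layers",
    "d_model": "hidden_size",
    "n_head": "num_attention_heads",
    "d_head": "head_size",
    "d_inner": "inner_size",
    "ff_activation": "inner_activation",
    "untie_r": "tie_attention_biases",  # note: the value is negated
    "dropout": "dropout_rate",
    "dropout_att": "attention_dropout_rate",
    "mem_len": "memory_length",
    "reuse_len": "reuse_length",
    "clamp_len": "clamp_length",
}
# is_training and use_tpu are dropped, and two_stream=True is passed
# explicitly for the pretraining (two-stream attention) model.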
@@ -831,58 +829,57 @@ class ClassificationXLNetModel(tf.keras.Model):
     self.initializer = _get_initializer(run_config)
     self.xlnet_config = copy.deepcopy(xlnet_config)

-    self.transformerxl_model = TransformerXLModel(
-        n_token=self.xlnet_config.n_token,
+    self.xlnet_model = networks.XLNetBase(
+        vocab_size=self.xlnet_config.n_token,
         initializer=self.initializer,
-        attn_type='bi',
-        n_layer=self.xlnet_config.n_layer,
-        d_model=self.xlnet_config.d_model,
-        n_head=self.xlnet_config.n_head,
-        d_head=self.xlnet_config.d_head,
-        d_inner=self.xlnet_config.d_inner,
-        ff_activation=self.xlnet_config.ff_activation,
-        untie_r=self.xlnet_config.untie_r,
-        is_training=self.run_config.is_training,
-        use_tpu=self.run_config.use_tpu,
-        dropout=self.run_config.dropout,
-        dropout_att=self.run_config.dropout_att,
-        mem_len=self.run_config.mem_len,
-        reuse_len=self.run_config.reuse_len,
+        attention_type="bi",
+        num_layers=self.xlnet_config.n_layer,
+        hidden_size=self.xlnet_config.d_model,
+        num_attention_heads=self.xlnet_config.n_head,
+        head_size=self.xlnet_config.d_head,
+        inner_size=self.xlnet_config.d_inner,
+        two_stream=False,
+        tie_attention_biases=not self.xlnet_config.untie_r,
+        inner_activation=self.xlnet_config.ff_activation,
+        dropout_rate=self.run_config.dropout,
+        attention_dropout_rate=self.run_config.dropout_att,
+        memory_length=self.run_config.mem_len,
+        reuse_length=self.run_config.reuse_len,
         bi_data=self.run_config.bi_data,
-        clamp_len=self.run_config.clamp_len,
+        clamp_length=self.run_config.clamp_len,
         same_length=self.run_config.same_length,
-        name='transformer')
+        use_cls_mask=False,
+        name="xlnet_model")

     self.summarization_layer = Summarization(
-        d_model=self.xlnet_config.d_model,
-        n_head=self.xlnet_config.n_head,
-        d_head=self.xlnet_config.d_head,
-        dropout=self.run_config.dropout,
-        dropout_att=self.run_config.dropout_att,
+        hidden_size=self.xlnet_config.d_model,
+        num_attention_heads=self.xlnet_config.n_head,
+        head_size=self.xlnet_config.d_head,
+        dropout_rate=self.run_config.dropout,
+        attention_dropout_rate=self.run_config.dropout_att,
         initializer=self.initializer,
         use_proj=True,
         summary_type=summary_type,
-        name='sequence_summary')
+        name="sequence_summary")

     self.cl_loss_layer = ClassificationLossLayer(
-        n_class=n_class, initializer=self.initializer, name='classification')
+        n_class=n_class, initializer=self.initializer, name="classification")

   def call(self, features):
     """Implements call() for the layer."""
-    bsz_per_core = tf.shape(features['input_ids'])[0]
+    batch_size_per_core = tf.shape(features["input_ids"])[0]

-    input_ids = tf.transpose(features['input_ids'], [1, 0])
-    seg_ids = tf.transpose(features['segment_ids'], [1, 0])
-    input_mask = tf.transpose(features['input_mask'], [1, 0])
+    input_ids = features["input_ids"]
+    segment_ids = features["segment_ids"]
+    input_mask = features["input_mask"]

-    label = tf.reshape(features['label_ids'], [bsz_per_core])
+    label = tf.reshape(features["label_ids"], [batch_size_per_core])

-    mems = features.get('mems', None)
+    mems = features.get("mems", None)

-    transformerxl_output, new_mems, self.lookup_table = (
-        self.transformerxl_model(input_ids, seg_ids, input_mask, mems))
+    attention_output, new_mems = (
+        self.xlnet_model(input_ids, segment_ids, input_mask, mems))

-    summary = self.summarization_layer(transformerxl_output)
+    summary = self.summarization_layer(attention_output)
     per_example_loss, logits = self.cl_loss_layer(hidden=summary, labels=label)
     self.add_loss(tf.keras.backend.mean(per_example_loss))
     return new_mems, logits
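Note: call() no longer transposes the feature tensors, so everything stays batch-major ([batch, seq_len, ...]) from the input pipeline through networks.XLNetBase, and the encoder now returns two values instead of three. A small sketch of the feature handling with toy shapes:

import tensorflow as tf

# Toy classification features in the batch-major layout used after this change.
features = {
    "input_ids": tf.zeros([2, 6], tf.int32),     # [batch, seq_len]
    "segment_ids": tf.zeros([2, 6], tf.int32),
    "input_mask": tf.zeros([2, 6], tf.float32),
    "label_ids": tf.constant([0, 1]),
}
batch_size_per_core = tf.shape(features["input_ids"])[0]
label = tf.reshape(features["label_ids"], [batch_size_per_core])
print(int(batch_size_per_core), label.shape)  # 2 (2,)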
@@ -892,56 +889,57 @@ class LMLossLayer(tf.keras.layers.Layer):
   """Layer computing cross entropy loss for language modeling."""

   def __init__(self,
-               n_token,
-               d_model,
+               vocab_size,
+               hidden_size,
                initializer,
                tie_weight=False,
                bi_data=True,
-               use_tpu=False,
+               use_one_hot=False,
                use_proj=False,
                **kwargs):
     """Constructs LMLoss layer.

     Args:
-      n_token: Number of tokens in vocabulary.
-      d_model: The dimension of model hidden state.
+      vocab_size: Number of tokens in vocabulary.
+      hidden_size: The dimension of model hidden state.
       initializer: Initializer used for parameters.
       tie_weight: Whether to share weights between embedding lookup layer and
         next-token prediction layer.
       bi_data: Whether to use bidirectional input pipeline. Usually set to True
         during pretraining and False during finetuning.
-      use_tpu: bool, whether to use TPU.
+      use_one_hot: bool, whether to use one hot encodings. This should be used
+        when TPUs are used.
       use_proj: bool, whether to add a projection layer before LM prediction.
       **kwargs: Other parameters.
     """
     super(LMLossLayer, self).__init__(**kwargs)
-    self.n_token = n_token
-    self.d_model = d_model
+    self.vocab_size = vocab_size
+    self.hidden_size = hidden_size
     self.initializer = initializer

     self.tie_weight = tie_weight
     self.bi_data = bi_data
-    self.use_tpu = use_tpu
+    self.use_one_hot = use_one_hot
     self.use_proj = use_proj

   def build(self, unused_input_shapes):
     """Implements build() for the layer."""
     if self.use_proj:
       self.proj_layer = tf.keras.layers.Dense(
-          units=self.d_model,
+          units=self.hidden_size,
           kernel_initializer=self.initializer,
           activation=gelu,
-          name='lm_projection/dense')
+          name="lm_projection/dense")
       self.proj_layer_norm = tf.keras.layers.LayerNormalization(
-          axis=-1, epsilon=1e-12, name='lm_projection/LayerNorm')
+          axis=-1, epsilon=1e-12, name="lm_projection/LayerNorm")
     if not self.tie_weight:
       self.softmax_w = self.add_weight(
-          'weight',
-          shape=[self.n_token, self.d_model],
+          "weight",
+          shape=[self.vocab_size, self.hidden_size],
          initializer=self.initializer)

       self.softmax_b = self.add_weight(
-          'bias', shape=[self.n_token], initializer=tf.zeros_initializer())
+          "bias", shape=[self.vocab_size], initializer=tf.zeros_initializer())

     super(LMLossLayer, self).build(unused_input_shapes)
@@ -950,12 +948,12 @@ class LMLossLayer(tf.keras.layers.Layer):
     if self.use_proj:
       hidden = self.proj_layer_norm(self.proj_layer(hidden))
     if self.tie_weight:
-      logits = tf.einsum('ibd,nd->ibn', hidden, lookup_table) + self.softmax_b
+      logits = tf.einsum("ibd,nd->ibn", hidden, lookup_table) + self.softmax_b
     else:
-      logits = tf.einsum('ibd,nd->ibn', hidden, self.softmax_w) + self.softmax_b
+      logits = tf.einsum("ibd,nd->ibn", hidden, self.softmax_w) + self.softmax_b

-    if self.use_tpu:
-      one_hot_target = tf.one_hot(target, self.n_token, dtype=logits.dtype)
+    if self.use_one_hot:
+      one_hot_target = tf.one_hot(target, self.vocab_size, dtype=logits.dtype)
       loss = -tf.reduce_sum(tf.nn.log_softmax(logits) * one_hot_target, -1)
     else:
       loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
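Note: the renamed use_one_hot flag only picks between two numerically equivalent formulations of the token cross entropy; the explicit one-hot product is the TPU-friendly path. A small check of the equivalence:

import tensorflow as tf

vocab_size = 5
logits = tf.random.normal([3, vocab_size])
target = tf.constant([1, 4, 2])

one_hot_target = tf.one_hot(target, vocab_size, dtype=logits.dtype)
loss_one_hot = -tf.reduce_sum(tf.nn.log_softmax(logits) * one_hot_target, -1)
loss_sparse = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=target, logits=logits)
print(float(tf.reduce_max(tf.abs(loss_one_hot - loss_sparse))))  # ~0.0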
@@ -970,36 +968,36 @@ class Summarization(tf.keras.layers.Layer):
   """The layer to pool the output from XLNet model into a vector."""

   def __init__(self,
-               d_model,
-               n_head,
-               d_head,
-               dropout,
-               dropout_att,
+               hidden_size,
+               num_attention_heads,
+               head_size,
+               dropout_rate,
+               attention_dropout_rate,
                initializer,
                use_proj=True,
-               summary_type='last',
+               summary_type="last",
                **kwargs):
     """Constructs Summarization layer.

     Args:
-      d_model: int, the dimension of model hidden state.
-      n_head: int, the number of attention heads.
-      d_head: int, the dimension size of each attention head.
-      dropout: float, dropout rate.
-      dropout_att: float, dropout rate on attention probabilities.
+      hidden_size: int, the dimension of model hidden state.
+      num_attention_heads: int, the number of attention heads.
+      head_size: int, the dimension size of each attention head.
+      dropout_rate: float, dropout rate.
+      attention_dropout_rate: float, dropout rate on attention probabilities.
       initializer: Initializer used for parameters.
       use_proj: bool, whether to use projection layer for summarization.
       summary_type: Method used to summarize a sequence into a compact vector.
       **kwargs: Other parameters.
     """
     super(Summarization, self).__init__(**kwargs)
-    self.d_model = d_model
-    self.n_head = n_head
-    self.d_head = d_head
-    self.dropout = dropout
-    self.dropout_att = dropout_att
+    self.hidden_size = hidden_size
+    self.num_attention_heads = num_attention_heads
+    self.head_size = head_size
+    self.dropout_rate = dropout_rate
+    self.attention_dropout_rate = attention_dropout_rate
     self.initializer = initializer

     self.use_proj = use_proj
     self.summary_type = summary_type
@@ -1007,22 +1005,22 @@ class Summarization(tf.keras.layers.Layer):
     """Implements build() for the layer."""
     if self.use_proj:
       self.proj_layer = tf.keras.layers.Dense(
-          units=self.d_model,
+          units=self.hidden_size,
           kernel_initializer=self.initializer,
           activation=tf.nn.tanh,
-          name='summary')
-    self.dropout_layer = tf.keras.layers.Dropout(rate=self.dropout)
+          name="summary")
+    self.dropout_layer = tf.keras.layers.Dropout(rate=self.dropout_rate)

     super(Summarization, self).build(unused_input_shapes)

   def call(self, inputs):
     """Implements call() for the layer."""
-    if self.summary_type == 'last':
-      summary = inputs[-1]
-    elif self.summary_type == 'first':
-      summary = inputs[0]
+    if self.summary_type == "last":
+      summary = inputs[:, -1, :]
+    elif self.summary_type == "first":
+      summary = inputs[:, 0, :]
     else:
-      raise ValueError('Invalid summary type provided: %s' % self.summary_type)
+      raise ValueError("Invalid summary type provided: %s" % self.summary_type)
     if self.use_proj:
       summary = self.proj_layer(summary)
     summary = self.dropout_layer(summary)
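Note: because the pooled activations are now batch-major, the "last"/"first" summary is taken by slicing the time axis with [:, -1, :] and [:, 0, :] rather than the old time-major inputs[-1] and inputs[0]. A toy sketch:

import tensorflow as tf

# [batch, seq_len, hidden] activations, batch-major as in the new code path.
inputs = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])

summary_last = inputs[:, -1, :]    # new "last" pooling  -> shape [2, 4]
summary_first = inputs[:, 0, :]    # new "first" pooling -> shape [2, 4]
print(summary_last.shape, summary_first.shape)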
@@ -1048,7 +1046,7 @@ class ClassificationLossLayer(tf.keras.layers.Layer):
   def build(self, unused_input_shapes):
     """Implements build() for the layer."""
     self.proj_layer = tf.keras.layers.Dense(
-        units=self.n_class, kernel_initializer=self.initializer, name='logit')
+        units=self.n_class, kernel_initializer=self.initializer, name="logit")

     super(ClassificationLossLayer, self).build(unused_input_shapes)
@@ -1076,110 +1074,110 @@ class QAXLNetModel(tf.keras.Model):
     self.initializer = _get_initializer(run_config)
     self.xlnet_config = copy.deepcopy(xlnet_config)

-    self.transformerxl_model = TransformerXLModel(
-        n_token=self.xlnet_config.n_token,
+    self.xlnet_model = networks.XLNetBase(
+        vocab_size=self.xlnet_config.n_token,
         initializer=self.initializer,
-        attn_type='bi',
-        n_layer=self.xlnet_config.n_layer,
-        d_model=self.xlnet_config.d_model,
-        n_head=self.xlnet_config.n_head,
-        d_head=self.xlnet_config.d_head,
-        d_inner=self.xlnet_config.d_inner,
-        ff_activation=self.xlnet_config.ff_activation,
-        untie_r=self.xlnet_config.untie_r,
-        is_training=self.run_config.is_training,
-        use_tpu=self.run_config.use_tpu,
-        dropout=self.run_config.dropout,
-        dropout_att=self.run_config.dropout_att,
-        mem_len=self.run_config.mem_len,
-        reuse_len=self.run_config.reuse_len,
+        attention_type="bi",
+        num_layers=self.xlnet_config.n_layer,
+        hidden_size=self.xlnet_config.d_model,
+        num_attention_heads=self.xlnet_config.n_head,
+        head_size=self.xlnet_config.d_head,
+        inner_size=self.xlnet_config.d_inner,
+        tie_attention_biases=not self.xlnet_config.untie_r,
+        inner_activation=self.xlnet_config.ff_activation,
+        dropout_rate=self.run_config.dropout,
+        attention_dropout_rate=self.run_config.dropout_att,
+        two_stream=False,
+        memory_length=self.run_config.mem_len,
+        reuse_length=self.run_config.reuse_len,
         bi_data=self.run_config.bi_data,
-        clamp_len=self.run_config.clamp_len,
+        clamp_length=self.run_config.clamp_len,
         same_length=self.run_config.same_length,
-        name='transformer')
+        use_cls_mask=False,
+        name="xlnet_model")

     self.qa_loss_layer = QALossLayer(
-        d_model=self.xlnet_config.d_model,
+        hidden_size=self.xlnet_config.d_model,
         start_n_top=start_n_top,
         end_n_top=end_n_top,
         initializer=self.initializer,
-        dropout=self.run_config.dropout)
+        dropout_rate=self.run_config.dropout,
+        name="qa_loss_layer")

   def call(self, features, training=False):
     """Implements call() for the layer."""
-    input_ids = tf.transpose(features['input_ids'], [1, 0])
-    seg_ids = tf.transpose(features['segment_ids'], [1, 0])
-    input_mask = tf.transpose(features['input_mask'], [1, 0])
+    input_ids = features["input_ids"]
+    segment_ids = features["segment_ids"]
+    input_mask = features["input_mask"]

-    cls_index = tf.reshape(features['cls_index'], [-1])
-    p_mask = features['p_mask']
+    cls_index = tf.reshape(features["cls_index"], [-1])
+    p_mask = features["p_mask"]

-    transformerxl_output, new_mems, self.lookup_table = (
-        self.transformerxl_model(input_ids, seg_ids, input_mask))
+    attention_output, new_mems = (
+        self.xlnet_model(input_ids, segment_ids, input_mask))

     if training:
       loss, logits = self.qa_loss_layer(
-          hidden=transformerxl_output,
+          hidden=attention_output,
           p_mask=p_mask,
           cls_index=cls_index,
-          start_positions=features['start_positions'],
-          end_positions=features['end_positions'],
-          is_impossible=features['is_impossible'])
+          start_positions=features["start_positions"],
+          end_positions=features["end_positions"],
+          is_impossible=features["is_impossible"])
       self.add_loss(loss)
       return new_mems, logits
     else:
       results = self.qa_loss_layer(
-          hidden=transformerxl_output, p_mask=p_mask, cls_index=cls_index)
+          hidden=attention_output, p_mask=p_mask, cls_index=cls_index)
       return results


 class QALossLayer(tf.keras.layers.Layer):
   """Layer computing position and regression loss for question answering task."""

-  def __init__(self, d_model, start_n_top, end_n_top, initializer, dropout,
-               **kwargs):
+  def __init__(self, hidden_size, start_n_top, end_n_top, initializer,
+               dropout_rate, **kwargs):
     """Constructs Summarization layer.

     Args:
-      d_model: Int, the hidden size.
+      hidden_size: Int, the hidden size.
       start_n_top: Beam size for span start.
       end_n_top: Beam size for span end.
       initializer: Initializer used for parameters.
-      dropout: float, dropout rate.
+      dropout_rate: float, dropout rate.
       **kwargs: Other parameters.
     """
     super(QALossLayer, self).__init__(**kwargs)
-    self.d_model = d_model
+    self.hidden_size = hidden_size
     self.start_n_top = start_n_top
     self.end_n_top = end_n_top
     self.initializer = initializer
-    self.dropout = dropout
+    self.dropout_rate = dropout_rate

   def build(self, unused_input_shapes):
     """Implements build() for the layer."""
     self.start_logits_proj_layer = tf.keras.layers.Dense(
-        units=1, kernel_initializer=self.initializer, name='start_logits/dense')
+        units=1, kernel_initializer=self.initializer, name="start_logits/dense")
     self.end_logits_proj_layer0 = tf.keras.layers.Dense(
-        units=self.d_model,
+        units=self.hidden_size,
         kernel_initializer=self.initializer,
         activation=tf.nn.tanh,
-        name='end_logits/dense_0')
+        name="end_logits/dense_0")
     self.end_logits_proj_layer1 = tf.keras.layers.Dense(
-        units=1, kernel_initializer=self.initializer, name='end_logits/dense_1')
+        units=1, kernel_initializer=self.initializer, name="end_logits/dense_1")
     self.end_logits_layer_norm = tf.keras.layers.LayerNormalization(
-        axis=-1, epsilon=1e-12, name='end_logits/LayerNorm')
+        axis=-1, epsilon=1e-12, name="end_logits/LayerNorm")
     self.answer_class_proj_layer0 = tf.keras.layers.Dense(
-        units=self.d_model,
+        units=self.hidden_size,
         kernel_initializer=self.initializer,
         activation=tf.nn.tanh,
-        name='answer_class/dense_0')
+        name="answer_class/dense_0")
     self.answer_class_proj_layer1 = tf.keras.layers.Dense(
         units=1,
         kernel_initializer=self.initializer,
         use_bias=False,
-        name='answer_class/dense_1')
-    self.ans_feature_dropout = tf.keras.layers.Dropout(rate=self.dropout)
+        name="answer_class/dense_1")
+    self.ans_feature_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)

     super(QALossLayer, self).build(unused_input_shapes)

   def __call__(self, hidden, p_mask, cls_index, **kwargs):
@@ -1190,20 +1188,21 @@ class QALossLayer(tf.keras.layers.Layer):
     """Implements call() for the layer."""
     hidden, p_mask, cls_index, kwargs = inputs
     return_dict = {}
-    seq_len = tf.shape(hidden)[0]
+    seq_len = tf.shape(hidden)[1]
+    hidden = tf.transpose(hidden, [1, 0, 2])

     start_logits = self.start_logits_proj_layer(hidden)
     start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
     start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
     start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)
     if training:
-      start_positions = kwargs['start_positions']
-      end_positions = kwargs['end_positions']
-      is_impossible = kwargs['is_impossible']
+      start_positions = kwargs["start_positions"]
+      end_positions = kwargs["end_positions"]
+      is_impossible = kwargs["is_impossible"]
       start_positions = tf.reshape(start_positions, [-1])
       start_index = tf.one_hot(
           start_positions, depth=seq_len, axis=-1, dtype=tf.float32)
-      start_features = tf.einsum('lbh,bl->bh', hidden, start_index)
+      start_features = tf.einsum("lbh,bl->bh", hidden, start_index)
       start_features = tf.tile(start_features[None], [seq_len, 1, 1])
       end_logits = self.end_logits_proj_layer0(
           tf.concat([hidden, start_features], axis=-1))
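Note: the QA loss layer now receives batch-major hidden states, so it reads seq_len from axis 1 and immediately transposes back to the time-major [seq_len, batch, hidden] layout that the remaining "lbh,..." einsums expect. A shape sketch with toy sizes:

import tensorflow as tf

batch, seq_len, hidden_size = 2, 7, 4
hidden = tf.random.normal([batch, seq_len, hidden_size])   # batch-major input

seq_len_t = tf.shape(hidden)[1]                # new: length read from axis 1
hidden_lbh = tf.transpose(hidden, [1, 0, 2])   # added transpose -> [seq_len, batch, hidden]

cls_index = tf.one_hot([0, 3], seq_len_t, axis=-1, dtype=tf.float32)  # [batch, seq_len]
cls_feature = tf.einsum("lbh,bl->bh", hidden_lbh, cls_index)
print(cls_feature.shape)  # (2, 4)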
@@ -1221,16 +1220,16 @@ class QALossLayer(tf.keras.layers.Layer):
           start_log_probs, k=self.start_n_top)
       start_index = tf.one_hot(
           start_top_index, depth=seq_len, axis=-1, dtype=tf.float32)
-      start_features = tf.einsum('lbh,bkl->bkh', hidden, start_index)
+      start_features = tf.einsum("lbh,bkl->bkh", hidden, start_index)
       end_input = tf.tile(hidden[:, :, None], [1, 1, self.start_n_top, 1])
       start_features = tf.tile(start_features[None], [seq_len, 1, 1, 1])
       end_input = tf.concat([end_input, start_features], axis=-1)
       end_logits = self.end_logits_proj_layer0(end_input)
-      end_logits = tf.reshape(end_logits, [seq_len, -1, self.d_model])
+      end_logits = tf.reshape(end_logits, [seq_len, -1, self.hidden_size])
       end_logits = self.end_logits_layer_norm(end_logits)
       end_logits = tf.reshape(end_logits,
-                              [seq_len, -1, self.start_n_top, self.d_model])
+                              [seq_len, -1, self.start_n_top, self.hidden_size])
       end_logits = self.end_logits_proj_layer1(end_logits)
       end_logits = tf.reshape(end_logits, [seq_len, -1, self.start_n_top])
@@ -1246,29 +1245,29 @@ class QALossLayer(tf.keras.layers.Layer):
               [-1, self.start_n_top * self.end_n_top])

     if training:
-      return_dict['start_log_probs'] = start_log_probs
-      return_dict['end_log_probs'] = end_log_probs
+      return_dict["start_log_probs"] = start_log_probs
+      return_dict["end_log_probs"] = end_log_probs
     else:
-      return_dict['start_top_log_probs'] = start_top_log_probs
-      return_dict['start_top_index'] = start_top_index
-      return_dict['end_top_log_probs'] = end_top_log_probs
-      return_dict['end_top_index'] = end_top_index
+      return_dict["start_top_log_probs"] = start_top_log_probs
+      return_dict["start_top_index"] = start_top_index
+      return_dict["end_top_log_probs"] = end_top_log_probs
+      return_dict["end_top_index"] = end_top_index

     # an additional layer to predict answerability
     # get the representation of CLS
     cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
-    cls_feature = tf.einsum('lbh,bl->bh', hidden, cls_index)
+    cls_feature = tf.einsum("lbh,bl->bh", hidden, cls_index)

     # get the representation of START
-    start_p = tf.nn.softmax(start_logits_masked, axis=-1, name='softmax_start')
-    start_feature = tf.einsum('lbh,bl->bh', hidden, start_p)
+    start_p = tf.nn.softmax(start_logits_masked, axis=-1, name="softmax_start")
+    start_feature = tf.einsum("lbh,bl->bh", hidden, start_p)

     ans_feature = tf.concat([start_feature, cls_feature], -1)
     ans_feature = self.answer_class_proj_layer0(ans_feature)
     ans_feature = self.ans_feature_dropout(ans_feature)
     cls_logits = self.answer_class_proj_layer1(ans_feature)
     cls_logits = tf.squeeze(cls_logits, -1)
-    return_dict['cls_logits'] = cls_logits
+    return_dict["cls_logits"] = cls_logits

     if not training:
       return return_dict
official/nlp/xlnet/xlnet_modeling_test.py (deleted, 100644 → 0)

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import logging
import numpy as np
import tensorflow as tf

from official.nlp.xlnet import xlnet_modeling


class PositionalEmbeddingLayerTest(tf.test.TestCase):

  def test_positional_embedding(self):
    """A low-dimensional example is tested.

    With len(pos_seq)=2 and d_model=4:
      pos_seq  = [[1.], [0.]]
      inv_freq = [1., 0.01]
      pos_seq x inv_freq = [[1, 0.01], [0., 0.]]
      pos_emb = [[sin(1.), sin(0.01), cos(1.), cos(0.01)],
                 [sin(0.), sin(0.), cos(0.), cos(0.)]]
              = [[0.84147096, 0.00999983, 0.54030228, 0.99994999],
                 [0., 0., 1., 1.]]
    """
    target = np.array([[[0.84147096, 0.00999983, 0.54030228, 0.99994999]],
                       [[0., 0., 1., 1.]]])
    d_model = 4
    pos_seq = tf.range(1, -1, -1.0)  # [1., 0.]
    pos_emb_layer = xlnet_modeling.RelativePositionEncoding(d_model)
    pos_emb = pos_emb_layer(pos_seq, batch_size=None).numpy().astype(float)
    logging.info(pos_emb)
    self.assertAllClose(pos_emb, target)


if __name__ == "__main__":
  tf.test.main()