Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
16b63617
Commit
16b63617
authored
Sep 10, 2019
by
thomwolf
Browse files
xlnet passing first test
parent
32aabe8c
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
66 additions
and
62 deletions
+66
-62
pytorch_transformers/modeling_tf_bert.py
pytorch_transformers/modeling_tf_bert.py
+9
-16
pytorch_transformers/modeling_tf_gpt2.py
pytorch_transformers/modeling_tf_gpt2.py
+4
-8
pytorch_transformers/modeling_tf_xlnet.py
pytorch_transformers/modeling_tf_xlnet.py
+47
-32
pytorch_transformers/tests/modeling_tf_xlnet_test.py
pytorch_transformers/tests/modeling_tf_xlnet_test.py
+6
-6
No files found.
pytorch_transformers/modeling_tf_bert.py
View file @
16b63617
...
@@ -218,8 +218,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
...
@@ -218,8 +218,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
embeddings
=
words_embeddings
+
position_embeddings
+
token_type_embeddings
embeddings
=
words_embeddings
+
position_embeddings
+
token_type_embeddings
embeddings
=
self
.
LayerNorm
(
embeddings
)
embeddings
=
self
.
LayerNorm
(
embeddings
)
if
training
:
embeddings
=
self
.
dropout
(
embeddings
,
training
=
training
)
embeddings
=
self
.
dropout
(
embeddings
)
return
embeddings
return
embeddings
def
_linear
(
self
,
inputs
):
def
_linear
(
self
,
inputs
):
...
@@ -286,10 +285,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
...
@@ -286,10 +285,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
# Normalize the attention scores to probabilities.
# Normalize the attention scores to probabilities.
attention_probs
=
tf
.
nn
.
softmax
(
attention_scores
,
axis
=-
1
)
attention_probs
=
tf
.
nn
.
softmax
(
attention_scores
,
axis
=-
1
)
if
training
:
# This is actually dropping out entire tokens to attend to, which might
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs
=
self
.
dropout
(
attention_probs
)
attention_probs
=
self
.
dropout
(
attention_probs
,
training
=
training
)
# Mask heads if we want to
# Mask heads if we want to
if
head_mask
is
not
None
:
if
head_mask
is
not
None
:
...
@@ -316,8 +314,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
...
@@ -316,8 +314,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
hidden_states
,
input_tensor
=
inputs
hidden_states
,
input_tensor
=
inputs
hidden_states
=
self
.
dense
(
hidden_states
)
hidden_states
=
self
.
dense
(
hidden_states
)
if
training
:
hidden_states
=
self
.
dropout
(
hidden_states
,
training
=
training
)
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
return
hidden_states
return
hidden_states
...
@@ -366,8 +363,7 @@ class TFBertOutput(tf.keras.layers.Layer):
...
@@ -366,8 +363,7 @@ class TFBertOutput(tf.keras.layers.Layer):
hidden_states
,
input_tensor
=
inputs
hidden_states
,
input_tensor
=
inputs
hidden_states
=
self
.
dense
(
hidden_states
)
hidden_states
=
self
.
dense
(
hidden_states
)
if
training
:
hidden_states
=
self
.
dropout
(
hidden_states
,
training
=
training
)
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
return
hidden_states
return
hidden_states
...
@@ -871,8 +867,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
...
@@ -871,8 +867,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
pooled_output
=
outputs
[
1
]
pooled_output
=
outputs
[
1
]
if
training
:
pooled_output
=
self
.
dropout
(
pooled_output
,
training
=
training
)
pooled_output
=
self
.
dropout
(
pooled_output
)
logits
=
self
.
classifier
(
pooled_output
)
logits
=
self
.
classifier
(
pooled_output
)
outputs
=
(
logits
,)
+
outputs
[
2
:]
# add hidden states and attention if they are here
outputs
=
(
logits
,)
+
outputs
[
2
:]
# add hidden states and attention if they are here
...
@@ -947,8 +942,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
...
@@ -947,8 +942,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
pooled_output
=
outputs
[
1
]
pooled_output
=
outputs
[
1
]
if
training
:
pooled_output
=
self
.
dropout
(
pooled_output
,
training
=
training
)
pooled_output
=
self
.
dropout
(
pooled_output
)
logits
=
self
.
classifier
(
pooled_output
)
logits
=
self
.
classifier
(
pooled_output
)
reshaped_logits
=
tf
.
reshape
(
logits
,
(
-
1
,
num_choices
))
reshaped_logits
=
tf
.
reshape
(
logits
,
(
-
1
,
num_choices
))
...
@@ -995,8 +989,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
...
@@ -995,8 +989,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
sequence_output
=
outputs
[
0
]
sequence_output
=
outputs
[
0
]
if
training
:
sequence_output
=
self
.
dropout
(
sequence_output
,
training
=
training
)
sequence_output
=
self
.
dropout
(
sequence_output
)
logits
=
self
.
classifier
(
sequence_output
)
logits
=
self
.
classifier
(
sequence_output
)
outputs
=
(
logits
,)
+
outputs
[
2
:]
# add hidden states and attention if they are here
outputs
=
(
logits
,)
+
outputs
[
2
:]
# add hidden states and attention if they are here
...
...
pytorch_transformers/modeling_tf_gpt2.py
View file @
16b63617
...
@@ -178,8 +178,7 @@ class TFAttention(tf.keras.layers.Layer):
...
@@ -178,8 +178,7 @@ class TFAttention(tf.keras.layers.Layer):
w
=
w
+
attention_mask
w
=
w
+
attention_mask
w
=
tf
.
nn
.
softmax
(
w
,
axis
=-
1
)
w
=
tf
.
nn
.
softmax
(
w
,
axis
=-
1
)
if
training
:
w
=
self
.
attn_dropout
(
w
,
training
=
training
)
w
=
self
.
attn_dropout
(
w
)
# Mask heads if we want to
# Mask heads if we want to
if
head_mask
is
not
None
:
if
head_mask
is
not
None
:
...
@@ -221,8 +220,7 @@ class TFAttention(tf.keras.layers.Layer):
...
@@ -221,8 +220,7 @@ class TFAttention(tf.keras.layers.Layer):
a
=
self
.
merge_heads
(
a
)
a
=
self
.
merge_heads
(
a
)
a
=
self
.
c_proj
(
a
)
a
=
self
.
c_proj
(
a
)
if
training
:
a
=
self
.
resid_dropout
(
a
,
training
=
training
)
a
=
self
.
resid_dropout
(
a
)
outputs
=
[
a
,
present
]
+
attn_outputs
[
1
:]
outputs
=
[
a
,
present
]
+
attn_outputs
[
1
:]
return
outputs
# a, present, (attentions)
return
outputs
# a, present, (attentions)
...
@@ -240,8 +238,7 @@ class TFMLP(tf.keras.layers.Layer):
...
@@ -240,8 +238,7 @@ class TFMLP(tf.keras.layers.Layer):
def
call
(
self
,
x
,
training
=
False
):
def
call
(
self
,
x
,
training
=
False
):
h
=
self
.
act
(
self
.
c_fc
(
x
))
h
=
self
.
act
(
self
.
c_fc
(
x
))
h2
=
self
.
c_proj
(
h
)
h2
=
self
.
c_proj
(
h
)
if
training
:
h2
=
self
.
dropout
(
h2
,
training
=
training
)
h2
=
self
.
dropout
(
h2
)
return
h2
return
h2
...
@@ -368,8 +365,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
...
@@ -368,8 +365,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
else
:
else
:
token_type_embeds
=
0
token_type_embeds
=
0
hidden_states
=
inputs_embeds
+
position_embeds
+
token_type_embeds
hidden_states
=
inputs_embeds
+
position_embeds
+
token_type_embeds
if
training
:
hidden_states
=
self
.
drop
(
hidden_states
,
training
=
training
)
hidden_states
=
self
.
drop
(
hidden_states
)
output_shape
=
input_shape
+
[
shape_list
(
hidden_states
)[
-
1
]]
output_shape
=
input_shape
+
[
shape_list
(
hidden_states
)[
-
1
]]
...
...
pytorch_transformers/modeling_tf_xlnet.py
View file @
16b63617
...
@@ -145,7 +145,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -145,7 +145,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm'
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm'
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
def
build
(
input_shape
):
def
build
(
self
,
input_shape
):
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
self
.
q
=
self
.
add_weight
(
shape
=
(
self
.
d_model
,
self
.
n_head
,
self
.
d_head
),
self
.
q
=
self
.
add_weight
(
shape
=
(
self
.
d_model
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
initializer
=
initializer
,
...
@@ -221,10 +221,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -221,10 +221,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_score
=
attn_score
-
1e30
*
attn_mask
attn_score
=
attn_score
-
1e30
*
attn_mask
# attention probability
# attention probability
attn_prob
=
tf
.
softmax
(
attn_score
,
axis
=
1
)
attn_prob
=
tf
.
nn
.
softmax
(
attn_score
,
axis
=
1
)
if
training
:
attn_prob
=
self
.
dropout
(
attn_prob
,
training
=
training
)
attn_prob
=
self
.
dropout
(
attn_prob
)
# Mask heads if we want to
# Mask heads if we want to
if
head_mask
is
not
None
:
if
head_mask
is
not
None
:
...
@@ -245,10 +244,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -245,10 +244,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_out
=
tf
.
einsum
(
'ibnd,hnd->ibh'
,
attn_vec
,
self
.
o
)
attn_out
=
tf
.
einsum
(
'ibnd,hnd->ibh'
,
attn_vec
,
self
.
o
)
if
training
:
attn_out
=
self
.
dropout
(
attn_out
,
training
=
training
)
attn_out
=
self
.
dropout
(
attn_out
)
if
residual
:
if
residual
is
not
None
:
attn_out
=
attn_out
+
h
attn_out
=
attn_out
+
h
output
=
self
.
layer_norm
(
attn_out
)
output
=
self
.
layer_norm
(
attn_out
)
...
@@ -288,7 +286,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -288,7 +286,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_vec_h
,
attn_prob_h
=
attn_vec_h
attn_vec_h
,
attn_prob_h
=
attn_vec_h
# post processing
# post processing
output_h
=
self
.
post_attention
([
h
,
attn_vec_h
],
training
=
training
)
output_h
=
self
.
post_attention
([
h
,
attn_vec_h
,
None
],
training
=
training
)
##### g-stream
##### g-stream
# query-stream query head
# query-stream query head
...
@@ -314,7 +312,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -314,7 +312,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_vec_g
,
attn_prob_g
=
attn_vec_g
attn_vec_g
,
attn_prob_g
=
attn_vec_g
# post processing
# post processing
output_g
=
self
.
post_attention
([
g
,
attn_vec_g
],
training
=
training
)
output_g
=
self
.
post_attention
([
g
,
attn_vec_g
,
None
],
training
=
training
)
if
self
.
output_attentions
:
if
self
.
output_attentions
:
attn_prob
=
attn_prob_h
,
attn_prob_g
attn_prob
=
attn_prob_h
,
attn_prob_g
...
@@ -343,7 +341,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -343,7 +341,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_vec
,
attn_prob
=
attn_vec
attn_vec
,
attn_prob
=
attn_vec
# post processing
# post processing
output_h
=
self
.
post_attention
([
h
,
attn_vec
],
training
=
training
)
output_h
=
self
.
post_attention
([
h
,
attn_vec
,
None
],
training
=
training
)
output_g
=
None
output_g
=
None
outputs
=
(
output_h
,
output_g
)
outputs
=
(
output_h
,
output_g
)
...
@@ -368,11 +366,9 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
...
@@ -368,11 +366,9 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
output
=
inp
output
=
inp
output
=
self
.
layer_1
(
output
)
output
=
self
.
layer_1
(
output
)
output
=
self
.
activation_function
(
output
)
output
=
self
.
activation_function
(
output
)
if
training
:
output
=
self
.
dropout
(
output
,
training
=
training
)
output
=
self
.
dropout
(
output
)
output
=
self
.
layer_2
(
output
)
output
=
self
.
layer_2
(
output
)
if
training
:
output
=
self
.
dropout
(
output
,
training
=
training
)
output
=
self
.
dropout
(
output
)
output
=
self
.
layer_norm
(
output
+
inp
)
output
=
self
.
layer_norm
(
output
+
inp
)
return
output
return
output
...
@@ -413,12 +409,12 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
...
@@ -413,12 +409,12 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
self
.
initializer_range
=
config
.
initializer_range
self
.
initializer_range
=
config
.
initializer_range
self
.
word_embedding
=
TFSharedEmbeddings
(
config
.
n_token
,
config
.
d_model
,
initializer_range
=
config
.
initializer_range
,
name
=
'word_embedding'
)
self
.
word_embedding
=
TFSharedEmbeddings
(
config
.
n_token
,
config
.
d_model
,
initializer_range
=
config
.
initializer_range
,
name
=
'word_embedding'
)
self
.
layer
=
[
XLNetLayer
(
config
,
name
=
'layer_{}'
.
format
(
i
))
for
i
in
range
(
config
.
n_layer
)]
self
.
layer
=
[
TF
XLNetLayer
(
config
,
name
=
'layer_{}'
.
format
(
i
))
for
i
in
range
(
config
.
n_layer
)]
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
def
build
(
input_shape
):
def
build
(
self
,
input_shape
):
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
self
.
mask_emb
=
self
.
add_weight
(
shape
=
(
1
,
1
,
config
.
d_model
),
self
.
mask_emb
=
self
.
add_weight
(
shape
=
(
1
,
1
,
self
.
d_model
),
initializer
=
initializer
,
initializer
=
initializer
,
trainable
=
True
,
name
=
'mask_emb'
)
trainable
=
True
,
name
=
'mask_emb'
)
...
@@ -532,16 +528,39 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
...
@@ -532,16 +528,39 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
return
pos_emb
return
pos_emb
def
call
(
self
,
inputs
,
training
=
False
):
def
call
(
self
,
inputs
,
training
=
False
):
(
input_ids
,
attention_mask
,
mems
,
perm_mask
,
target_mapping
,
if
not
isinstance
(
inputs
,
(
dict
,
tuple
,
list
)):
token_type_ids
,
input_mask
,
head_mask
)
=
inputs
input_ids
=
inputs
(
attention_mask
,
mems
,
perm_mask
,
target_mapping
,
token_type_ids
,
input_mask
,
head_mask
)
=
None
,
None
,
None
,
None
,
None
,
None
,
None
elif
isinstance
(
inputs
,
(
tuple
,
list
)):
input_ids
=
inputs
[
0
]
attention_mask
=
inputs
[
1
]
if
len
(
inputs
)
>
1
else
None
mems
=
inputs
[
2
]
if
len
(
inputs
)
>
2
else
None
perm_mask
=
inputs
[
3
]
if
len
(
inputs
)
>
3
else
None
target_mapping
=
inputs
[
4
]
if
len
(
inputs
)
>
4
else
None
token_type_ids
=
inputs
[
5
]
if
len
(
inputs
)
>
5
else
None
input_mask
=
inputs
[
6
]
if
len
(
inputs
)
>
6
else
None
head_mask
=
inputs
[
7
]
if
len
(
inputs
)
>
7
else
None
assert
len
(
inputs
)
<=
8
,
"Too many inputs."
else
:
input_ids
=
inputs
.
get
(
'input_ids'
)
attention_mask
=
inputs
.
get
(
'attention_mask'
,
None
)
mems
=
inputs
.
get
(
'mems'
,
None
)
perm_mask
=
inputs
.
get
(
'perm_mask'
,
None
)
target_mapping
=
inputs
.
get
(
'target_mapping'
,
None
)
token_type_ids
=
inputs
.
get
(
'token_type_ids'
,
None
)
input_mask
=
inputs
.
get
(
'input_mask'
,
None
)
head_mask
=
inputs
.
get
(
'head_mask'
,
None
)
assert
len
(
inputs
)
<=
8
,
"Too many inputs."
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
# but we want a unified interface in the library with the batch size on the first dimension
# so we move here the first dimension (batch) to the end
# so we move here the first dimension (batch) to the end
input_ids
=
tf
.
transpose
(
input_ids
,
perm
=
(
0
,
1
))
input_ids
=
tf
.
transpose
(
input_ids
,
perm
=
(
1
,
0
))
token_type_ids
=
tf
.
transpose
(
token_type_ids
,
perm
=
(
0
,
1
))
if
token_type_ids
is
not
None
else
None
token_type_ids
=
tf
.
transpose
(
token_type_ids
,
perm
=
(
1
,
0
))
if
token_type_ids
is
not
None
else
None
input_mask
=
tf
.
transpose
(
input_mask
,
perm
=
(
0
,
1
))
if
input_mask
is
not
None
else
None
input_mask
=
tf
.
transpose
(
input_mask
,
perm
=
(
1
,
0
))
if
input_mask
is
not
None
else
None
attention_mask
=
tf
.
transpose
(
attention_mask
,
perm
=
(
0
,
1
))
if
attention_mask
is
not
None
else
None
attention_mask
=
tf
.
transpose
(
attention_mask
,
perm
=
(
1
,
0
))
if
attention_mask
is
not
None
else
None
perm_mask
=
tf
.
transpose
(
perm_mask
,
perm
=
(
1
,
2
,
0
))
if
perm_mask
is
not
None
else
None
perm_mask
=
tf
.
transpose
(
perm_mask
,
perm
=
(
1
,
2
,
0
))
if
perm_mask
is
not
None
else
None
target_mapping
=
tf
.
transpose
(
target_mapping
,
perm
=
(
1
,
2
,
0
))
if
target_mapping
is
not
None
else
None
target_mapping
=
tf
.
transpose
(
target_mapping
,
perm
=
(
1
,
2
,
0
))
if
target_mapping
is
not
None
else
None
...
@@ -597,15 +616,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
...
@@ -597,15 +616,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
##### Word embeddings and prepare h & g hidden states
##### Word embeddings and prepare h & g hidden states
word_emb_k
=
self
.
word_embedding
(
input_ids
)
word_emb_k
=
self
.
word_embedding
(
input_ids
)
if
training
:
output_h
=
self
.
dropout
(
word_emb_k
,
training
=
training
)
output_h
=
self
.
dropout
(
word_emb_k
)
if
target_mapping
is
not
None
:
if
target_mapping
is
not
None
:
word_emb_q
=
tf
.
tile
(
mask_emb
,
[
tf
.
shape
(
target_mapping
)[
0
],
bsz
,
1
])
word_emb_q
=
tf
.
tile
(
mask_emb
,
[
tf
.
shape
(
target_mapping
)[
0
],
bsz
,
1
])
# else: # We removed the inp_q input which was same as target mapping
# else: # We removed the inp_q input which was same as target mapping
# inp_q_ext = inp_q[:, :, None]
# inp_q_ext = inp_q[:, :, None]
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
if
training
:
output_g
=
self
.
dropout
(
word_emb_q
,
training
=
training
)
output_g
=
self
.
dropout
(
word_emb_q
)
else
:
else
:
output_g
=
None
output_g
=
None
...
@@ -625,8 +642,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
...
@@ -625,8 +642,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
##### Positional encoding
##### Positional encoding
pos_emb
=
self
.
relative_positional_encoding
(
qlen
,
klen
,
bsz
=
bsz
,
dtype
=
dtype_float
)
pos_emb
=
self
.
relative_positional_encoding
(
qlen
,
klen
,
bsz
=
bsz
,
dtype
=
dtype_float
)
if
training
:
pos_emb
=
self
.
dropout
(
pos_emb
,
training
=
training
)
pos_emb
=
self
.
dropout
(
pos_emb
)
# Prepare head mask if needed
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# 1.0 in head_mask indicate we keep the head
...
@@ -666,8 +682,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
...
@@ -666,8 +682,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
if
self
.
output_hidden_states
:
if
self
.
output_hidden_states
:
hidden_states
.
append
((
output_h
,
output_g
)
if
output_g
is
not
None
else
output_h
)
hidden_states
.
append
((
output_h
,
output_g
)
if
output_g
is
not
None
else
output_h
)
if
training
:
output
=
self
.
dropout
(
output_g
if
output_g
is
not
None
else
output_h
,
training
=
training
)
output
=
self
.
dropout
(
output_g
if
output_g
is
not
None
else
output_h
)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
outputs
=
(
tf
.
transpose
(
output
,
perm
=
(
1
,
0
,
2
)),
new_mems
)
outputs
=
(
tf
.
transpose
(
output
,
perm
=
(
1
,
0
,
2
)),
new_mems
)
...
@@ -805,7 +820,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
...
@@ -805,7 +820,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
"""
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
(
TFXLNetModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TF
Ber
tMainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TF
XLNe
tMainLayer
(
config
,
name
=
'transformer'
)
def
call
(
self
,
inputs
,
training
=
False
):
def
call
(
self
,
inputs
,
training
=
False
):
outputs
=
self
.
transformer
(
inputs
,
training
=
training
)
outputs
=
self
.
transformer
(
inputs
,
training
=
training
)
...
...
pytorch_transformers/tests/modeling_tf_xlnet_test.py
View file @
16b63617
...
@@ -105,8 +105,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
...
@@ -105,8 +105,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
perm_mask_last
=
tf
.
ones
((
self
.
batch_size
,
self
.
seq_length
+
1
,
1
),
dtype
=
tf
.
float32
)
perm_mask_last
=
tf
.
ones
((
self
.
batch_size
,
self
.
seq_length
+
1
,
1
),
dtype
=
tf
.
float32
)
perm_mask
=
tf
.
concat
([
perm_mask
,
perm_mask_last
],
axis
=-
1
)
perm_mask
=
tf
.
concat
([
perm_mask
,
perm_mask_last
],
axis
=-
1
)
# perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
# perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping
=
tf
.
zeros
((
self
.
batch_size
,
1
,
self
.
seq_length
),
dtype
=
t
orch
.
float32
)
target_mapping
=
tf
.
zeros
((
self
.
batch_size
,
1
,
self
.
seq_length
),
dtype
=
t
f
.
float32
)
target_mapping_last
=
tf
.
ones
((
self
.
batch_size
,
1
,
1
),
dtype
=
t
orch
.
float32
)
target_mapping_last
=
tf
.
ones
((
self
.
batch_size
,
1
,
1
),
dtype
=
t
f
.
float32
)
target_mapping
=
tf
.
concat
([
target_mapping
,
target_mapping_last
],
axis
=-
1
)
target_mapping
=
tf
.
concat
([
target_mapping
,
target_mapping_last
],
axis
=-
1
)
# target_mapping[:, 0, -1] = 1.0 # predict last token
# target_mapping[:, 0, -1] = 1.0 # predict last token
...
@@ -145,18 +145,18 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
...
@@ -145,18 +145,18 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
target_mapping
,
segment_ids
,
lm_labels
,
sequence_labels
,
is_impossible_labels
):
target_mapping
,
segment_ids
,
lm_labels
,
sequence_labels
,
is_impossible_labels
):
model
=
TFXLNetModel
(
config
)
model
=
TFXLNetModel
(
config
)
inputs
=
{
'input_ids'
:
input_ids
,
inputs
=
{
'input_ids'
:
input_ids
_1
,
'input_mask'
:
input_mask
,
'input_mask'
:
input_mask
,
'token_type_ids'
:
token_type
_ids
}
'token_type_ids'
:
segment
_ids
}
_
,
_
=
model
(
inputs
)
_
,
_
=
model
(
inputs
)
inputs
=
[
input_ids
,
input_mask
]
inputs
=
[
input_ids
_1
,
input_mask
]
outputs
,
mems_1
=
model
(
inputs
)
outputs
,
mems_1
=
model
(
inputs
)
result
=
{
result
=
{
"mems_1"
:
[
mem
.
numpy
()
for
m
in
mems_1
],
"mems_1"
:
[
mem
.
numpy
()
for
m
em
in
mems_1
],
"outputs"
:
outputs
.
numpy
(),
"outputs"
:
outputs
.
numpy
(),
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment