chenpangpang / transformers · Commits

Commit 16b63617
authored Sep 10, 2019 by thomwolf

    xlnet passing first test

parent 32aabe8c

Showing 4 changed files with 66 additions and 62 deletions (+66 -62)
pytorch_transformers/modeling_tf_bert.py                +9  -16
pytorch_transformers/modeling_tf_gpt2.py                +4  -8
pytorch_transformers/modeling_tf_xlnet.py               +47 -32
pytorch_transformers/tests/modeling_tf_xlnet_test.py    +6  -6
pytorch_transformers/modeling_tf_bert.py  (view file @ 16b63617)

@@ -218,8 +218,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
         embeddings = words_embeddings + position_embeddings + token_type_embeddings
         embeddings = self.LayerNorm(embeddings)
-        if training:
-            embeddings = self.dropout(embeddings)
+        embeddings = self.dropout(embeddings, training=training)
         return embeddings

     def _linear(self, inputs):
@@ -286,10 +285,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         # Normalize the attention scores to probabilities.
         attention_probs = tf.nn.softmax(attention_scores, axis=-1)

-        if training:
-            # This is actually dropping out entire tokens to attend to, which might
-            # seem a bit unusual, but is taken from the original Transformer paper.
-            attention_probs = self.dropout(attention_probs)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs, training=training)

         # Mask heads if we want to
         if head_mask is not None:
@@ -316,8 +314,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
         hidden_states, input_tensor = inputs

         hidden_states = self.dense(hidden_states)
-        if training:
-            hidden_states = self.dropout(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
         hidden_states = self.LayerNorm(hidden_states + input_tensor)
         return hidden_states
@@ -366,8 +363,7 @@ class TFBertOutput(tf.keras.layers.Layer):
         hidden_states, input_tensor = inputs

         hidden_states = self.dense(hidden_states)
-        if training:
-            hidden_states = self.dropout(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
         hidden_states = self.LayerNorm(hidden_states + input_tensor)
         return hidden_states
@@ -871,8 +867,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
         pooled_output = outputs[1]

-        if training:
-            pooled_output = self.dropout(pooled_output)
+        pooled_output = self.dropout(pooled_output, training=training)
         logits = self.classifier(pooled_output)

         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -947,8 +942,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
         pooled_output = outputs[1]

-        if training:
-            pooled_output = self.dropout(pooled_output)
+        pooled_output = self.dropout(pooled_output, training=training)
         logits = self.classifier(pooled_output)
         reshaped_logits = tf.reshape(logits, (-1, num_choices))
@@ -995,8 +989,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
         sequence_output = outputs[0]

-        if training:
-            sequence_output = self.dropout(sequence_output)
+        sequence_output = self.dropout(sequence_output, training=training)
         logits = self.classifier(sequence_output)

         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
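Note on the recurring change above: every hunk in modeling_tf_bert.py swaps a Python-level `if training:` guard for Keras's own `training` argument, so the Dropout layer itself decides at call time whether to drop activations. A minimal, self-contained sketch of the two behaviours (the rate and tensors below are illustrative, not taken from the diff):

import tensorflow as tf

dropout = tf.keras.layers.Dropout(rate=0.5)
x = tf.ones((1, 8))

# With training=False the layer passes inputs through unchanged,
# so no Python-level branch is needed around it.
print(dropout(x, training=False).numpy())   # all ones

# With training=True roughly half the activations are zeroed and the
# survivors are rescaled by 1 / (1 - rate), here a factor of 2.
print(dropout(x, training=True).numpy())    # mix of 0.0 and 2.0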
pytorch_transformers/modeling_tf_gpt2.py  (view file @ 16b63617)

@@ -178,8 +178,7 @@ class TFAttention(tf.keras.layers.Layer):
             w = w + attention_mask

         w = tf.nn.softmax(w, axis=-1)
-        if training:
-            w = self.attn_dropout(w)
+        w = self.attn_dropout(w, training=training)

         # Mask heads if we want to
         if head_mask is not None:
@@ -221,8 +220,7 @@ class TFAttention(tf.keras.layers.Layer):
         a = self.merge_heads(a)
         a = self.c_proj(a)
-        if training:
-            a = self.resid_dropout(a)
+        a = self.resid_dropout(a, training=training)

         outputs = [a, present] + attn_outputs[1:]
         return outputs  # a, present, (attentions)
@@ -240,8 +238,7 @@ class TFMLP(tf.keras.layers.Layer):
     def call(self, x, training=False):
         h = self.act(self.c_fc(x))
         h2 = self.c_proj(h)
-        if training:
-            h2 = self.dropout(h2)
+        h2 = self.dropout(h2, training=training)
         return h2
@@ -368,8 +365,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         else:
             token_type_embeds = 0

         hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        if training:
-            hidden_states = self.drop(hidden_states)
+        hidden_states = self.drop(hidden_states, training=training)

         output_shape = input_shape + [shape_list(hidden_states)[-1]]
pytorch_transformers/modeling_tf_xlnet.py  (view file @ 16b63617)

@@ -145,7 +145,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
         self.dropout = tf.keras.layers.Dropout(config.dropout)

-    def build(input_shape):
+    def build(self, input_shape):
         initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
         self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
                                  initializer=initializer,
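The hunk above also fixes the `build` signature: `tf.keras.layers.Layer.build` receives `self` and `input_shape`, runs once before the first call, and is the place where `add_weight` registers the layer's variables. A small sketch of that pattern with a made-up layer (the name `RelativeBias` and its shapes are illustrative, not part of this commit):

import tensorflow as tf

class RelativeBias(tf.keras.layers.Layer):
    """Toy layer illustrating deferred weight creation in build()."""

    def __init__(self, n_head, d_head, initializer_range=0.02, **kwargs):
        super(RelativeBias, self).__init__(**kwargs)
        self.n_head = n_head
        self.d_head = d_head
        self.initializer_range = initializer_range

    def build(self, input_shape):
        # Weights are created lazily, once the input shape is known.
        initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
        self.r_bias = self.add_weight(shape=(self.n_head, self.d_head),
                                      initializer=initializer,
                                      trainable=True,
                                      name='r_bias')
        super(RelativeBias, self).build(input_shape)

    def call(self, inputs):
        # Broadcast-add the learned bias over the trailing two dimensions.
        return inputs + self.r_bias

layer = RelativeBias(n_head=4, d_head=8)
out = layer(tf.zeros((2, 4, 8)))   # build() runs here, before the first call
print(layer.r_bias.shape)          # (4, 8)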
@@ -221,10 +221,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
             attn_score = attn_score - 1e30 * attn_mask

         # attention probability
-        attn_prob = tf.softmax(attn_score, axis=1)
+        attn_prob = tf.nn.softmax(attn_score, axis=1)

-        if training:
-            attn_prob = self.dropout(attn_prob)
+        attn_prob = self.dropout(attn_prob, training=training)

         # Mask heads if we want to
         if head_mask is not None:
@@ -245,10 +244,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o)

-        if training:
-            attn_out = self.dropout(attn_out)
+        attn_out = self.dropout(attn_out, training=training)

-        if residual:
+        if residual is not None:
             attn_out = attn_out + h
         output = self.layer_norm(attn_out)
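For context on the hunk above, the unchanged line `attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o)` folds the per-head attention outputs back into the model dimension. A standalone numerical check with toy sizes (all names and shapes here are illustrative):

import tensorflow as tf

qlen, bsz, n_head, d_head, d_model = 3, 2, 4, 5, 20

attn_vec = tf.random.uniform((qlen, bsz, n_head, d_head))   # axes: i b n d
o_proj = tf.random.uniform((d_model, n_head, d_head))       # axes: h n d (plays the role of self.o)

# Contract over the head (n) and per-head feature (d) axes, producing [qlen, bsz, d_model].
attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, o_proj)
print(attn_out.shape)  # (3, 2, 20)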
@@ -288,7 +286,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                 attn_vec_h, attn_prob_h = attn_vec_h

             # post processing
-            output_h = self.post_attention([h, attn_vec_h], training=training)
+            output_h = self.post_attention([h, attn_vec_h, None], training=training)

             ##### g-stream
             # query-stream query head
@@ -314,7 +312,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                 attn_vec_g, attn_prob_g = attn_vec_g

             # post processing
-            output_g = self.post_attention([g, attn_vec_g], training=training)
+            output_g = self.post_attention([g, attn_vec_g, None], training=training)

             if self.output_attentions:
                 attn_prob = attn_prob_h, attn_prob_g
@@ -343,7 +341,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                 attn_vec, attn_prob = attn_vec

             # post processing
-            output_h = self.post_attention([h, attn_vec], training=training)
+            output_h = self.post_attention([h, attn_vec, None], training=training)
             output_g = None

         outputs = (output_h, output_g)
@@ -368,11 +366,9 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
         output = inp
         output = self.layer_1(output)
         output = self.activation_function(output)
-        if training:
-            output = self.dropout(output)
+        output = self.dropout(output, training=training)
         output = self.layer_2(output)
-        if training:
-            output = self.dropout(output)
+        output = self.dropout(output, training=training)
         output = self.layer_norm(output + inp)
         return output
@@ -413,12 +409,12 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         self.initializer_range = config.initializer_range

         self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
-        self.layer = [XLNetLayer(config, name='layer_{}'.format(i)) for i in range(config.n_layer)]
+        self.layer = [TFXLNetLayer(config, name='layer_{}'.format(i)) for i in range(config.n_layer)]
         self.dropout = tf.keras.layers.Dropout(config.dropout)

-    def build(input_shape):
+    def build(self, input_shape):
         initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
-        self.mask_emb = self.add_weight(shape=(1, 1, config.d_model),
+        self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
                                         initializer=initializer, trainable=True, name='mask_emb')
@@ -532,16 +528,39 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         return pos_emb

     def call(self, inputs, training=False):
-        (input_ids, attention_mask, mems, perm_mask, target_mapping,
-         token_type_ids, input_mask, head_mask) = inputs
+        if not isinstance(inputs, (dict, tuple, list)):
+            input_ids = inputs
+            (attention_mask, mems, perm_mask, target_mapping,
+             token_type_ids, input_mask, head_mask) = None, None, None, None, None, None, None
+        elif isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else None
+            mems = inputs[2] if len(inputs) > 2 else None
+            perm_mask = inputs[3] if len(inputs) > 3 else None
+            target_mapping = inputs[4] if len(inputs) > 4 else None
+            token_type_ids = inputs[5] if len(inputs) > 5 else None
+            input_mask = inputs[6] if len(inputs) > 6 else None
+            head_mask = inputs[7] if len(inputs) > 7 else None
+            assert len(inputs) <= 8, "Too many inputs."
+        else:
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', None)
+            mems = inputs.get('mems', None)
+            perm_mask = inputs.get('perm_mask', None)
+            target_mapping = inputs.get('target_mapping', None)
+            token_type_ids = inputs.get('token_type_ids', None)
+            input_mask = inputs.get('input_mask', None)
+            head_mask = inputs.get('head_mask', None)
+            assert len(inputs) <= 8, "Too many inputs."

         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
-        input_ids = tf.transpose(input_ids, perm=(0, 1))
-        token_type_ids = tf.transpose(token_type_ids, perm=(0, 1)) if token_type_ids is not None else None
-        input_mask = tf.transpose(input_mask, perm=(0, 1)) if input_mask is not None else None
-        attention_mask = tf.transpose(attention_mask, perm=(0, 1)) if attention_mask is not None else None
+        input_ids = tf.transpose(input_ids, perm=(1, 0))
+        token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
+        input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
+        attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
         perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
         target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None
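The enlarged `call` above lets TFXLNetMainLayer accept a bare tensor, a positional list, or a dict of named tensors, and the corrected `perm=(1, 0)` transposes actually move the batch dimension to the end as the comment promises. A short sketch of the new calling conventions and the layout flip (the tensors are illustrative, and `layer` is assumed to be an already-built TFXLNetMainLayer):

import tensorflow as tf

# Toy inputs: batch of 2 sequences of length 5 (arbitrary token ids).
input_ids = tf.constant([[11, 12, 13, 14, 15],
                         [21, 22, 23, 24, 25]], dtype=tf.int32)
input_mask = tf.zeros((2, 5), dtype=tf.float32)

# Assuming `layer = TFXLNetMainLayer(config)` has been built, the same call now accepts:
#   layer(input_ids)                                               # a single tensor
#   layer([input_ids, None, None, None, None, None, input_mask])   # positional list
#   layer({'input_ids': input_ids, 'input_mask': input_mask})      # dict of named tensors

# Internally the layer flips to XLNet's [len, bsz] layout with the corrected perm:
input_ids_t = tf.transpose(input_ids, perm=(1, 0))
print(input_ids.shape, '->', input_ids_t.shape)  # (2, 5) -> (5, 2)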
@@ -597,15 +616,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         ##### Word embeddings and prepare h & g hidden states
         word_emb_k = self.word_embedding(input_ids)
-        if training:
-            output_h = self.dropout(word_emb_k)
+        output_h = self.dropout(word_emb_k, training=training)
         if target_mapping is not None:
             word_emb_q = tf.tile(mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
         # else:  # We removed the inp_q input which was same as target mapping
         #     inp_q_ext = inp_q[:, :, None]
         #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
-            if training:
-                output_g = self.dropout(word_emb_q)
+            output_g = self.dropout(word_emb_q, training=training)
         else:
             output_g = None
@@ -625,8 +642,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         ##### Positional encoding
         pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float)
-        if training:
-            pos_emb = self.dropout(pos_emb)
+        pos_emb = self.dropout(pos_emb, training=training)

         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
@@ -666,8 +682,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             if self.output_hidden_states:
                 hidden_states.append((output_h, output_g) if output_g is not None else output_h)

-        if training:
-            output = self.dropout(output_g if output_g is not None else output_h)
+        output = self.dropout(output_g if output_g is not None else output_h, training=training)

         # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
         outputs = (tf.transpose(output, perm=(1, 0, 2)), new_mems)
@@ -805,7 +820,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFBertMainLayer(config, name='transformer')
+        self.transformer = TFXLNetMainLayer(config, name='transformer')

     def call(self, inputs, training=False):
         outputs = self.transformer(inputs, training=training)
pytorch_transformers/tests/modeling_tf_xlnet_test.py  (view file @ 16b63617)

@@ -105,8 +105,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
             perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
             perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
             # perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=torch.float32)
-            target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=torch.float32)
+            target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
+            target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
             target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
             # target_mapping[:, 0, -1] = 1.0  # predict last token
@@ -145,18 +145,18 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                                   target_mapping, segment_ids, lm_labels,
                                   sequence_labels, is_impossible_labels):
             model = TFXLNetModel(config)

-            inputs = {'input_ids': input_ids,
+            inputs = {'input_ids': input_ids_1,
                       'input_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+                      'token_type_ids': segment_ids}
             _, _ = model(inputs)

-            inputs = [input_ids, input_mask]
+            inputs = [input_ids_1, input_mask]

             outputs, mems_1 = model(inputs)

             result = {
-                "mems_1": [mem.numpy() for m in mems_1],
+                "mems_1": [mem.numpy() for mem in mems_1],
                 "outputs": outputs.numpy(),
             }
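The test fixes replace the stray torch.float32 dtypes with tf.float32, reference the fixture names that actually exist (`input_ids_1`, `segment_ids`), and repair the `mem`/`m` loop variable in the comprehension. A runnable sketch of the mask construction in the same spirit (the initial `perm_mask` shape is assumed here, since its creation sits above the shown hunk):

import tensorflow as tf

batch_size, seq_length = 2, 4

# Masks must use TensorFlow dtypes (tf.float32), not torch.float32.
perm_mask = tf.zeros((batch_size, seq_length + 1, seq_length), dtype=tf.float32)
perm_mask_last = tf.ones((batch_size, seq_length + 1, 1), dtype=tf.float32)
perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)         # previous tokens don't see the last token

target_mapping = tf.zeros((batch_size, 1, seq_length), dtype=tf.float32)
target_mapping_last = tf.ones((batch_size, 1, 1), dtype=tf.float32)
target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)  # predict the last token

print(perm_mask.shape, target_mapping.shape)  # (2, 5, 5) (2, 1, 5)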