Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
16b63617
Commit
16b63617
authored
Sep 10, 2019
by
thomwolf
Browse files
xlnet paassing first test
parent
32aabe8c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
66 additions
and
62 deletions
+66
-62
pytorch_transformers/modeling_tf_bert.py
pytorch_transformers/modeling_tf_bert.py
+9
-16
pytorch_transformers/modeling_tf_gpt2.py
pytorch_transformers/modeling_tf_gpt2.py
+4
-8
pytorch_transformers/modeling_tf_xlnet.py
pytorch_transformers/modeling_tf_xlnet.py
+47
-32
pytorch_transformers/tests/modeling_tf_xlnet_test.py
pytorch_transformers/tests/modeling_tf_xlnet_test.py
+6
-6
No files found.
pytorch_transformers/modeling_tf_bert.py
View file @
16b63617
...
...
@@ -218,8 +218,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
embeddings
=
words_embeddings
+
position_embeddings
+
token_type_embeddings
embeddings
=
self
.
LayerNorm
(
embeddings
)
if
training
:
embeddings
=
self
.
dropout
(
embeddings
)
embeddings
=
self
.
dropout
(
embeddings
,
training
=
training
)
return
embeddings
def
_linear
(
self
,
inputs
):
...
...
@@ -286,10 +285,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
# Normalize the attention scores to probabilities.
attention_probs
=
tf
.
nn
.
softmax
(
attention_scores
,
axis
=-
1
)
if
training
:
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs
=
self
.
dropout
(
attention_probs
)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs
=
self
.
dropout
(
attention_probs
,
training
=
training
)
# Mask heads if we want to
if
head_mask
is
not
None
:
...
...
@@ -316,8 +314,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
hidden_states
,
input_tensor
=
inputs
hidden_states
=
self
.
dense
(
hidden_states
)
if
training
:
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
dropout
(
hidden_states
,
training
=
training
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
return
hidden_states
...
...
@@ -366,8 +363,7 @@ class TFBertOutput(tf.keras.layers.Layer):
hidden_states
,
input_tensor
=
inputs
hidden_states
=
self
.
dense
(
hidden_states
)
if
training
:
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
dropout
(
hidden_states
,
training
=
training
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
return
hidden_states
...
...
@@ -871,8 +867,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
pooled_output
=
outputs
[
1
]
if
training
:
pooled_output
=
self
.
dropout
(
pooled_output
)
pooled_output
=
self
.
dropout
(
pooled_output
,
training
=
training
)
logits
=
self
.
classifier
(
pooled_output
)
outputs
=
(
logits
,)
+
outputs
[
2
:]
# add hidden states and attention if they are here
...
...
@@ -947,8 +942,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
pooled_output
=
outputs
[
1
]
if
training
:
pooled_output
=
self
.
dropout
(
pooled_output
)
pooled_output
=
self
.
dropout
(
pooled_output
,
training
=
training
)
logits
=
self
.
classifier
(
pooled_output
)
reshaped_logits
=
tf
.
reshape
(
logits
,
(
-
1
,
num_choices
))
...
...
@@ -995,8 +989,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
sequence_output
=
outputs
[
0
]
if
training
:
sequence_output
=
self
.
dropout
(
sequence_output
)
sequence_output
=
self
.
dropout
(
sequence_output
,
training
=
training
)
logits
=
self
.
classifier
(
sequence_output
)
outputs
=
(
logits
,)
+
outputs
[
2
:]
# add hidden states and attention if they are here
...
...
pytorch_transformers/modeling_tf_gpt2.py
View file @
16b63617
...
...
@@ -178,8 +178,7 @@ class TFAttention(tf.keras.layers.Layer):
w
=
w
+
attention_mask
w
=
tf
.
nn
.
softmax
(
w
,
axis
=-
1
)
if
training
:
w
=
self
.
attn_dropout
(
w
)
w
=
self
.
attn_dropout
(
w
,
training
=
training
)
# Mask heads if we want to
if
head_mask
is
not
None
:
...
...
@@ -221,8 +220,7 @@ class TFAttention(tf.keras.layers.Layer):
a
=
self
.
merge_heads
(
a
)
a
=
self
.
c_proj
(
a
)
if
training
:
a
=
self
.
resid_dropout
(
a
)
a
=
self
.
resid_dropout
(
a
,
training
=
training
)
outputs
=
[
a
,
present
]
+
attn_outputs
[
1
:]
return
outputs
# a, present, (attentions)
...
...
@@ -240,8 +238,7 @@ class TFMLP(tf.keras.layers.Layer):
def
call
(
self
,
x
,
training
=
False
):
h
=
self
.
act
(
self
.
c_fc
(
x
))
h2
=
self
.
c_proj
(
h
)
if
training
:
h2
=
self
.
dropout
(
h2
)
h2
=
self
.
dropout
(
h2
,
training
=
training
)
return
h2
...
...
@@ -368,8 +365,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
else
:
token_type_embeds
=
0
hidden_states
=
inputs_embeds
+
position_embeds
+
token_type_embeds
if
training
:
hidden_states
=
self
.
drop
(
hidden_states
)
hidden_states
=
self
.
drop
(
hidden_states
,
training
=
training
)
output_shape
=
input_shape
+
[
shape_list
(
hidden_states
)[
-
1
]]
...
...
pytorch_transformers/modeling_tf_xlnet.py
View file @
16b63617
...
...
@@ -145,7 +145,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm'
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
def
build
(
input_shape
):
def
build
(
self
,
input_shape
):
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
self
.
q
=
self
.
add_weight
(
shape
=
(
self
.
d_model
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
...
...
@@ -221,10 +221,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_score
=
attn_score
-
1e30
*
attn_mask
# attention probability
attn_prob
=
tf
.
softmax
(
attn_score
,
axis
=
1
)
attn_prob
=
tf
.
nn
.
softmax
(
attn_score
,
axis
=
1
)
if
training
:
attn_prob
=
self
.
dropout
(
attn_prob
)
attn_prob
=
self
.
dropout
(
attn_prob
,
training
=
training
)
# Mask heads if we want to
if
head_mask
is
not
None
:
...
...
@@ -245,10 +244,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_out
=
tf
.
einsum
(
'ibnd,hnd->ibh'
,
attn_vec
,
self
.
o
)
if
training
:
attn_out
=
self
.
dropout
(
attn_out
)
attn_out
=
self
.
dropout
(
attn_out
,
training
=
training
)
if
residual
:
if
residual
is
not
None
:
attn_out
=
attn_out
+
h
output
=
self
.
layer_norm
(
attn_out
)
...
...
@@ -288,7 +286,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_vec_h
,
attn_prob_h
=
attn_vec_h
# post processing
output_h
=
self
.
post_attention
([
h
,
attn_vec_h
],
training
=
training
)
output_h
=
self
.
post_attention
([
h
,
attn_vec_h
,
None
],
training
=
training
)
##### g-stream
# query-stream query head
...
...
@@ -314,7 +312,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_vec_g
,
attn_prob_g
=
attn_vec_g
# post processing
output_g
=
self
.
post_attention
([
g
,
attn_vec_g
],
training
=
training
)
output_g
=
self
.
post_attention
([
g
,
attn_vec_g
,
None
],
training
=
training
)
if
self
.
output_attentions
:
attn_prob
=
attn_prob_h
,
attn_prob_g
...
...
@@ -343,7 +341,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
attn_vec
,
attn_prob
=
attn_vec
# post processing
output_h
=
self
.
post_attention
([
h
,
attn_vec
],
training
=
training
)
output_h
=
self
.
post_attention
([
h
,
attn_vec
,
None
],
training
=
training
)
output_g
=
None
outputs
=
(
output_h
,
output_g
)
...
...
@@ -368,11 +366,9 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
output
=
inp
output
=
self
.
layer_1
(
output
)
output
=
self
.
activation_function
(
output
)
if
training
:
output
=
self
.
dropout
(
output
)
output
=
self
.
dropout
(
output
,
training
=
training
)
output
=
self
.
layer_2
(
output
)
if
training
:
output
=
self
.
dropout
(
output
)
output
=
self
.
dropout
(
output
,
training
=
training
)
output
=
self
.
layer_norm
(
output
+
inp
)
return
output
...
...
@@ -413,12 +409,12 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
self
.
initializer_range
=
config
.
initializer_range
self
.
word_embedding
=
TFSharedEmbeddings
(
config
.
n_token
,
config
.
d_model
,
initializer_range
=
config
.
initializer_range
,
name
=
'word_embedding'
)
self
.
layer
=
[
XLNetLayer
(
config
,
name
=
'layer_{}'
.
format
(
i
))
for
i
in
range
(
config
.
n_layer
)]
self
.
layer
=
[
TF
XLNetLayer
(
config
,
name
=
'layer_{}'
.
format
(
i
))
for
i
in
range
(
config
.
n_layer
)]
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
def
build
(
input_shape
):
def
build
(
self
,
input_shape
):
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
self
.
mask_emb
=
self
.
add_weight
(
shape
=
(
1
,
1
,
config
.
d_model
),
self
.
mask_emb
=
self
.
add_weight
(
shape
=
(
1
,
1
,
self
.
d_model
),
initializer
=
initializer
,
trainable
=
True
,
name
=
'mask_emb'
)
...
...
@@ -532,16 +528,39 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
return
pos_emb
def
call
(
self
,
inputs
,
training
=
False
):
(
input_ids
,
attention_mask
,
mems
,
perm_mask
,
target_mapping
,
token_type_ids
,
input_mask
,
head_mask
)
=
inputs
if
not
isinstance
(
inputs
,
(
dict
,
tuple
,
list
)):
input_ids
=
inputs
(
attention_mask
,
mems
,
perm_mask
,
target_mapping
,
token_type_ids
,
input_mask
,
head_mask
)
=
None
,
None
,
None
,
None
,
None
,
None
,
None
elif
isinstance
(
inputs
,
(
tuple
,
list
)):
input_ids
=
inputs
[
0
]
attention_mask
=
inputs
[
1
]
if
len
(
inputs
)
>
1
else
None
mems
=
inputs
[
2
]
if
len
(
inputs
)
>
2
else
None
perm_mask
=
inputs
[
3
]
if
len
(
inputs
)
>
3
else
None
target_mapping
=
inputs
[
4
]
if
len
(
inputs
)
>
4
else
None
token_type_ids
=
inputs
[
5
]
if
len
(
inputs
)
>
5
else
None
input_mask
=
inputs
[
6
]
if
len
(
inputs
)
>
6
else
None
head_mask
=
inputs
[
7
]
if
len
(
inputs
)
>
7
else
None
assert
len
(
inputs
)
<=
8
,
"Too many inputs."
else
:
input_ids
=
inputs
.
get
(
'input_ids'
)
attention_mask
=
inputs
.
get
(
'attention_mask'
,
None
)
mems
=
inputs
.
get
(
'mems'
,
None
)
perm_mask
=
inputs
.
get
(
'perm_mask'
,
None
)
target_mapping
=
inputs
.
get
(
'target_mapping'
,
None
)
token_type_ids
=
inputs
.
get
(
'token_type_ids'
,
None
)
input_mask
=
inputs
.
get
(
'input_mask'
,
None
)
head_mask
=
inputs
.
get
(
'head_mask'
,
None
)
assert
len
(
inputs
)
<=
8
,
"Too many inputs."
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
# so we move here the first dimension (batch) to the end
input_ids
=
tf
.
transpose
(
input_ids
,
perm
=
(
0
,
1
))
token_type_ids
=
tf
.
transpose
(
token_type_ids
,
perm
=
(
0
,
1
))
if
token_type_ids
is
not
None
else
None
input_mask
=
tf
.
transpose
(
input_mask
,
perm
=
(
0
,
1
))
if
input_mask
is
not
None
else
None
attention_mask
=
tf
.
transpose
(
attention_mask
,
perm
=
(
0
,
1
))
if
attention_mask
is
not
None
else
None
input_ids
=
tf
.
transpose
(
input_ids
,
perm
=
(
1
,
0
))
token_type_ids
=
tf
.
transpose
(
token_type_ids
,
perm
=
(
1
,
0
))
if
token_type_ids
is
not
None
else
None
input_mask
=
tf
.
transpose
(
input_mask
,
perm
=
(
1
,
0
))
if
input_mask
is
not
None
else
None
attention_mask
=
tf
.
transpose
(
attention_mask
,
perm
=
(
1
,
0
))
if
attention_mask
is
not
None
else
None
perm_mask
=
tf
.
transpose
(
perm_mask
,
perm
=
(
1
,
2
,
0
))
if
perm_mask
is
not
None
else
None
target_mapping
=
tf
.
transpose
(
target_mapping
,
perm
=
(
1
,
2
,
0
))
if
target_mapping
is
not
None
else
None
...
...
@@ -597,15 +616,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
##### Word embeddings and prepare h & g hidden states
word_emb_k
=
self
.
word_embedding
(
input_ids
)
if
training
:
output_h
=
self
.
dropout
(
word_emb_k
)
output_h
=
self
.
dropout
(
word_emb_k
,
training
=
training
)
if
target_mapping
is
not
None
:
word_emb_q
=
tf
.
tile
(
mask_emb
,
[
tf
.
shape
(
target_mapping
)[
0
],
bsz
,
1
])
# else: # We removed the inp_q input which was same as target mapping
# inp_q_ext = inp_q[:, :, None]
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
if
training
:
output_g
=
self
.
dropout
(
word_emb_q
)
output_g
=
self
.
dropout
(
word_emb_q
,
training
=
training
)
else
:
output_g
=
None
...
...
@@ -625,8 +642,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
##### Positional encoding
pos_emb
=
self
.
relative_positional_encoding
(
qlen
,
klen
,
bsz
=
bsz
,
dtype
=
dtype_float
)
if
training
:
pos_emb
=
self
.
dropout
(
pos_emb
)
pos_emb
=
self
.
dropout
(
pos_emb
,
training
=
training
)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
...
...
@@ -666,8 +682,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
if
self
.
output_hidden_states
:
hidden_states
.
append
((
output_h
,
output_g
)
if
output_g
is
not
None
else
output_h
)
if
training
:
output
=
self
.
dropout
(
output_g
if
output_g
is
not
None
else
output_h
)
output
=
self
.
dropout
(
output_g
if
output_g
is
not
None
else
output_h
,
training
=
training
)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
outputs
=
(
tf
.
transpose
(
output
,
perm
=
(
1
,
0
,
2
)),
new_mems
)
...
...
@@ -805,7 +820,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TF
Ber
tMainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TF
XLNe
tMainLayer
(
config
,
name
=
'transformer'
)
def
call
(
self
,
inputs
,
training
=
False
):
outputs
=
self
.
transformer
(
inputs
,
training
=
training
)
...
...
pytorch_transformers/tests/modeling_tf_xlnet_test.py
View file @
16b63617
...
...
@@ -105,8 +105,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
perm_mask_last
=
tf
.
ones
((
self
.
batch_size
,
self
.
seq_length
+
1
,
1
),
dtype
=
tf
.
float32
)
perm_mask
=
tf
.
concat
([
perm_mask
,
perm_mask_last
],
axis
=-
1
)
# perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping
=
tf
.
zeros
((
self
.
batch_size
,
1
,
self
.
seq_length
),
dtype
=
t
orch
.
float32
)
target_mapping_last
=
tf
.
ones
((
self
.
batch_size
,
1
,
1
),
dtype
=
t
orch
.
float32
)
target_mapping
=
tf
.
zeros
((
self
.
batch_size
,
1
,
self
.
seq_length
),
dtype
=
t
f
.
float32
)
target_mapping_last
=
tf
.
ones
((
self
.
batch_size
,
1
,
1
),
dtype
=
t
f
.
float32
)
target_mapping
=
tf
.
concat
([
target_mapping
,
target_mapping_last
],
axis
=-
1
)
# target_mapping[:, 0, -1] = 1.0 # predict last token
...
...
@@ -145,18 +145,18 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
target_mapping
,
segment_ids
,
lm_labels
,
sequence_labels
,
is_impossible_labels
):
model
=
TFXLNetModel
(
config
)
inputs
=
{
'input_ids'
:
input_ids
,
inputs
=
{
'input_ids'
:
input_ids
_1
,
'input_mask'
:
input_mask
,
'token_type_ids'
:
token_type
_ids
}
'token_type_ids'
:
segment
_ids
}
_
,
_
=
model
(
inputs
)
inputs
=
[
input_ids
,
input_mask
]
inputs
=
[
input_ids
_1
,
input_mask
]
outputs
,
mems_1
=
model
(
inputs
)
result
=
{
"mems_1"
:
[
mem
.
numpy
()
for
m
in
mems_1
],
"mems_1"
:
[
mem
.
numpy
()
for
m
em
in
mems_1
],
"outputs"
:
outputs
.
numpy
(),
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment