Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
57053334
Commit
57053334
authored
Sep 26, 2019
by
thomwolf
Browse files
add initialization for everybody
parent
f2a337b3
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
195 additions
and
99 deletions
+195
-99
examples/run_tf_glue.py
examples/run_tf_glue.py
+30
-19
pytorch_transformers/modeling_tf_distilbert.py
pytorch_transformers/modeling_tf_distilbert.py
+42
-15
pytorch_transformers/modeling_tf_gpt2.py
pytorch_transformers/modeling_tf_gpt2.py
+14
-8
pytorch_transformers/modeling_tf_openai.py
pytorch_transformers/modeling_tf_openai.py
+14
-8
pytorch_transformers/modeling_tf_roberta.py
pytorch_transformers/modeling_tf_roberta.py
+11
-4
pytorch_transformers/modeling_tf_transfo_xl.py
pytorch_transformers/modeling_tf_transfo_xl.py
+36
-13
pytorch_transformers/modeling_tf_utils.py
pytorch_transformers/modeling_tf_utils.py
+9
-9
pytorch_transformers/modeling_tf_xlm.py
pytorch_transformers/modeling_tf_xlm.py
+20
-12
pytorch_transformers/modeling_tf_xlnet.py
pytorch_transformers/modeling_tf_xlnet.py
+19
-11
No files found.
examples/run_tf_glue.py
View file @
57053334
import
tensorflow
as
tf
import
tensorflow
as
tf
import
tensorflow_datasets
import
tensorflow_datasets
from
transformers
import
*
from
pytorch_
transformers
import
*
# Load dataset, tokenizer, model from pretrained model/vocabulary
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer
=
BertTokenizer
.
from_pretrained
(
'bert-base-cased'
)
tokenizer
=
BertTokenizer
.
from_pretrained
(
'bert-base-cased'
)
dataset
=
tensorflow_datasets
.
load
(
'glue/mrpc'
)
model
=
TFBertForSequenceClassification
.
from_pretrained
(
'bert-base-cased'
)
model
=
TFBertForSequenceClassification
.
from_pretrained
(
'bert-base-cased'
)
data
=
tensorflow_datasets
.
load
(
'glue/mrpc'
)
# Prepare dataset for GLUE as a tf.data.Dataset instance
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset
=
glue_convert_examples_to_features
(
data
set
[
'train'
],
tokenizer
,
task
=
'mrpc'
)
train_dataset
=
glue_convert_examples_to_features
(
data
[
'train'
],
tokenizer
,
128
,
'mrpc'
)
valid_dataset
=
glue_convert_examples_to_features
(
data
set
[
'validation'
],
tokenizer
,
task
=
'mrpc'
)
valid_dataset
=
glue_convert_examples_to_features
(
data
[
'validation'
],
tokenizer
,
128
,
'mrpc'
)
train_dataset
=
train_dataset
.
shuffle
(
100
).
batch
(
32
).
repeat
(
3
)
train_dataset
=
train_dataset
.
shuffle
(
100
).
batch
(
32
).
repeat
(
2
)
valid_dataset
=
valid_dataset
.
batch
(
64
)
valid_dataset
=
valid_dataset
.
batch
(
64
)
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
learning_rate
=
tf
.
keras
.
optimizers
.
schedules
.
PolynomialDecay
(
2e-5
,
345
,
end_learning_rate
=
0
)
optimizer
=
tf
.
keras
.
optimizers
.
Adam
(
learning_rate
=
3e-5
,
epsilon
=
1e-08
,
clipnorm
=
1.0
)
optimizer
=
tf
.
keras
.
optimizers
.
Adam
(
learning_rate
=
learning_rate
,
epsilon
=
1e-08
,
clipnorm
=
1.0
)
loss
=
tf
.
keras
.
losses
.
SparseCategoricalCrossentropy
(
from_logits
=
True
)
loss
=
tf
.
keras
.
losses
.
SparseCategoricalCrossentropy
(
from_logits
=
True
)
metric
=
tf
.
keras
.
metrics
.
SparseCategoricalAccuracy
(
'accuracy'
)
model
.
compile
(
optimizer
=
optimizer
,
loss
=
loss
,
metrics
=
[
'sparse_categorical_accuracy'
])
model
.
compile
(
optimizer
=
optimizer
,
loss
=
loss
,
metrics
=
[
metric
])
# Train and evaluate using tf.keras.Model.fit()
# Train and evaluate using tf.keras.Model.fit()
model
.
fit
(
train_dataset
,
epochs
=
3
,
steps_per_epoch
=
115
,
history
=
model
.
fit
(
train_dataset
,
epochs
=
2
,
steps_per_epoch
=
115
,
validation_data
=
valid_dataset
,
validation_steps
=
7
)
validation_data
=
valid_dataset
,
validation_steps
=
7
)
# Save the TensorFlow model and load it in PyTorch
>>>
Train
for
115
steps
,
validate
for
7
steps
>>>
Epoch
1
/
2
>>>
115
/
115
[
==============================
]
-
53
s
459
ms
/
step
-
loss
:
0.6033
-
accuracy
:
0.6712
-
val_loss
:
0.4964
-
val_accuracy
:
0.7647
>>>
Epoch
2
/
2
>>>
115
/
115
[
==============================
]
-
33
s
289
ms
/
step
-
loss
:
0.4141
-
accuracy
:
0.8160
-
val_loss
:
0.3914
-
val_accuracy
:
0.8382
# Load the TensorFlow model in PyTorch for inspection
model
.
save_pretrained
(
'./save/'
)
model
.
save_pretrained
(
'./save/'
)
pytorch_model
=
BertForSequenceClassification
.
from_pretrained
(
'./save/'
,
from_tf
=
True
)
pytorch_model
=
BertForSequenceClassification
.
from_pretrained
(
'./save/'
,
from_tf
=
True
)
# Quickly inspect a few predictions - MRPC is a paraphrasing task
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
inputs
=
tokenizer
.
encode_plus
(
"The company is doing great"
,
sentence_0
=
"This research was consistent with his findings."
"The company has good results"
,
sentence_1
=
"His findings were compatible with this research."
add_special_tokens
=
True
,
sentence_2
=
"His findings were not compatible with this research."
return_tensors
=
'pt'
)
inputs_1
=
tokenizer
.
encode_plus
(
sentence_0
,
sentence_1
,
add_special_tokens
=
True
,
return_tensors
=
'pt'
)
pred
=
pytorch_model
(
**
inputs
)
inputs_2
=
tokenizer
.
encode_plus
(
sentence_0
,
sentence_2
,
add_special_tokens
=
True
,
return_tensors
=
'pt'
)
print
(
"Paraphrase"
if
pred
.
argmax
().
item
()
==
0
else
"Not paraphrase"
)
pred_1
=
pytorch_model
(
**
inputs_1
)[
0
].
argmax
().
item
()
pred_2
=
pytorch_model
(
**
inputs_2
)[
0
].
argmax
().
item
()
print
(
"sentence_1 is"
,
"a paraphrase"
if
pred_1
else
"not a paraphrase"
,
"of sentence_0"
)
print
(
"sentence_2 is"
,
"a paraphrase"
if
pred_2
else
"not a paraphrase"
,
"of sentence_0"
)
>>>
sentence_1
is
a
paraphrase
of
sentence_0
>>>
sentence_2
is
not
a
paraphrase
of
sentence_0
\ No newline at end of file
pytorch_transformers/modeling_tf_distilbert.py
View file @
57053334
...
@@ -29,7 +29,7 @@ import numpy as np
...
@@ -29,7 +29,7 @@ import numpy as np
import
tensorflow
as
tf
import
tensorflow
as
tf
from
.configuration_distilbert
import
DistilBertConfig
from
.configuration_distilbert
import
DistilBertConfig
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFSharedEmbeddings
,
shape_list
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFSharedEmbeddings
,
shape_list
,
get_initializer
from
.file_utils
import
add_start_docstrings
from
.file_utils
import
add_start_docstrings
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
...
@@ -79,8 +79,15 @@ class TFEmbeddings(tf.keras.layers.Layer):
...
@@ -79,8 +79,15 @@ class TFEmbeddings(tf.keras.layers.Layer):
super
(
TFEmbeddings
,
self
).
__init__
(
**
kwargs
)
super
(
TFEmbeddings
,
self
).
__init__
(
**
kwargs
)
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
dim
=
config
.
dim
self
.
dim
=
config
.
dim
self
.
word_embeddings
=
TFSharedEmbeddings
(
config
.
vocab_size
,
config
.
dim
,
name
=
'word_embeddings'
)
# padding_idx=0)
self
.
initializer_range
=
config
.
initializer_range
self
.
position_embeddings
=
tf
.
keras
.
layers
.
Embedding
(
config
.
max_position_embeddings
,
config
.
dim
,
name
=
'position_embeddings'
)
self
.
word_embeddings
=
TFSharedEmbeddings
(
config
.
vocab_size
,
config
.
dim
,
initializer_range
=
config
.
initializer_range
,
name
=
'word_embeddings'
)
# padding_idx=0)
self
.
position_embeddings
=
tf
.
keras
.
layers
.
Embedding
(
config
.
max_position_embeddings
,
config
.
dim
,
embeddings_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'position_embeddings'
)
if
config
.
sinusoidal_pos_embds
:
if
config
.
sinusoidal_pos_embds
:
raise
NotImplementedError
raise
NotImplementedError
...
@@ -95,8 +102,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
...
@@ -95,8 +102,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
self
.
word_embeddings
=
self
.
add_weight
(
self
.
word_embeddings
=
self
.
add_weight
(
"weight"
,
"weight"
,
shape
=
[
self
.
vocab_size
,
self
.
dim
],
shape
=
[
self
.
vocab_size
,
self
.
dim
],
initializer
=
tf
.
random_normal_initializer
(
initializer
=
get_initializer
(
self
.
initializer_range
))
mean
=
0.
,
stddev
=
self
.
dim
**-
0.5
))
super
(
TFEmbeddings
,
self
).
build
(
input_shape
)
super
(
TFEmbeddings
,
self
).
build
(
input_shape
)
def
call
(
self
,
inputs
,
mode
=
"embedding"
,
training
=
False
):
def
call
(
self
,
inputs
,
mode
=
"embedding"
,
training
=
False
):
...
@@ -178,10 +184,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
...
@@ -178,10 +184,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
assert
self
.
dim
%
self
.
n_heads
==
0
assert
self
.
dim
%
self
.
n_heads
==
0
self
.
q_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
name
=
"q_lin"
)
self
.
q_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
self
.
k_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
name
=
"k_lin"
)
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
self
.
v_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
name
=
"v_lin"
)
name
=
"q_lin"
)
self
.
out_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
name
=
"out_lin"
)
self
.
k_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"k_lin"
)
self
.
v_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"v_lin"
)
self
.
out_lin
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"out_lin"
)
self
.
pruned_heads
=
set
()
self
.
pruned_heads
=
set
()
...
@@ -254,8 +268,12 @@ class TFFFN(tf.keras.layers.Layer):
...
@@ -254,8 +268,12 @@ class TFFFN(tf.keras.layers.Layer):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFFFN
,
self
).
__init__
(
**
kwargs
)
super
(
TFFFN
,
self
).
__init__
(
**
kwargs
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
lin1
=
tf
.
keras
.
layers
.
Dense
(
config
.
hidden_dim
,
name
=
"lin1"
)
self
.
lin1
=
tf
.
keras
.
layers
.
Dense
(
config
.
hidden_dim
,
self
.
lin2
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
name
=
"lin2"
)
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"lin1"
)
self
.
lin2
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"lin2"
)
assert
config
.
activation
in
[
'relu'
,
'gelu'
],
"activation ({}) must be in ['relu', 'gelu']"
.
format
(
config
.
activation
)
assert
config
.
activation
in
[
'relu'
,
'gelu'
],
"activation ({}) must be in ['relu', 'gelu']"
.
format
(
config
.
activation
)
self
.
activation
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
if
config
.
activation
==
'gelu'
else
tf
.
keras
.
activations
.
relu
self
.
activation
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
if
config
.
activation
==
'gelu'
else
tf
.
keras
.
activations
.
relu
...
@@ -596,7 +614,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
...
@@ -596,7 +614,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
distilbert
=
TFDistilBertMainLayer
(
config
,
name
=
"distilbert"
)
self
.
distilbert
=
TFDistilBertMainLayer
(
config
,
name
=
"distilbert"
)
self
.
vocab_transform
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
name
=
"vocab_transform"
)
self
.
vocab_transform
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"vocab_transform"
)
self
.
act
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
self
.
act
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
self
.
vocab_layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
1e-12
,
name
=
"vocab_layer_norm"
)
self
.
vocab_layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
1e-12
,
name
=
"vocab_layer_norm"
)
self
.
vocab_projector
=
TFDistilBertLMHead
(
config
,
self
.
distilbert
.
embeddings
,
name
=
"vocab_projector"
)
self
.
vocab_projector
=
TFDistilBertLMHead
(
config
,
self
.
distilbert
.
embeddings
,
name
=
"vocab_projector"
)
...
@@ -647,8 +667,13 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
...
@@ -647,8 +667,13 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
self
.
num_labels
=
config
.
num_labels
self
.
num_labels
=
config
.
num_labels
self
.
distilbert
=
TFDistilBertMainLayer
(
config
,
name
=
"distilbert"
)
self
.
distilbert
=
TFDistilBertMainLayer
(
config
,
name
=
"distilbert"
)
self
.
pre_classifier
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
activation
=
'relu'
,
name
=
"pre_classifier"
)
self
.
pre_classifier
=
tf
.
keras
.
layers
.
Dense
(
config
.
dim
,
self
.
classifier
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
name
=
"classifier"
)
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
activation
=
'relu'
,
name
=
"pre_classifier"
)
self
.
classifier
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"classifier"
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
seq_classif_dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
seq_classif_dropout
)
def
call
(
self
,
inputs
,
**
kwargs
):
def
call
(
self
,
inputs
,
**
kwargs
):
...
@@ -700,7 +725,9 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
...
@@ -700,7 +725,9 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
super
(
TFDistilBertForQuestionAnswering
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
(
TFDistilBertForQuestionAnswering
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
distilbert
=
TFDistilBertMainLayer
(
config
,
name
=
"distilbert"
)
self
.
distilbert
=
TFDistilBertMainLayer
(
config
,
name
=
"distilbert"
)
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
name
=
'qa_outputs'
)
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'qa_outputs'
)
assert
config
.
num_labels
==
2
assert
config
.
num_labels
==
2
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
qa_dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
qa_dropout
)
...
...
pytorch_transformers/modeling_tf_gpt2.py
View file @
57053334
...
@@ -29,7 +29,7 @@ import numpy as np
...
@@ -29,7 +29,7 @@ import numpy as np
import
tensorflow
as
tf
import
tensorflow
as
tf
from
.modeling_tf_utils
import
(
TFPreTrainedModel
,
TFConv1D
,
TFSharedEmbeddings
,
from
.modeling_tf_utils
import
(
TFPreTrainedModel
,
TFConv1D
,
TFSharedEmbeddings
,
TFSequenceSummary
,
shape_list
)
TFSequenceSummary
,
shape_list
,
get_initializer
)
from
.configuration_gpt2
import
GPT2Config
from
.configuration_gpt2
import
GPT2Config
from
.file_utils
import
add_start_docstrings
from
.file_utils
import
add_start_docstrings
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
...
@@ -76,8 +76,8 @@ class TFAttention(tf.keras.layers.Layer):
...
@@ -76,8 +76,8 @@ class TFAttention(tf.keras.layers.Layer):
self
.
split_size
=
n_state
self
.
split_size
=
n_state
self
.
scale
=
scale
self
.
scale
=
scale
self
.
c_attn
=
TFConv1D
(
n_state
*
3
,
nx
,
name
=
'c_attn'
)
self
.
c_attn
=
TFConv1D
(
n_state
*
3
,
nx
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_attn'
)
self
.
c_proj
=
TFConv1D
(
n_state
,
nx
,
name
=
'c_proj'
)
self
.
c_proj
=
TFConv1D
(
n_state
,
nx
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_proj'
)
self
.
attn_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attn_pdrop
)
self
.
attn_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attn_pdrop
)
self
.
resid_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
self
.
resid_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
self
.
pruned_heads
=
set
()
self
.
pruned_heads
=
set
()
...
@@ -166,8 +166,8 @@ class TFMLP(tf.keras.layers.Layer):
...
@@ -166,8 +166,8 @@ class TFMLP(tf.keras.layers.Layer):
def
__init__
(
self
,
n_state
,
config
,
**
kwargs
):
def
__init__
(
self
,
n_state
,
config
,
**
kwargs
):
super
(
TFMLP
,
self
).
__init__
(
**
kwargs
)
super
(
TFMLP
,
self
).
__init__
(
**
kwargs
)
nx
=
config
.
n_embd
nx
=
config
.
n_embd
self
.
c_fc
=
TFConv1D
(
n_state
,
nx
,
name
=
'c_fc'
)
self
.
c_fc
=
TFConv1D
(
n_state
,
nx
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_fc'
)
self
.
c_proj
=
TFConv1D
(
nx
,
n_state
,
name
=
'c_proj'
)
self
.
c_proj
=
TFConv1D
(
nx
,
n_state
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_proj'
)
self
.
act
=
gelu
self
.
act
=
gelu
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
...
@@ -212,8 +212,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
...
@@ -212,8 +212,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
n_embd
=
config
.
n_embd
self
.
n_embd
=
config
.
n_embd
self
.
wte
=
TFSharedEmbeddings
(
config
.
vocab_size
,
config
.
hidden_size
,
name
=
'wte'
)
self
.
wte
=
TFSharedEmbeddings
(
config
.
vocab_size
,
self
.
wpe
=
tf
.
keras
.
layers
.
Embedding
(
config
.
n_positions
,
config
.
n_embd
,
name
=
'wpe'
)
config
.
hidden_size
,
initializer_range
=
config
.
initializer_range
,
name
=
'wte'
)
self
.
wpe
=
tf
.
keras
.
layers
.
Embedding
(
config
.
n_positions
,
config
.
n_embd
,
embeddings_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'wpe'
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
config
.
embd_pdrop
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
config
.
embd_pdrop
)
self
.
h
=
[
TFBlock
(
config
.
n_ctx
,
self
.
h
=
[
TFBlock
(
config
.
n_ctx
,
config
,
config
,
...
@@ -557,7 +563,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
...
@@ -557,7 +563,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFGPT2DoubleHeadsModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
(
TFGPT2DoubleHeadsModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFGPT2MainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TFGPT2MainLayer
(
config
,
name
=
'transformer'
)
self
.
multiple_choice_head
=
TFSequenceSummary
(
config
,
name
=
'multiple_choice_head'
)
self
.
multiple_choice_head
=
TFSequenceSummary
(
config
,
initializer_range
=
config
.
initializer_range
,
name
=
'multiple_choice_head'
)
def
call
(
self
,
inputs
,
past
=
None
,
attention_mask
=
None
,
token_type_ids
=
None
,
position_ids
=
None
,
head_mask
=
None
,
mc_token_ids
=
None
,
training
=
False
):
def
call
(
self
,
inputs
,
past
=
None
,
attention_mask
=
None
,
token_type_ids
=
None
,
position_ids
=
None
,
head_mask
=
None
,
mc_token_ids
=
None
,
training
=
False
):
if
isinstance
(
inputs
,
(
tuple
,
list
)):
if
isinstance
(
inputs
,
(
tuple
,
list
)):
...
...
pytorch_transformers/modeling_tf_openai.py
View file @
57053334
...
@@ -29,7 +29,7 @@ import numpy as np
...
@@ -29,7 +29,7 @@ import numpy as np
import
tensorflow
as
tf
import
tensorflow
as
tf
from
.modeling_tf_utils
import
(
TFPreTrainedModel
,
TFConv1D
,
TFSharedEmbeddings
,
from
.modeling_tf_utils
import
(
TFPreTrainedModel
,
TFConv1D
,
TFSharedEmbeddings
,
TFSequenceSummary
,
shape_list
)
TFSequenceSummary
,
shape_list
,
get_initializer
)
from
.configuration_openai
import
OpenAIGPTConfig
from
.configuration_openai
import
OpenAIGPTConfig
from
.file_utils
import
add_start_docstrings
from
.file_utils
import
add_start_docstrings
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
...
@@ -83,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer):
...
@@ -83,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer):
self
.
split_size
=
n_state
self
.
split_size
=
n_state
self
.
scale
=
scale
self
.
scale
=
scale
self
.
c_attn
=
TFConv1D
(
n_state
*
3
,
nx
,
name
=
'c_attn'
)
self
.
c_attn
=
TFConv1D
(
n_state
*
3
,
nx
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_attn'
)
self
.
c_proj
=
TFConv1D
(
n_state
,
nx
,
name
=
'c_proj'
)
self
.
c_proj
=
TFConv1D
(
n_state
,
nx
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_proj'
)
self
.
attn_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attn_pdrop
)
self
.
attn_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attn_pdrop
)
self
.
resid_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
self
.
resid_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
self
.
pruned_heads
=
set
()
self
.
pruned_heads
=
set
()
...
@@ -168,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer):
...
@@ -168,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer):
def
__init__
(
self
,
n_state
,
config
,
**
kwargs
):
def
__init__
(
self
,
n_state
,
config
,
**
kwargs
):
super
(
TFMLP
,
self
).
__init__
(
**
kwargs
)
super
(
TFMLP
,
self
).
__init__
(
**
kwargs
)
nx
=
config
.
n_embd
nx
=
config
.
n_embd
self
.
c_fc
=
TFConv1D
(
n_state
,
nx
,
name
=
'c_fc'
)
self
.
c_fc
=
TFConv1D
(
n_state
,
nx
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_fc'
)
self
.
c_proj
=
TFConv1D
(
nx
,
n_state
,
name
=
'c_proj'
)
self
.
c_proj
=
TFConv1D
(
nx
,
n_state
,
initializer_range
=
config
.
initializer_range
,
name
=
'c_proj'
)
self
.
act
=
gelu
self
.
act
=
gelu
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
resid_pdrop
)
...
@@ -212,8 +212,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
...
@@ -212,8 +212,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
n_embd
=
config
.
n_embd
self
.
n_embd
=
config
.
n_embd
self
.
tokens_embed
=
TFSharedEmbeddings
(
config
.
vocab_size
,
config
.
n_embd
,
name
=
'tokens_embed'
)
self
.
tokens_embed
=
TFSharedEmbeddings
(
config
.
vocab_size
,
self
.
positions_embed
=
tf
.
keras
.
layers
.
Embedding
(
config
.
n_positions
,
config
.
n_embd
,
name
=
'positions_embed'
)
config
.
n_embd
,
initializer_range
=
config
.
initializer_range
,
name
=
'tokens_embed'
)
self
.
positions_embed
=
tf
.
keras
.
layers
.
Embedding
(
config
.
n_positions
,
config
.
n_embd
,
embeddings_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'positions_embed'
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
config
.
embd_pdrop
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
config
.
embd_pdrop
)
self
.
h
=
[
TFBlock
(
config
.
n_ctx
,
self
.
h
=
[
TFBlock
(
config
.
n_ctx
,
config
,
config
,
...
@@ -522,7 +528,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
...
@@ -522,7 +528,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFOpenAIGPTDoubleHeadsModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
(
TFOpenAIGPTDoubleHeadsModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFOpenAIGPTMainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TFOpenAIGPTMainLayer
(
config
,
name
=
'transformer'
)
self
.
multiple_choice_head
=
TFSequenceSummary
(
config
,
name
=
'multiple_choice_head'
)
self
.
multiple_choice_head
=
TFSequenceSummary
(
config
,
initializer_range
=
config
.
initializer_range
,
name
=
'multiple_choice_head'
)
def
call
(
self
,
inputs
,
attention_mask
=
None
,
token_type_ids
=
None
,
position_ids
=
None
,
head_mask
=
None
,
mc_token_ids
=
None
,
training
=
False
):
def
call
(
self
,
inputs
,
attention_mask
=
None
,
token_type_ids
=
None
,
position_ids
=
None
,
head_mask
=
None
,
mc_token_ids
=
None
,
training
=
False
):
if
isinstance
(
inputs
,
(
tuple
,
list
)):
if
isinstance
(
inputs
,
(
tuple
,
list
)):
...
...
pytorch_transformers/modeling_tf_roberta.py
View file @
57053334
...
@@ -24,7 +24,7 @@ import numpy as np
...
@@ -24,7 +24,7 @@ import numpy as np
import
tensorflow
as
tf
import
tensorflow
as
tf
from
.configuration_roberta
import
RobertaConfig
from
.configuration_roberta
import
RobertaConfig
from
.modeling_tf_utils
import
TFPreTrainedModel
from
.modeling_tf_utils
import
TFPreTrainedModel
,
get_initializer
from
.file_utils
import
add_start_docstrings
from
.file_utils
import
add_start_docstrings
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
...
@@ -232,7 +232,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
...
@@ -232,7 +232,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
def
__init__
(
self
,
config
,
input_embeddings
,
**
kwargs
):
def
__init__
(
self
,
config
,
input_embeddings
,
**
kwargs
):
super
(
TFRobertaLMHead
,
self
).
__init__
(
**
kwargs
)
super
(
TFRobertaLMHead
,
self
).
__init__
(
**
kwargs
)
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
dense
=
tf
.
keras
.
layers
.
Dense
(
config
.
hidden_size
,
name
=
'dense'
)
self
.
dense
=
tf
.
keras
.
layers
.
Dense
(
config
.
hidden_size
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'dense'
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm'
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm'
)
self
.
act
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
self
.
act
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
...
@@ -315,9 +317,14 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
...
@@ -315,9 +317,14 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFRobertaClassificationHead
,
self
).
__init__
(
config
,
**
kwargs
)
super
(
TFRobertaClassificationHead
,
self
).
__init__
(
config
,
**
kwargs
)
self
.
dense
=
tf
.
keras
.
layers
.
Dense
(
config
.
hidden_size
,
activation
=
'tanh'
,
name
=
"dense"
)
self
.
dense
=
tf
.
keras
.
layers
.
Dense
(
config
.
hidden_size
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
activation
=
'tanh'
,
name
=
"dense"
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
hidden_dropout_prob
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
hidden_dropout_prob
)
self
.
out_proj
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
name
=
"out_proj"
)
self
.
out_proj
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"out_proj"
)
def
call
(
self
,
features
,
training
=
False
):
def
call
(
self
,
features
,
training
=
False
):
x
=
features
[:,
0
,
:]
# take <s> token (equiv. to [CLS])
x
=
features
[:,
0
,
:]
# take <s> token (equiv. to [CLS])
...
...
pytorch_transformers/modeling_tf_transfo_xl.py
View file @
57053334
...
@@ -30,7 +30,7 @@ import numpy as np
...
@@ -30,7 +30,7 @@ import numpy as np
import
tensorflow
as
tf
import
tensorflow
as
tf
from
.configuration_transfo_xl
import
TransfoXLConfig
from
.configuration_transfo_xl
import
TransfoXLConfig
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFConv1D
,
TFSequenceSummary
,
shape_list
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFConv1D
,
TFSequenceSummary
,
shape_list
,
get_initializer
from
.modeling_tf_transfo_xl_utilities
import
TFAdaptiveSoftmaxMask
from
.modeling_tf_transfo_xl_utilities
import
TFAdaptiveSoftmaxMask
from
.file_utils
import
add_start_docstrings
from
.file_utils
import
add_start_docstrings
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
...
@@ -66,16 +66,21 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
...
@@ -66,16 +66,21 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
class
TFPositionwiseFF
(
tf
.
keras
.
layers
.
Layer
):
class
TFPositionwiseFF
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
d_model
,
d_inner
,
dropout
,
pre_lnorm
=
False
,
layer_norm_epsilon
=
1e-5
,
**
kwargs
):
def
__init__
(
self
,
d_model
,
d_inner
,
dropout
,
pre_lnorm
=
False
,
layer_norm_epsilon
=
1e-5
,
init_std
=
0.02
,
**
kwargs
):
super
(
TFPositionwiseFF
,
self
).
__init__
(
**
kwargs
)
super
(
TFPositionwiseFF
,
self
).
__init__
(
**
kwargs
)
self
.
d_model
=
d_model
self
.
d_model
=
d_model
self
.
d_inner
=
d_inner
self
.
d_inner
=
d_inner
self
.
dropout
=
dropout
self
.
dropout
=
dropout
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
d_inner
,
activation
=
tf
.
nn
.
relu
,
name
=
'CoreNet_._0'
)
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
d_inner
,
kernel_initializer
=
get_initializer
(
init_std
),
activation
=
tf
.
nn
.
relu
,
name
=
'CoreNet_._0'
)
self
.
drop_1
=
tf
.
keras
.
layers
.
Dropout
(
dropout
)
self
.
drop_1
=
tf
.
keras
.
layers
.
Dropout
(
dropout
)
self
.
layer_2
=
tf
.
keras
.
layers
.
Dense
(
d_model
,
name
=
'CoreNet_._3'
)
self
.
layer_2
=
tf
.
keras
.
layers
.
Dense
(
d_model
,
kernel_initializer
=
get_initializer
(
init_std
),
name
=
'CoreNet_._3'
)
self
.
drop_2
=
tf
.
keras
.
layers
.
Dropout
(
dropout
)
self
.
drop_2
=
tf
.
keras
.
layers
.
Dropout
(
dropout
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
layer_norm_epsilon
,
name
=
'layer_norm'
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
layer_norm_epsilon
,
name
=
'layer_norm'
)
...
@@ -110,7 +115,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
...
@@ -110,7 +115,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
def
__init__
(
self
,
n_head
,
d_model
,
d_head
,
dropout
,
dropatt
=
0
,
def
__init__
(
self
,
n_head
,
d_model
,
d_head
,
dropout
,
dropatt
=
0
,
tgt_len
=
None
,
ext_len
=
None
,
mem_len
=
None
,
pre_lnorm
=
False
,
tgt_len
=
None
,
ext_len
=
None
,
mem_len
=
None
,
pre_lnorm
=
False
,
r_r_bias
=
None
,
r_w_bias
=
None
,
output_attentions
=
False
,
r_r_bias
=
None
,
r_w_bias
=
None
,
output_attentions
=
False
,
layer_norm_epsilon
=
1e-5
,
**
kwargs
):
layer_norm_epsilon
=
1e-5
,
init_std
=
0.02
,
**
kwargs
):
super
(
TFRelPartialLearnableMultiHeadAttn
,
self
).
__init__
(
**
kwargs
)
super
(
TFRelPartialLearnableMultiHeadAttn
,
self
).
__init__
(
**
kwargs
)
self
.
output_attentions
=
output_attentions
self
.
output_attentions
=
output_attentions
...
@@ -119,11 +124,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
...
@@ -119,11 +124,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self
.
d_head
=
d_head
self
.
d_head
=
d_head
self
.
dropout
=
dropout
self
.
dropout
=
dropout
self
.
qkv_net
=
tf
.
keras
.
layers
.
Dense
(
3
*
n_head
*
d_head
,
use_bias
=
False
,
name
=
'qkv_net'
)
self
.
qkv_net
=
tf
.
keras
.
layers
.
Dense
(
3
*
n_head
*
d_head
,
kernel_initializer
=
get_initializer
(
init_std
),
use_bias
=
False
,
name
=
'qkv_net'
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
dropout
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
dropout
)
self
.
dropatt
=
tf
.
keras
.
layers
.
Dropout
(
dropatt
)
self
.
dropatt
=
tf
.
keras
.
layers
.
Dropout
(
dropatt
)
self
.
o_net
=
tf
.
keras
.
layers
.
Dense
(
d_model
,
use_bias
=
False
,
name
=
'o_net'
)
self
.
o_net
=
tf
.
keras
.
layers
.
Dense
(
d_model
,
kernel_initializer
=
get_initializer
(
init_std
),
use_bias
=
False
,
name
=
'o_net'
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
layer_norm_epsilon
,
name
=
'layer_norm'
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
layer_norm_epsilon
,
name
=
'layer_norm'
)
...
@@ -138,14 +149,19 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
...
@@ -138,14 +149,19 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self
.
r_r_bias
=
None
self
.
r_r_bias
=
None
self
.
r_w_bias
=
None
self
.
r_w_bias
=
None
self
.
r_net
=
tf
.
keras
.
layers
.
Dense
(
self
.
n_head
*
self
.
d_head
,
use_bias
=
False
,
name
=
'r_net'
)
self
.
r_net
=
tf
.
keras
.
layers
.
Dense
(
self
.
n_head
*
self
.
d_head
,
kernel_initializer
=
get_initializer
(
init_std
),
use_bias
=
False
,
name
=
'r_net'
)
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
if
self
.
r_r_bias
is
None
or
self
.
r_w_bias
is
None
:
# Biases are not shared
if
self
.
r_r_bias
is
None
or
self
.
r_w_bias
is
None
:
# Biases are not shared
self
.
r_r_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
self
.
r_r_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
initializer
=
'zeros'
,
trainable
=
True
,
trainable
=
True
,
name
=
'r_r_bias'
)
name
=
'r_r_bias'
)
self
.
r_w_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
self
.
r_w_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
initializer
=
'zeros'
,
trainable
=
True
,
trainable
=
True
,
name
=
'r_w_bias'
)
name
=
'r_w_bias'
)
super
(
TFRelPartialLearnableMultiHeadAttn
,
self
).
build
(
input_shape
)
super
(
TFRelPartialLearnableMultiHeadAttn
,
self
).
build
(
input_shape
)
...
@@ -249,17 +265,18 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
...
@@ -249,17 +265,18 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
r_r_bias
=
None
,
r_r_bias
=
None
,
output_attentions
=
False
,
output_attentions
=
False
,
layer_norm_epsilon
=
1e-5
,
layer_norm_epsilon
=
1e-5
,
init_std
=
0.02
,
**
kwargs
):
**
kwargs
):
super
(
TFRelPartialLearnableDecoderLayer
,
self
).
__init__
(
**
kwargs
)
super
(
TFRelPartialLearnableDecoderLayer
,
self
).
__init__
(
**
kwargs
)
self
.
dec_attn
=
TFRelPartialLearnableMultiHeadAttn
(
n_head
,
d_model
,
self
.
dec_attn
=
TFRelPartialLearnableMultiHeadAttn
(
n_head
,
d_model
,
d_head
,
dropout
,
tgt_len
=
tgt_len
,
ext_len
=
ext_len
,
d_head
,
dropout
,
tgt_len
=
tgt_len
,
ext_len
=
ext_len
,
mem_len
=
mem_len
,
dropatt
=
dropatt
,
pre_lnorm
=
pre_lnorm
,
mem_len
=
mem_len
,
dropatt
=
dropatt
,
pre_lnorm
=
pre_lnorm
,
r_w_bias
=
r_w_bias
,
r_r_bias
=
r_r_bias
,
r_w_bias
=
r_w_bias
,
r_r_bias
=
r_r_bias
,
init_std
=
init_std
,
output_attentions
=
output_attentions
,
output_attentions
=
output_attentions
,
layer_norm_epsilon
=
layer_norm_epsilon
,
name
=
'dec_attn'
)
layer_norm_epsilon
=
layer_norm_epsilon
,
name
=
'dec_attn'
)
self
.
pos_ff
=
TFPositionwiseFF
(
d_model
,
d_inner
,
dropout
,
self
.
pos_ff
=
TFPositionwiseFF
(
d_model
,
d_inner
,
dropout
,
pre_lnorm
=
pre_lnorm
,
pre_lnorm
=
pre_lnorm
,
init_std
=
init_std
,
layer_norm_epsilon
=
layer_norm_epsilon
,
layer_norm_epsilon
=
layer_norm_epsilon
,
name
=
'pos_ff'
)
name
=
'pos_ff'
)
...
@@ -275,12 +292,13 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
...
@@ -275,12 +292,13 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
class
TFAdaptiveEmbedding
(
tf
.
keras
.
layers
.
Layer
):
class
TFAdaptiveEmbedding
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
n_token
,
d_embed
,
d_proj
,
cutoffs
,
div_val
=
1
,
def
__init__
(
self
,
n_token
,
d_embed
,
d_proj
,
cutoffs
,
div_val
=
1
,
init_std
=
0.02
,
sample_softmax
=
False
,
**
kwargs
):
sample_softmax
=
False
,
**
kwargs
):
super
(
TFAdaptiveEmbedding
,
self
).
__init__
(
**
kwargs
)
super
(
TFAdaptiveEmbedding
,
self
).
__init__
(
**
kwargs
)
self
.
n_token
=
n_token
self
.
n_token
=
n_token
self
.
d_embed
=
d_embed
self
.
d_embed
=
d_embed
self
.
init_std
=
init_std
self
.
cutoffs
=
cutoffs
+
[
n_token
]
self
.
cutoffs
=
cutoffs
+
[
n_token
]
self
.
div_val
=
div_val
self
.
div_val
=
div_val
...
@@ -298,12 +316,16 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
...
@@ -298,12 +316,16 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
for
i
in
range
(
len
(
self
.
cutoffs
)):
for
i
in
range
(
len
(
self
.
cutoffs
)):
l_idx
,
r_idx
=
self
.
cutoff_ends
[
i
],
self
.
cutoff_ends
[
i
+
1
]
l_idx
,
r_idx
=
self
.
cutoff_ends
[
i
],
self
.
cutoff_ends
[
i
+
1
]
d_emb_i
=
d_embed
//
(
div_val
**
i
)
d_emb_i
=
d_embed
//
(
div_val
**
i
)
self
.
emb_layers
.
append
(
tf
.
keras
.
layers
.
Embedding
(
r_idx
-
l_idx
,
d_emb_i
,
name
=
'emb_layers_._{}'
.
format
(
i
)))
self
.
emb_layers
.
append
(
tf
.
keras
.
layers
.
Embedding
(
r_idx
-
l_idx
,
d_emb_i
,
embeddings_initializer
=
get_initializer
(
init_std
),
name
=
'emb_layers_._{}'
.
format
(
i
)))
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
for
i
in
range
(
len
(
self
.
cutoffs
)):
for
i
in
range
(
len
(
self
.
cutoffs
)):
d_emb_i
=
self
.
d_embed
//
(
self
.
div_val
**
i
)
d_emb_i
=
self
.
d_embed
//
(
self
.
div_val
**
i
)
self
.
emb_projs
.
append
(
self
.
add_weight
(
shape
=
(
d_emb_i
,
self
.
d_proj
),
self
.
emb_projs
.
append
(
self
.
add_weight
(
shape
=
(
d_emb_i
,
self
.
d_proj
),
initializer
=
get_initializer
(
self
.
init_std
),
trainable
=
True
,
trainable
=
True
,
name
=
'emb_projs_._{}'
.
format
(
i
)))
name
=
'emb_projs_._{}'
.
format
(
i
)))
super
(
TFAdaptiveEmbedding
,
self
).
build
(
input_shape
)
super
(
TFAdaptiveEmbedding
,
self
).
build
(
input_shape
)
...
@@ -349,7 +371,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
...
@@ -349,7 +371,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
self
.
untie_r
=
config
.
untie_r
self
.
untie_r
=
config
.
untie_r
self
.
word_emb
=
TFAdaptiveEmbedding
(
config
.
n_token
,
config
.
d_embed
,
config
.
d_model
,
config
.
cutoffs
,
self
.
word_emb
=
TFAdaptiveEmbedding
(
config
.
n_token
,
config
.
d_embed
,
config
.
d_model
,
config
.
cutoffs
,
div_val
=
config
.
div_val
,
name
=
'word_emb'
)
div_val
=
config
.
div_val
,
init_std
=
config
.
init_std
,
name
=
'word_emb'
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
drop
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
...
@@ -374,6 +396,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
...
@@ -374,6 +396,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
r_r_bias
=
None
if
self
.
untie_r
else
self
.
r_r_bias
,
r_r_bias
=
None
if
self
.
untie_r
else
self
.
r_r_bias
,
output_attentions
=
self
.
output_attentions
,
output_attentions
=
self
.
output_attentions
,
layer_norm_epsilon
=
config
.
layer_norm_epsilon
,
layer_norm_epsilon
=
config
.
layer_norm_epsilon
,
init_std
=
config
.
init_std
,
name
=
'layers_._{}'
.
format
(
i
))
name
=
'layers_._{}'
.
format
(
i
))
)
)
else
:
# learnable embeddings and absolute embeddings
else
:
# learnable embeddings and absolute embeddings
...
...
pytorch_transformers/modeling_tf_utils.py
View file @
57053334
...
@@ -277,20 +277,20 @@ class TFPreTrainedModel(tf.keras.Model):
...
@@ -277,20 +277,20 @@ class TFPreTrainedModel(tf.keras.Model):
return
model
return
model
class
TFConv1D
(
tf
.
keras
.
layers
.
Layer
):
class
TFConv1D
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
nf
,
nx
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
nf
,
nx
,
*
inputs
,
initializer_range
=
0.02
,
**
kwargs
):
""" TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
""" TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
Basically works like a Linear layer but the weights are transposed
Basically works like a Linear layer but the weights are transposed
"""
"""
super
(
TFConv1D
,
self
).
__init__
(
*
inputs
,
**
kwargs
)
super
(
TFConv1D
,
self
).
__init__
(
*
inputs
,
**
kwargs
)
self
.
nf
=
nf
self
.
nf
=
nf
self
.
nx
=
nx
self
.
nx
=
nx
self
.
initializer_range
=
initializer_range
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
self
.
weight
=
self
.
add_weight
(
self
.
weight
=
self
.
add_weight
(
"weight"
,
"weight"
,
shape
=
[
self
.
nx
,
self
.
nf
],
shape
=
[
self
.
nx
,
self
.
nf
],
initializer
=
tf
.
random_normal_initializer
(
initializer
=
get_initializer
(
self
.
initializer_range
))
mean
=
0.
,
stddev
=
0.02
))
self
.
bias
=
self
.
add_weight
(
self
.
bias
=
self
.
add_weight
(
"bias"
,
"bias"
,
shape
=
[
1
,
self
.
nf
],
shape
=
[
1
,
self
.
nf
],
...
@@ -314,19 +314,17 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
...
@@ -314,19 +314,17 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
super
(
TFSharedEmbeddings
,
self
).
__init__
(
**
kwargs
)
super
(
TFSharedEmbeddings
,
self
).
__init__
(
**
kwargs
)
self
.
vocab_size
=
vocab_size
self
.
vocab_size
=
vocab_size
self
.
hidden_size
=
hidden_size
self
.
hidden_size
=
hidden_size
self
.
initializer_range
=
initializer_range
self
.
initializer_range
=
hidden_size
**-
0.5
if
initializer_range
is
None
else
initializer_range
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
"""Build shared word embedding layer
"""Build shared word embedding layer
Shared weights logic adapted from
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
"""
initializer_range
=
self
.
hidden_size
**-
0.5
if
self
.
initializer_range
is
None
else
self
.
initializer_range
self
.
weight
=
self
.
add_weight
(
self
.
weight
=
self
.
add_weight
(
"weight"
,
"weight"
,
shape
=
[
self
.
vocab_size
,
self
.
hidden_size
],
shape
=
[
self
.
vocab_size
,
self
.
hidden_size
],
initializer
=
tf
.
random_normal_initializer
(
initializer
=
get_initializer
(
self
.
initializer_range
))
mean
=
0.
,
stddev
=
initializer_range
))
super
(
TFSharedEmbeddings
,
self
).
build
(
input_shape
)
super
(
TFSharedEmbeddings
,
self
).
build
(
input_shape
)
def
call
(
self
,
inputs
,
mode
=
"embedding"
):
def
call
(
self
,
inputs
,
mode
=
"embedding"
):
...
@@ -385,7 +383,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
...
@@ -385,7 +383,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
summary_first_dropout: Add a dropout before the projection and activation
summary_first_dropout: Add a dropout before the projection and activation
summary_last_dropout: Add a dropout after the projection and activation
summary_last_dropout: Add a dropout after the projection and activation
"""
"""
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
initializer_range
=
0.02
,
**
kwargs
):
super
(
TFSequenceSummary
,
self
).
__init__
(
**
kwargs
)
super
(
TFSequenceSummary
,
self
).
__init__
(
**
kwargs
)
self
.
summary_type
=
config
.
summary_type
if
hasattr
(
config
,
'summary_use_proj'
)
else
'last'
self
.
summary_type
=
config
.
summary_type
if
hasattr
(
config
,
'summary_use_proj'
)
else
'last'
...
@@ -401,7 +399,9 @@ class TFSequenceSummary(tf.keras.layers.Layer):
...
@@ -401,7 +399,9 @@ class TFSequenceSummary(tf.keras.layers.Layer):
num_classes
=
config
.
num_labels
num_classes
=
config
.
num_labels
else
:
else
:
num_classes
=
config
.
hidden_size
num_classes
=
config
.
hidden_size
self
.
summary
=
tf
.
keras
.
layers
.
Dense
(
num_classes
,
name
=
'summary'
)
self
.
summary
=
tf
.
keras
.
layers
.
Dense
(
num_classes
,
kernel_initializer
=
get_initializer
(
initializer_range
),
name
=
'summary'
)
self
.
activation
=
None
self
.
activation
=
None
if
hasattr
(
config
,
'summary_activation'
)
and
config
.
summary_activation
==
'tanh'
:
if
hasattr
(
config
,
'summary_activation'
)
and
config
.
summary_activation
==
'tanh'
:
...
...
pytorch_transformers/modeling_tf_xlm.py
View file @
57053334
...
@@ -25,7 +25,7 @@ import numpy as np
...
@@ -25,7 +25,7 @@ import numpy as np
import
tensorflow
as
tf
import
tensorflow
as
tf
from
.configuration_xlm
import
XLMConfig
from
.configuration_xlm
import
XLMConfig
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFSharedEmbeddings
,
TFSequenceSummary
,
shape_list
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFSharedEmbeddings
,
TFSequenceSummary
,
shape_list
,
get_initializer
from
.file_utils
import
add_start_docstrings
from
.file_utils
import
add_start_docstrings
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
...
@@ -119,10 +119,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
...
@@ -119,10 +119,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
self
.
n_heads
=
n_heads
self
.
n_heads
=
n_heads
assert
self
.
dim
%
self
.
n_heads
==
0
assert
self
.
dim
%
self
.
n_heads
==
0
self
.
q_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
name
=
'q_lin'
)
self
.
q_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
kernel_initializer
=
get_initializer
(
config
.
init_std
),
name
=
'q_lin'
)
self
.
k_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
name
=
'k_lin'
)
self
.
k_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
kernel_initializer
=
get_initializer
(
config
.
init_std
),
name
=
'k_lin'
)
self
.
v_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
name
=
'v_lin'
)
self
.
v_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
kernel_initializer
=
get_initializer
(
config
.
init_std
),
name
=
'v_lin'
)
self
.
out_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
name
=
'out_lin'
)
self
.
out_lin
=
tf
.
keras
.
layers
.
Dense
(
dim
,
kernel_initializer
=
get_initializer
(
config
.
init_std
),
name
=
'out_lin'
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attention_dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attention_dropout
)
self
.
pruned_heads
=
set
()
self
.
pruned_heads
=
set
()
...
@@ -199,8 +199,8 @@ class TFTransformerFFN(tf.keras.layers.Layer):
...
@@ -199,8 +199,8 @@ class TFTransformerFFN(tf.keras.layers.Layer):
def
__init__
(
self
,
in_dim
,
dim_hidden
,
out_dim
,
config
,
**
kwargs
):
def
__init__
(
self
,
in_dim
,
dim_hidden
,
out_dim
,
config
,
**
kwargs
):
super
(
TFTransformerFFN
,
self
).
__init__
(
**
kwargs
)
super
(
TFTransformerFFN
,
self
).
__init__
(
**
kwargs
)
self
.
lin1
=
tf
.
keras
.
layers
.
Dense
(
dim_hidden
,
name
=
'lin1'
)
self
.
lin1
=
tf
.
keras
.
layers
.
Dense
(
dim_hidden
,
kernel_initializer
=
get_initializer
(
config
.
init_std
),
name
=
'lin1'
)
self
.
lin2
=
tf
.
keras
.
layers
.
Dense
(
out_dim
,
name
=
'lin2'
)
self
.
lin2
=
tf
.
keras
.
layers
.
Dense
(
out_dim
,
kernel_initializer
=
get_initializer
(
config
.
init_std
),
name
=
'lin2'
)
self
.
act
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
if
config
.
gelu_activation
else
tf
.
keras
.
activations
.
relu
self
.
act
=
tf
.
keras
.
layers
.
Activation
(
gelu
)
if
config
.
gelu_activation
else
tf
.
keras
.
activations
.
relu
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
...
@@ -249,13 +249,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
...
@@ -249,13 +249,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
attention_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attention_dropout
)
self
.
attention_dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
attention_dropout
)
self
.
position_embeddings
=
tf
.
keras
.
layers
.
Embedding
(
config
.
max_position_embeddings
,
self
.
dim
,
name
=
'position_embeddings'
)
self
.
position_embeddings
=
tf
.
keras
.
layers
.
Embedding
(
config
.
max_position_embeddings
,
self
.
dim
,
embeddings_initializer
=
get_initializer
(
config
.
embed_init_std
),
name
=
'position_embeddings'
)
if
config
.
sinusoidal_embeddings
:
if
config
.
sinusoidal_embeddings
:
raise
NotImplementedError
raise
NotImplementedError
# create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
# create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
if
config
.
n_langs
>
1
and
config
.
use_lang_emb
:
if
config
.
n_langs
>
1
and
config
.
use_lang_emb
:
self
.
lang_embeddings
=
tf
.
keras
.
layers
.
Embedding
(
self
.
n_langs
,
self
.
dim
,
name
=
'lang_embeddings'
)
self
.
lang_embeddings
=
tf
.
keras
.
layers
.
Embedding
(
self
.
n_langs
,
self
.
embeddings
=
TFSharedEmbeddings
(
self
.
n_words
,
self
.
dim
,
name
=
'embeddings'
)
# padding_idx=self.pad_index)
self
.
dim
,
embeddings_initializer
=
get_initializer
(
config
.
embed_init_std
),
name
=
'lang_embeddings'
)
self
.
embeddings
=
TFSharedEmbeddings
(
self
.
n_words
,
self
.
dim
,
initializer_range
=
config
.
embed_init_std
,
name
=
'embeddings'
)
# padding_idx=self.pad_index)
self
.
layer_norm_emb
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm_emb'
)
self
.
layer_norm_emb
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm_emb'
)
# transformer layers
# transformer layers
...
@@ -676,7 +682,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
...
@@ -676,7 +682,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
self
.
num_labels
=
config
.
num_labels
self
.
num_labels
=
config
.
num_labels
self
.
transformer
=
TFXLMMainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TFXLMMainLayer
(
config
,
name
=
'transformer'
)
self
.
sequence_summary
=
TFSequenceSummary
(
config
,
name
=
'sequence_summary'
)
self
.
sequence_summary
=
TFSequenceSummary
(
config
,
initializer_range
=
config
.
init_std
,
name
=
'sequence_summary'
)
def
call
(
self
,
inputs
,
**
kwargs
):
def
call
(
self
,
inputs
,
**
kwargs
):
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
...
@@ -721,7 +727,9 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
...
@@ -721,7 +727,9 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLMForQuestionAnsweringSimple
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
(
TFXLMForQuestionAnsweringSimple
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFXLMMainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TFXLMMainLayer
(
config
,
name
=
'transformer'
)
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
name
=
'qa_outputs'
)
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
init_std
),
name
=
'qa_outputs'
)
def
call
(
self
,
inputs
,
**
kwargs
):
def
call
(
self
,
inputs
,
**
kwargs
):
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
...
...
pytorch_transformers/modeling_tf_xlnet.py
View file @
57053334
...
@@ -28,7 +28,7 @@ import numpy as np
...
@@ -28,7 +28,7 @@ import numpy as np
import
tensorflow
as
tf
import
tensorflow
as
tf
from
.configuration_xlnet
import
XLNetConfig
from
.configuration_xlnet
import
XLNetConfig
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFSharedEmbeddings
,
TFSequenceSummary
,
shape_list
from
.modeling_tf_utils
import
TFPreTrainedModel
,
TFSharedEmbeddings
,
TFSequenceSummary
,
shape_list
,
get_initializer
from
.file_utils
import
add_start_docstrings
from
.file_utils
import
add_start_docstrings
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
from
.modeling_tf_pytorch_utils
import
load_pytorch_checkpoint_in_tf2_model
...
@@ -87,7 +87,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -87,7 +87,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
initializer
=
get_initializer
(
self
.
initializer_range
)
self
.
q
=
self
.
add_weight
(
shape
=
(
self
.
d_model
,
self
.
n_head
,
self
.
d_head
),
self
.
q
=
self
.
add_weight
(
shape
=
(
self
.
d_model
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
initializer
=
initializer
,
trainable
=
True
,
name
=
'q'
)
trainable
=
True
,
name
=
'q'
)
...
@@ -104,13 +104,13 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -104,13 +104,13 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
initializer
=
initializer
,
initializer
=
initializer
,
trainable
=
True
,
name
=
'r'
)
trainable
=
True
,
name
=
'r'
)
self
.
r_r_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
self
.
r_r_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
initializer
=
'zeros'
,
trainable
=
True
,
name
=
'r_r_bias'
)
trainable
=
True
,
name
=
'r_r_bias'
)
self
.
r_s_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
self
.
r_s_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
initializer
=
'zeros'
,
trainable
=
True
,
name
=
'r_s_bias'
)
trainable
=
True
,
name
=
'r_s_bias'
)
self
.
r_w_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
self
.
r_w_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
initializer
=
'zeros'
,
trainable
=
True
,
name
=
'r_w_bias'
)
trainable
=
True
,
name
=
'r_w_bias'
)
self
.
seg_embed
=
self
.
add_weight
(
shape
=
(
2
,
self
.
n_head
,
self
.
d_head
),
self
.
seg_embed
=
self
.
add_weight
(
shape
=
(
2
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
initializer
=
initializer
,
...
@@ -294,8 +294,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
...
@@ -294,8 +294,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLNetFeedForward
,
self
).
__init__
(
**
kwargs
)
super
(
TFXLNetFeedForward
,
self
).
__init__
(
**
kwargs
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm'
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
'layer_norm'
)
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
config
.
d_inner
,
name
=
'layer_1'
)
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
config
.
d_inner
,
self
.
layer_2
=
tf
.
keras
.
layers
.
Dense
(
config
.
d_model
,
name
=
'layer_2'
)
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'layer_1'
)
self
.
layer_2
=
tf
.
keras
.
layers
.
Dense
(
config
.
d_model
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'layer_2'
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
if
isinstance
(
config
.
ff_activation
,
str
)
or
\
if
isinstance
(
config
.
ff_activation
,
str
)
or
\
(
sys
.
version_info
[
0
]
==
2
and
isinstance
(
config
.
ff_activation
,
unicode
)):
(
sys
.
version_info
[
0
]
==
2
and
isinstance
(
config
.
ff_activation
,
unicode
)):
...
@@ -375,7 +379,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
...
@@ -375,7 +379,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
def
build
(
self
,
input_shape
):
def
build
(
self
,
input_shape
):
initializer
=
tf
.
random_normal_initializer
(
mean
=
0.
,
stddev
=
self
.
initializer_range
)
initializer
=
get_initializer
(
self
.
initializer_range
)
self
.
mask_emb
=
self
.
add_weight
(
shape
=
(
1
,
1
,
self
.
d_model
),
self
.
mask_emb
=
self
.
add_weight
(
shape
=
(
1
,
1
,
self
.
d_model
),
initializer
=
initializer
,
initializer
=
initializer
,
trainable
=
True
,
name
=
'mask_emb'
)
trainable
=
True
,
name
=
'mask_emb'
)
...
@@ -900,8 +904,10 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
...
@@ -900,8 +904,10 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
self
.
num_labels
=
config
.
num_labels
self
.
num_labels
=
config
.
num_labels
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
'transformer'
)
self
.
sequence_summary
=
TFSequenceSummary
(
config
,
name
=
'sequence_summary'
)
self
.
sequence_summary
=
TFSequenceSummary
(
config
,
initializer_range
=
config
.
initializer_range
,
name
=
'sequence_summary'
)
self
.
logits_proj
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
name
=
'logits_proj'
)
self
.
logits_proj
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'logits_proj'
)
def
call
(
self
,
inputs
,
**
kwargs
):
def
call
(
self
,
inputs
,
**
kwargs
):
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
...
@@ -949,7 +955,9 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
...
@@ -949,7 +955,9 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLNetForQuestionAnsweringSimple
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
(
TFXLNetForQuestionAnsweringSimple
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
'transformer'
)
self
.
transformer
=
TFXLNetMainLayer
(
config
,
name
=
'transformer'
)
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
name
=
'qa_outputs'
)
self
.
qa_outputs
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
'qa_outputs'
)
def
call
(
self
,
inputs
,
**
kwargs
):
def
call
(
self
,
inputs
,
**
kwargs
):
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
transformer_outputs
=
self
.
transformer
(
inputs
,
**
kwargs
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment