chenpangpang / transformers · Commit 57053334

Commit 57053334 authored Sep 26, 2019 by thomwolf
Parent: f2a337b3

add initialization for everybody
Showing 9 changed files with 195 additions and 99 deletions:

examples/run_tf_glue.py                          +30  -19
pytorch_transformers/modeling_tf_distilbert.py   +42  -15
pytorch_transformers/modeling_tf_gpt2.py         +14   -8
pytorch_transformers/modeling_tf_openai.py       +14   -8
pytorch_transformers/modeling_tf_roberta.py      +11   -4
pytorch_transformers/modeling_tf_transfo_xl.py   +36  -13
pytorch_transformers/modeling_tf_utils.py         +9   -9
pytorch_transformers/modeling_tf_xlm.py          +20  -12
pytorch_transformers/modeling_tf_xlnet.py        +19  -11
examples/run_tf_glue.py  (+30 -19)

 import tensorflow as tf
 import tensorflow_datasets
-from transformers import *
+from pytorch_transformers import *

 # Load dataset, tokenizer, model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-dataset = tensorflow_datasets.load('glue/mrpc')
 model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+data = tensorflow_datasets.load('glue/mrpc')

 # Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(dataset['train'], tokenizer, task='mrpc')
-valid_dataset = glue_convert_examples_to_features(dataset['validation'], tokenizer, task='mrpc')
-train_dataset = train_dataset.shuffle(100).batch(32).repeat(3)
+train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
+valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
+train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
 valid_dataset = valid_dataset.batch(64)

 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(2e-5, 345, end_learning_rate=0)
+optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1.0)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-model.compile(optimizer=optimizer, loss=loss, metrics=['sparse_categorical_accuracy'])
+metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

 # Train and evaluate using tf.keras.Model.fit()
-model.fit(train_dataset, epochs=3, steps_per_epoch=115,
-          validation_data=valid_dataset, validation_steps=7)
+history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
+                    validation_data=valid_dataset, validation_steps=7)
+>>> Train for 115 steps, validate for 7 steps
+>>> Epoch 1/2
+>>> 115/115 [==============================] - 53s 459ms/step - loss: 0.6033 - accuracy: 0.6712 - val_loss: 0.4964 - val_accuracy: 0.7647
+>>> Epoch 2/2
+>>> 115/115 [==============================] - 33s 289ms/step - loss: 0.4141 - accuracy: 0.8160 - val_loss: 0.3914 - val_accuracy: 0.8382

-# Save the TensorFlow model and load it in PyTorch
+# Load the TensorFlow model in PyTorch for inspection
 model.save_pretrained('./save/')
 pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

-# Quickly inspect a few predictions - MRPC is a paraphrasing task
-inputs = tokenizer.encode_plus("The company is doing great",
-                               "The company has good results",
-                               add_special_tokens=True,
-                               return_tensors='pt')
-pred = pytorch_model(**inputs)
-print("Paraphrase" if pred.argmax().item() == 0 else "Not paraphrase")
+# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
+sentence_0 = "This research was consistent with his findings."
+sentence_1 = "His findings were compatible with this research."
+sentence_2 = "His findings were not compatible with this research."
+inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
+inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
+pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
+print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
+>>> sentence_1 is a paraphrase of sentence_0
+>>> sentence_2 is not a paraphrase of sentence_0
\ No newline at end of file
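The batching arithmetic behind the fit() arguments is easy to verify. A quick sanity check, assuming the usual GLUE/MRPC split sizes of 3,668 training and 408 validation examples (the split sizes are an assumption here, not something stated in the diff):

    import math

    # MRPC split sizes (assumed): 3,668 train / 408 validation examples.
    steps_per_epoch = math.ceil(3668 / 32)    # train is batched by 32   -> 115
    validation_steps = math.ceil(408 / 64)    # validation batched by 64 -> 7
    print(steps_per_epoch, validation_steps)  # 115 7

which matches the steps_per_epoch=115 and validation_steps=7 passed above.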
pytorch_transformers/modeling_tf_distilbert.py  (+42 -15)

@@ -29,7 +29,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_distilbert import DistilBertConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

@@ -79,8 +79,15 @@ class TFEmbeddings(tf.keras.layers.Layer):
         super(TFEmbeddings, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dim = config.dim
-        self.word_embeddings = TFSharedEmbeddings(config.vocab_size, config.dim, name='word_embeddings')  # padding_idx=0)
-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, config.dim, name='position_embeddings')
+        self.initializer_range = config.initializer_range
+        self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
+                                                  config.dim,
+                                                  initializer_range=config.initializer_range,
+                                                  name='word_embeddings')  # padding_idx=0)
+        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
+                                                             config.dim,
+                                                             embeddings_initializer=get_initializer(config.initializer_range),
+                                                             name='position_embeddings')
         if config.sinusoidal_pos_embds:
             raise NotImplementedError

@@ -95,8 +102,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
         self.word_embeddings = self.add_weight(
             "weight",
             shape=[self.vocab_size, self.dim],
-            initializer=tf.random_normal_initializer(
-                mean=0., stddev=self.dim**-0.5))
+            initializer=get_initializer(self.initializer_range))
         super(TFEmbeddings, self).build(input_shape)

     def call(self, inputs, mode="embedding", training=False):

@@ -178,10 +184,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
         assert self.dim % self.n_heads == 0

-        self.q_lin = tf.keras.layers.Dense(config.dim, name="q_lin")
-        self.k_lin = tf.keras.layers.Dense(config.dim, name="k_lin")
-        self.v_lin = tf.keras.layers.Dense(config.dim, name="v_lin")
-        self.out_lin = tf.keras.layers.Dense(config.dim, name="out_lin")
+        self.q_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="q_lin")
+        self.k_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="k_lin")
+        self.v_lin = tf.keras.layers.Dense(config.dim,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name="v_lin")
+        self.out_lin = tf.keras.layers.Dense(config.dim,
+                                             kernel_initializer=get_initializer(config.initializer_range),
+                                             name="out_lin")

         self.pruned_heads = set()

@@ -254,8 +268,12 @@ class TFFFN(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFFFN, self).__init__(**kwargs)
         self.dropout = tf.keras.layers.Dropout(config.dropout)
-        self.lin1 = tf.keras.layers.Dense(config.hidden_dim, name="lin1")
-        self.lin2 = tf.keras.layers.Dense(config.dim, name="lin2")
+        self.lin1 = tf.keras.layers.Dense(config.hidden_dim,
+                                          kernel_initializer=get_initializer(config.initializer_range),
+                                          name="lin1")
+        self.lin2 = tf.keras.layers.Dense(config.dim,
+                                          kernel_initializer=get_initializer(config.initializer_range),
+                                          name="lin2")
         assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
         self.activation = tf.keras.layers.Activation(gelu) if config.activation == 'gelu' else tf.keras.activations.relu

@@ -596,7 +614,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         self.vocab_size = config.vocab_size

         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.vocab_transform = tf.keras.layers.Dense(config.dim, name="vocab_transform")
+        self.vocab_transform = tf.keras.layers.Dense(config.dim,
+                                                     kernel_initializer=get_initializer(config.initializer_range),
+                                                     name="vocab_transform")
         self.act = tf.keras.layers.Activation(gelu)
         self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
         self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")

@@ -647,8 +667,13 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
         self.num_labels = config.num_labels

         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.pre_classifier = tf.keras.layers.Dense(config.dim, activation='relu', name="pre_classifier")
-        self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier")
+        self.pre_classifier = tf.keras.layers.Dense(config.dim,
+                                                    kernel_initializer=get_initializer(config.initializer_range),
+                                                    activation='relu',
+                                                    name="pre_classifier")
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name="classifier")
         self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)

     def call(self, inputs, **kwargs):

@@ -700,7 +725,9 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
         super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)

         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')
         assert config.num_labels == 2
         self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
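The DistilBERT hunks above establish the pattern that repeats in every model file below: each tf.keras.layers.Dense gains an explicit kernel_initializer built from config.initializer_range, and each tf.keras.layers.Embedding an embeddings_initializer, instead of Keras's defaults. A minimal before/after sketch of what that changes for one layer (illustrative only; the TruncatedNormal stand-in for get_initializer and the 768/0.02 numbers are assumptions in the spirit of the BERT-family defaults):

    import tensorflow as tf

    # Before the commit: no initializer argument, so Keras falls back to its
    # default glorot_uniform kernel initializer.
    old_dense = tf.keras.layers.Dense(768, name="q_lin_old")

    # After: an explicit truncated normal with the configured range.
    new_dense = tf.keras.layers.Dense(
        768,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="q_lin_new")

    for layer in (old_dense, new_dense):
        layer.build((None, 768))
        print(layer.name, float(tf.math.reduce_std(layer.kernel)))
    # glorot_uniform on a 768x768 kernel has std of about sqrt(2/(768+768)),
    # roughly 0.036, noticeably wider than the ~0.02 the pretrained
    # checkpoints were trained with.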
pytorch_transformers/modeling_tf_gpt2.py  (+14 -8)

@@ -29,7 +29,7 @@ import numpy as np
 import tensorflow as tf

 from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
-                                TFSequenceSummary, shape_list)
+                                TFSequenceSummary, shape_list, get_initializer)
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

@@ -76,8 +76,8 @@ class TFAttention(tf.keras.layers.Layer):
         self.split_size = n_state
         self.scale = scale

-        self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn')
-        self.c_proj = TFConv1D(n_state, nx, name='c_proj')
+        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
         self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
         self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
         self.pruned_heads = set()

@@ -166,8 +166,8 @@ class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
         super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, name='c_fc')
-        self.c_proj = TFConv1D(nx, n_state, name='c_proj')
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
         self.act = gelu
         self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)

@@ -212,8 +212,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd

-        self.wte = TFSharedEmbeddings(config.vocab_size, config.hidden_size, name='wte')
-        self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
+        self.wte = TFSharedEmbeddings(config.vocab_size,
+                                      config.hidden_size,
+                                      initializer_range=config.initializer_range,
+                                      name='wte')
+        self.wpe = tf.keras.layers.Embedding(config.n_positions,
+                                             config.n_embd,
+                                             embeddings_initializer=get_initializer(config.initializer_range),
+                                             name='wpe')
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
         self.h = [TFBlock(config.n_ctx,
                           config,

@@ -557,7 +563,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFGPT2MainLayer(config, name='transformer')
-        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')
+        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')

     def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
         if isinstance(inputs, (tuple, list)):
pytorch_transformers/modeling_tf_openai.py  (+14 -8)

@@ -29,7 +29,7 @@ import numpy as np
 import tensorflow as tf

 from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
-                                TFSequenceSummary, shape_list)
+                                TFSequenceSummary, shape_list, get_initializer)
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

@@ -83,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer):
         self.split_size = n_state
         self.scale = scale

-        self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn')
-        self.c_proj = TFConv1D(n_state, nx, name='c_proj')
+        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
         self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
         self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
         self.pruned_heads = set()

@@ -168,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
         super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx, name='c_fc')
-        self.c_proj = TFConv1D(nx, n_state, name='c_proj')
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
         self.act = gelu
         self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)

@@ -212,8 +212,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd

-        self.tokens_embed = TFSharedEmbeddings(config.vocab_size, config.n_embd, name='tokens_embed')
-        self.positions_embed = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='positions_embed')
+        self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
+                                               config.n_embd,
+                                               initializer_range=config.initializer_range,
+                                               name='tokens_embed')
+        self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
+                                                         config.n_embd,
+                                                         embeddings_initializer=get_initializer(config.initializer_range),
+                                                         name='positions_embed')
         self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
         self.h = [TFBlock(config.n_ctx,
                           config,

@@ -522,7 +528,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
-        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')
+        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')

     def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
         if isinstance(inputs, (tuple, list)):
pytorch_transformers/modeling_tf_roberta.py  (+11 -4)

@@ -24,7 +24,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_roberta import RobertaConfig
-from .modeling_tf_utils import TFPreTrainedModel
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

@@ -232,7 +232,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
         super(TFRobertaLMHead, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
-        self.dense = tf.keras.layers.Dense(config.hidden_size, name='dense')
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           name='dense')
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
         self.act = tf.keras.layers.Activation(gelu)

@@ -315,9 +317,14 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
-        self.dense = tf.keras.layers.Dense(config.hidden_size, activation='tanh', name="dense")
+        self.dense = tf.keras.layers.Dense(config.hidden_size,
+                                           kernel_initializer=get_initializer(config.initializer_range),
+                                           activation='tanh',
+                                           name="dense")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.out_proj = tf.keras.layers.Dense(config.num_labels, name="out_proj")
+        self.out_proj = tf.keras.layers.Dense(config.num_labels,
+                                              kernel_initializer=get_initializer(config.initializer_range),
+                                              name="out_proj")

     def call(self, features, training=False):
         x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
pytorch_transformers/modeling_tf_transfo_xl.py  (+36 -13)

@@ -30,7 +30,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_transfo_xl import TransfoXLConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

@@ -66,16 +66,21 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
 class TFPositionwiseFF(tf.keras.layers.Layer):
-    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, **kwargs):
+    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
         super(TFPositionwiseFF, self).__init__(**kwargs)

         self.d_model = d_model
         self.d_inner = d_inner
         self.dropout = dropout

-        self.layer_1 = tf.keras.layers.Dense(d_inner, activation=tf.nn.relu, name='CoreNet_._0')
+        self.layer_1 = tf.keras.layers.Dense(d_inner,
+                                             kernel_initializer=get_initializer(init_std),
+                                             activation=tf.nn.relu,
+                                             name='CoreNet_._0')
         self.drop_1 = tf.keras.layers.Dropout(dropout)
-        self.layer_2 = tf.keras.layers.Dense(d_model, name='CoreNet_._3')
+        self.layer_2 = tf.keras.layers.Dense(d_model,
+                                             kernel_initializer=get_initializer(init_std),
+                                             name='CoreNet_._3')
         self.drop_2 = tf.keras.layers.Dropout(dropout)

         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')

@@ -110,7 +115,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
     def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
                  tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
                  r_r_bias=None, r_w_bias=None, output_attentions=False,
-                 layer_norm_epsilon=1e-5, **kwargs):
+                 layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
         super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)

         self.output_attentions = output_attentions

@@ -119,11 +124,17 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
         self.d_head = d_head
         self.dropout = dropout

-        self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head, use_bias=False, name='qkv_net')
+        self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head,
+                                             kernel_initializer=get_initializer(init_std),
+                                             use_bias=False,
+                                             name='qkv_net')

         self.drop = tf.keras.layers.Dropout(dropout)
         self.dropatt = tf.keras.layers.Dropout(dropatt)
-        self.o_net = tf.keras.layers.Dense(d_model, use_bias=False, name='o_net')
+        self.o_net = tf.keras.layers.Dense(d_model,
+                                           kernel_initializer=get_initializer(init_std),
+                                           use_bias=False,
+                                           name='o_net')

         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')

@@ -138,14 +149,19 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
             self.r_r_bias = None
             self.r_w_bias = None

-        self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head, use_bias=False, name='r_net')
+        self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head,
+                                           kernel_initializer=get_initializer(init_std),
+                                           use_bias=False,
+                                           name='r_net')

     def build(self, input_shape):
         if self.r_r_bias is None or self.r_w_bias is None:  # Biases are not shared
             self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
+                                            initializer='zeros',
                                             trainable=True,
                                             name='r_r_bias')
             self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
+                                            initializer='zeros',
                                             trainable=True,
                                             name='r_w_bias')
         super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)

@@ -249,17 +265,18 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
                  r_r_bias=None,
                  output_attentions=False,
                  layer_norm_epsilon=1e-5,
+                 init_std=0.02,
                  **kwargs):
         super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
         self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model,
                                                            d_head, dropout, tgt_len=tgt_len, ext_len=ext_len,
                                                            mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm,
                                                            r_w_bias=r_w_bias, r_r_bias=r_r_bias,
+                                                           init_std=init_std,
                                                            output_attentions=output_attentions,
                                                            layer_norm_epsilon=layer_norm_epsilon, name='dec_attn')
         self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout,
                                        pre_lnorm=pre_lnorm,
+                                       init_std=init_std,
                                        layer_norm_epsilon=layer_norm_epsilon,
                                        name='pos_ff')

@@ -275,12 +292,13 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
 class TFAdaptiveEmbedding(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02,
                  sample_softmax=False, **kwargs):
         super(TFAdaptiveEmbedding, self).__init__(**kwargs)

         self.n_token = n_token
         self.d_embed = d_embed
+        self.init_std = init_std

         self.cutoffs = cutoffs + [n_token]
         self.div_val = div_val

@@ -298,12 +316,16 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
         for i in range(len(self.cutoffs)):
             l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
             d_emb_i = d_embed // (div_val ** i)
-            self.emb_layers.append(tf.keras.layers.Embedding(r_idx - l_idx, d_emb_i, name='emb_layers_._{}'.format(i)))
+            self.emb_layers.append(tf.keras.layers.Embedding(r_idx - l_idx,
+                                                             d_emb_i,
+                                                             embeddings_initializer=get_initializer(init_std),
+                                                             name='emb_layers_._{}'.format(i)))

     def build(self, input_shape):
         for i in range(len(self.cutoffs)):
             d_emb_i = self.d_embed // (self.div_val ** i)
             self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj),
+                                                  initializer=get_initializer(self.init_std),
                                                   trainable=True,
                                                   name='emb_projs_._{}'.format(i)))
         super(TFAdaptiveEmbedding, self).build(input_shape)

@@ -349,7 +371,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.untie_r = config.untie_r

         self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
-                                            div_val=config.div_val, name='word_emb')
+                                            div_val=config.div_val, init_std=config.init_std, name='word_emb')

         self.drop = tf.keras.layers.Dropout(config.dropout)

@@ -374,6 +396,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
                     r_r_bias=None if self.untie_r else self.r_r_bias,
                     output_attentions=self.output_attentions,
                     layer_norm_epsilon=config.layer_norm_epsilon,
+                    init_std=config.init_std,
                     name='layers_._{}'.format(i))
                 )
         else:  # learnable embeddings and absolute embeddings
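Besides threading init_std through the constructors, the Transfo-XL hunks pin the relative-attention biases r_r_bias and r_w_bias to an explicit initializer='zeros' (the XLNet file below gets the same treatment). Without that argument, add_weight falls back to Keras's default initializer, so the "biases" would start at random non-zero values. A small standalone sketch of the difference (a hypothetical demo layer, not code from the commit):

    import tensorflow as tf

    class BiasDemo(tf.keras.layers.Layer):
        # Hypothetical layer contrasting the two ways a bias can start out.
        def build(self, input_shape):
            # No initializer given: add_weight defaults to glorot_uniform,
            # so this weight starts at random non-zero values.
            self.implicit = self.add_weight(shape=(4, 8), trainable=True,
                                            name='implicit')
            # The commit's choice: start the bias at exactly zero.
            self.explicit = self.add_weight(shape=(4, 8), initializer='zeros',
                                            trainable=True, name='explicit')
            super(BiasDemo, self).build(input_shape)

    layer = BiasDemo()
    layer.build(())
    print(float(tf.reduce_max(tf.abs(layer.implicit))) > 0.0)  # True
    print(float(tf.reduce_max(tf.abs(layer.explicit))))        # 0.0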
pytorch_transformers/modeling_tf_utils.py  (+9 -9)

@@ -277,20 +277,20 @@ class TFPreTrainedModel(tf.keras.Model):
         return model


 class TFConv1D(tf.keras.layers.Layer):
-    def __init__(self, nf, nx, *inputs, **kwargs):
+    def __init__(self, nf, nx, *inputs, initializer_range=0.02, **kwargs):
         """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
            Basically works like a Linear layer but the weights are transposed
        """
         super(TFConv1D, self).__init__(*inputs, **kwargs)
         self.nf = nf
         self.nx = nx
+        self.initializer_range = initializer_range

     def build(self, input_shape):
         self.weight = self.add_weight(
             "weight",
             shape=[self.nx, self.nf],
-            initializer=tf.random_normal_initializer(
-                mean=0., stddev=0.02))
+            initializer=get_initializer(self.initializer_range))
         self.bias = self.add_weight(
             "bias",
             shape=[1, self.nf],

@@ -314,19 +314,17 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         super(TFSharedEmbeddings, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
-        self.initializer_range = initializer_range
+        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range

     def build(self, input_shape):
         """Build shared word embedding layer
            Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
-        initializer_range = self.hidden_size ** -0.5 if self.initializer_range is None else self.initializer_range
         self.weight = self.add_weight(
             "weight",
             shape=[self.vocab_size, self.hidden_size],
-            initializer=tf.random_normal_initializer(
-                mean=0., stddev=initializer_range))
+            initializer=get_initializer(self.initializer_range))
         super(TFSharedEmbeddings, self).build(input_shape)

     def call(self, inputs, mode="embedding"):

@@ -385,7 +383,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
            summary_first_dropout: Add a dropout before the projection and activation
            summary_last_dropout: Add a dropout after the projection and activation
        """
-    def __init__(self, config, **kwargs):
+    def __init__(self, config, initializer_range=0.02, **kwargs):
         super(TFSequenceSummary, self).__init__(**kwargs)

         self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'

@@ -401,7 +399,9 @@ class TFSequenceSummary(tf.keras.layers.Layer):
                 num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
-            self.summary = tf.keras.layers.Dense(num_classes, name='summary')
+            self.summary = tf.keras.layers.Dense(num_classes,
+                                                 kernel_initializer=get_initializer(initializer_range),
+                                                 name='summary')

         self.activation = None
         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
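Every hunk in this commit leans on the get_initializer helper imported from modeling_tf_utils; its definition sits outside the hunks shown here. In the library it returns a truncated normal built from the given range. A minimal sketch of that shape (the comment wording and exact definition are assumptions, not quoted from the commit):

    import tensorflow as tf

    def get_initializer(initializer_range=0.02):
        # A truncated normal: samples falling more than two standard
        # deviations from the mean are redrawn, so no weight starts further
        # than 2 * initializer_range from zero.
        return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)

    # Usage, mirroring the hunks above:
    dense = tf.keras.layers.Dense(768, kernel_initializer=get_initializer(0.02))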
pytorch_transformers/modeling_tf_xlm.py  (+20 -12)

@@ -25,7 +25,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_xlm import XLMConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

@@ -119,10 +119,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
         self.n_heads = n_heads
         assert self.dim % self.n_heads == 0

-        self.q_lin = tf.keras.layers.Dense(dim, name='q_lin')
-        self.k_lin = tf.keras.layers.Dense(dim, name='k_lin')
-        self.v_lin = tf.keras.layers.Dense(dim, name='v_lin')
-        self.out_lin = tf.keras.layers.Dense(dim, name='out_lin')
+        self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin')
+        self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin')
+        self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin')
+        self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin')
         self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
         self.pruned_heads = set()

@@ -199,8 +199,8 @@ class TFTransformerFFN(tf.keras.layers.Layer):
     def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
         super(TFTransformerFFN, self).__init__(**kwargs)
-        self.lin1 = tf.keras.layers.Dense(dim_hidden, name='lin1')
-        self.lin2 = tf.keras.layers.Dense(out_dim, name='lin2')
+        self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1')
+        self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2')
         self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
         self.dropout = tf.keras.layers.Dropout(config.dropout)

@@ -249,13 +249,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout)

-        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, self.dim, name='position_embeddings')
+        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
+                                                             self.dim,
+                                                             embeddings_initializer=get_initializer(config.embed_init_std),
+                                                             name='position_embeddings')
         if config.sinusoidal_embeddings:
             raise NotImplementedError
             # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
         if config.n_langs > 1 and config.use_lang_emb:
-            self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, self.dim, name='lang_embeddings')
-        self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, name='embeddings')  # padding_idx=self.pad_index)
+            self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs,
+                                                             self.dim,
+                                                             embeddings_initializer=get_initializer(config.embed_init_std),
+                                                             name='lang_embeddings')
+        self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings')  # padding_idx=self.pad_index)
         self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb')

         # transformer layers

@@ -676,7 +682,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
         self.num_labels = config.num_labels

         self.transformer = TFXLMMainLayer(config, name='transformer')
-        self.sequence_summary = TFSequenceSummary(config, name='sequence_summary')
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)

@@ -721,7 +727,9 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFXLMMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.init_std),
+                                                name='qa_outputs')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
pytorch_transformers/modeling_tf_xlnet.py  (+19 -11)

@@ -28,7 +28,7 @@ import numpy as np
 import tensorflow as tf

 from .configuration_xlnet import XLNetConfig
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

@@ -87,7 +87,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         self.dropout = tf.keras.layers.Dropout(config.dropout)

     def build(self, input_shape):
-        initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
+        initializer = get_initializer(self.initializer_range)
         self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
                                  initializer=initializer,
                                  trainable=True, name='q')

@@ -104,13 +104,13 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                                  initializer=initializer,
                                  trainable=True, name='r')
         self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer=initializer,
+                                        initializer='zeros',
                                         trainable=True, name='r_r_bias')
         self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer=initializer,
+                                        initializer='zeros',
                                         trainable=True, name='r_s_bias')
         self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
-                                        initializer=initializer,
+                                        initializer='zeros',
                                         trainable=True, name='r_w_bias')
         self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head),
                                          initializer=initializer,

@@ -294,8 +294,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFXLNetFeedForward, self).__init__(**kwargs)
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
-        self.layer_1 = tf.keras.layers.Dense(config.d_inner, name='layer_1')
-        self.layer_2 = tf.keras.layers.Dense(config.d_model, name='layer_2')
+        self.layer_1 = tf.keras.layers.Dense(config.d_inner,
+                                             kernel_initializer=get_initializer(config.initializer_range),
+                                             name='layer_1')
+        self.layer_2 = tf.keras.layers.Dense(config.d_model,
+                                             kernel_initializer=get_initializer(config.initializer_range),
+                                             name='layer_2')
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         if isinstance(config.ff_activation, str) or \
            (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):

@@ -375,7 +379,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         self.dropout = tf.keras.layers.Dropout(config.dropout)

     def build(self, input_shape):
-        initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
+        initializer = get_initializer(self.initializer_range)
         self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
                                         initializer=initializer,
                                         trainable=True, name='mask_emb')

@@ -900,8 +904,10 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
         self.num_labels = config.num_labels

         self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.sequence_summary = TFSequenceSummary(config, name='sequence_summary')
-        self.logits_proj = tf.keras.layers.Dense(config.num_labels, name='logits_proj')
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary')
+        self.logits_proj = tf.keras.layers.Dense(config.num_labels,
+                                                 kernel_initializer=get_initializer(config.initializer_range),
+                                                 name='logits_proj')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)

@@ -949,7 +955,9 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
         self.transformer = TFXLNetMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')

     def call(self, inputs, **kwargs):
         transformer_outputs = self.transformer(inputs, **kwargs)
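A quick way to eyeball the net effect of the commit is to build one layer with the new-style initializer and inspect its weight statistics (a hypothetical spot check, not part of the commit):

    import numpy as np
    import tensorflow as tf

    dense = tf.keras.layers.Dense(
        64, kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    dense.build((None, 64))
    w = dense.kernel.numpy()

    print(np.abs(w).max() <= 2 * 0.02)  # True: truncation clips at two stddevs
    print(w.std())                      # slightly under 0.02 due to the truncation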