chenpangpang / transformers · Commits

Commit 31b0560a (unverified), authored Feb 15, 2021 by Julien Plu, committed via GitHub on Feb 15, 2021.

Add AMP for Albert (#10141)

Parent: 6fc940ed

Showing 8 changed files with 415 additions and 345 deletions (+415 / -345).
Changed files:

src/transformers/models/albert/modeling_tf_albert.py  (+375 / -309)
src/transformers/models/bert/modeling_tf_bert.py  (+7 / -7)
src/transformers/models/convbert/modeling_tf_convbert.py  (+5 / -5)
src/transformers/models/electra/modeling_tf_electra.py  (+5 / -6)
src/transformers/models/longformer/modeling_tf_longformer.py  (+3 / -3)
src/transformers/models/roberta/modeling_tf_roberta.py  (+4 / -5)
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py  (+5 / -6)
tests/test_modeling_tf_albert.py  (+11 / -4)
src/transformers/models/albert/modeling_tf_albert.py

(This diff is collapsed in the page capture; the bulk of the ALBERT-specific AMP changes, +375 / -309 lines, is not reproduced below.)
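The point of the commit is to make the TensorFlow ALBERT implementation run under automatic mixed precision (AMP). As a rough illustration of what that enables, here is a minimal sketch using the standard Keras mixed-precision API (the checkpoint name and the printed dtype are assumptions for illustration, not part of this diff):

    # Minimal sketch: running TF ALBERT under the Keras mixed_float16 policy.
    # "albert-base-v2" is the usual public checkpoint, assumed here for illustration.
    import tensorflow as tf
    from transformers import AlbertTokenizer, TFAlbertModel

    tf.keras.mixed_precision.set_global_policy("mixed_float16")  # compute in float16, keep float32 variables

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    model = TFAlbertModel.from_pretrained("albert-base-v2")

    inputs = tokenizer("AMP support for ALBERT", return_tensors="tf")
    outputs = model(inputs)
    print(outputs.last_hidden_state.dtype)  # expected to be float16 under the mixed policy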
src/transformers/models/bert/modeling_tf_bert.py
@@ -148,21 +148,21 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             self.weight = self.add_weight(
                 name="weight",
                 shape=[self.vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("token_type_embeddings"):
             self.token_type_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.type_vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("position_embeddings"):
             self.position_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.max_position_embeddings, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         super().build(input_shape)
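Aside from comment reflows and type hints, the recurring change in the embedding layers is dropping the initializer_range= keyword when calling get_initializer. For context, get_initializer is the small helper in transformers.modeling_tf_utils that wraps the config's initializer range in a truncated-normal initializer; a hedged sketch of the idea (the real helper may differ in detail):

    # Hedged sketch of what get_initializer does; see transformers.modeling_tf_utils
    # for the actual definition.
    import tensorflow as tf

    def get_initializer(initializer_range: float = 0.02) -> tf.keras.initializers.TruncatedNormal:
        # A truncated normal keeps initial weights close to zero.
        return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)

    # Both call styles build the same initializer; the diff standardizes on the positional form.
    a = get_initializer(initializer_range=0.02)
    b = get_initializer(0.02)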
@@ -253,8 +253,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
         value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
 
-        # Take the dot product between "query" and "key" to get the raw
-        # attention scores.
+        # Take the dot product between "query" and "key" to get the raw attention scores.
         # (batch size, num_heads, seq_len_q, seq_len_k)
         attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
         dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
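The context line dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) shows the pattern the AMP work depends on: scalar constants are cast to the dtype of the surrounding activations rather than being left as float32, so the same graph runs under a float16 policy. A standalone sketch of why the cast matters (illustrative, not code from the diff):

    # TensorFlow does not auto-promote dtypes, so mixing float32 constants into
    # float16 activations raises an error; casting to the activation dtype does not.
    import math
    import tensorflow as tf

    scores = tf.random.normal((2, 4, 8, 8), dtype=tf.float16)  # stand-in for AMP attention scores
    sqrt_head_size = tf.constant(math.sqrt(64.0))              # float32 by default

    # scores / sqrt_head_size                                   # would fail: float16 vs float32
    scaled = scores / tf.cast(sqrt_head_size, dtype=scores.dtype)
    print(scaled.dtype)  # float16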
@@ -1009,7 +1008,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
             total_loss = self.compute_loss(labels=d_labels, logits=(prediction_scores, seq_relationship_score))
 
         if not inputs["return_dict"]:
-            return (prediction_scores, seq_relationship_score) + outputs[2:]
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
 
         return TFBertForPreTrainingOutput(
             loss=total_loss,
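With this change the tuple code path matches the other TF heads: when labels are supplied and return_dict=False, the loss is prepended to the output tuple instead of being dropped. A hedged usage sketch (the checkpoint name and toy labels are illustrative; it assumes a transformers version where this head accepts labels and next_sentence_label):

    # Hedged sketch: tuple output of TFBertForPreTraining with labels and return_dict=False.
    import tensorflow as tf
    from transformers import BertTokenizer, TFBertForPreTraining

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = TFBertForPreTraining.from_pretrained("bert-base-uncased")

    inputs = tokenizer("The capital of France is Paris.", return_tensors="tf")
    outputs = model(
        inputs,
        labels=inputs["input_ids"],            # toy MLM labels
        next_sentence_label=tf.constant([0]),  # toy NSP label
        return_dict=False,
    )
    total_loss, prediction_scores, seq_relationship_score = outputs[:3]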
@@ -1598,7 +1598,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
             }
         ]
     )
-    def serving(self, inputs: Dict[str, tf.Tensor]):
+    def serving(self, inputs: Dict[str, tf.Tensor]) -> TFMultipleChoiceModelOutput:
         output = self.call(input_ids=inputs)
 
         return self.serving_output(output)
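serving is the tf.function entry point used when the model is exported as a SavedModel, so the new annotation only documents the return type. A hedged sketch of how that signature is typically exercised (the paths, checkpoint name, and saved_model/1 layout are assumptions based on the library's usual export behaviour, not taken from this diff):

    # Hedged sketch: exporting the model and looking up the serving signature.
    import tensorflow as tf
    from transformers import TFBertForMultipleChoice

    model = TFBertForMultipleChoice.from_pretrained("bert-base-uncased")
    model.save_pretrained("/tmp/tf_bert_mc", saved_model=True)  # also writes a SavedModel export

    reloaded = tf.saved_model.load("/tmp/tf_bert_mc/saved_model/1")
    serving_fn = reloaded.signatures["serving_default"]
    # serving_fn expects the int32 tensors declared in the input_signature above,
    # e.g. input_ids / attention_mask / token_type_ids of shape (batch, num_choices, seq_len).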
src/transformers/models/convbert/modeling_tf_convbert.py
@@ -62,11 +62,11 @@ TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings
+# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->ConvBert
 class TFConvBertEmbeddings(tf.keras.layers.Layer):
     """Construct the embeddings from word, position and token_type embeddings."""
 
-    def __init__(self, config, **kwargs):
+    def __init__(self, config: ConvBertConfig, **kwargs):
         super().__init__(**kwargs)
 
         self.vocab_size = config.vocab_size
@@ -83,21 +83,21 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer):
             self.weight = self.add_weight(
                 name="weight",
                 shape=[self.vocab_size, self.embedding_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("token_type_embeddings"):
             self.token_type_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.type_vocab_size, self.embedding_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("position_embeddings"):
             self.position_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.max_position_embeddings, self.embedding_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         super().build(input_shape)
src/transformers/models/electra/modeling_tf_electra.py
@@ -121,8 +121,7 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):
         key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
         value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
 
-        # Take the dot product between "query" and "key" to get the raw
-        # attention scores.
+        # Take the dot product between "query" and "key" to get the raw attention scores.
         # (batch size, num_heads, seq_len_q, seq_len_k)
         attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
         dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
@@ -353,7 +352,7 @@ class TFElectraPooler(tf.keras.layers.Layer):
 class TFElectraEmbeddings(tf.keras.layers.Layer):
     """Construct the embeddings from word, position and token_type embeddings."""
 
-    def __init__(self, config, **kwargs):
+    def __init__(self, config: ElectraConfig, **kwargs):
         super().__init__(**kwargs)
 
         self.vocab_size = config.vocab_size
@@ -370,21 +369,21 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
             self.weight = self.add_weight(
                 name="weight",
                 shape=[self.vocab_size, self.embedding_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("token_type_embeddings"):
             self.token_type_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.type_vocab_size, self.embedding_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("position_embeddings"):
             self.position_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.max_position_embeddings, self.embedding_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         super().build(input_shape)
src/transformers/models/longformer/modeling_tf_longformer.py
@@ -491,21 +491,21 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
             self.weight = self.add_weight(
                 name="weight",
                 shape=[self.vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("token_type_embeddings"):
             self.token_type_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.type_vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("position_embeddings"):
             self.position_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.max_position_embeddings, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         super().build(input_shape)
src/transformers/models/roberta/modeling_tf_roberta.py
@@ -92,21 +92,21 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
             self.weight = self.add_weight(
                 name="weight",
                 shape=[self.vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("token_type_embeddings"):
             self.token_type_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.type_vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("position_embeddings"):
             self.position_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.max_position_embeddings, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         super().build(input_shape)
@@ -232,8 +232,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
         key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
         value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
 
-        # Take the dot product between "query" and "key" to get the raw
-        # attention scores.
+        # Take the dot product between "query" and "key" to get the raw attention scores.
         # (batch size, num_heads, seq_len_q, seq_len_k)
         attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
         dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
@@ -90,21 +90,21 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
             self.weight = self.add_weight(
                 name="weight",
                 shape=[self.vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("token_type_embeddings"):
             self.token_type_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.type_vocab_size, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         with tf.name_scope("position_embeddings"):
             self.position_embeddings = self.add_weight(
                 name="embeddings",
                 shape=[self.max_position_embeddings, self.hidden_size],
-                initializer=get_initializer(initializer_range=self.initializer_range),
+                initializer=get_initializer(self.initializer_range),
             )
 
         super().build(input_shape)
@@ -197,8 +197,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)
         key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
         value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
 
-        # Take the dot product between "query" and "key" to get the raw
-        # attention scores.
+        # Take the dot product between "query" and "key" to get the raw attention scores.
         # (batch size, num_heads, seq_len_q, seq_len_k)
         attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
         dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
@@ -1247,7 +1246,7 @@ class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.c
             "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"),
         }])
     # Copied from transformers.models.bert.modeling_tf_bert.TFBertForMultipleChoice.serving
-    def serving(self, inputs: Dict[str, tf.Tensor]):
+    def serving(self, inputs: Dict[str, tf.Tensor]) -> TFMultipleChoiceModelOutput:
         output = self.call(input_ids=inputs)
 
         return self.serving_output(output)
tests/test_modeling_tf_albert.py
@@ -26,6 +26,7 @@ from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
 if is_tf_available():
     import tensorflow as tf
 
+    from transformers import TF_MODEL_FOR_PRETRAINING_MAPPING
     from transformers.models.albert.modeling_tf_albert import (
         TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFAlbertForMaskedLM,
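TF_MODEL_FOR_PRETRAINING_MAPPING maps config classes to their TF pretraining heads, which lets the test hunk below detect TFAlbertForPreTraining without naming it directly. A hedged illustration of that lookup (it assumes the ALBERT entry is present in the auto-mapping, which is the normal case):

    # Hedged sketch: checking membership in the pretraining auto-mapping,
    # as the new _prepare_for_class override does.
    from transformers import AlbertConfig, TF_MODEL_FOR_PRETRAINING_MAPPING
    from transformers import TFAlbertForPreTraining

    assert TF_MODEL_FOR_PRETRAINING_MAPPING[AlbertConfig] is TFAlbertForPreTraining
    assert TFAlbertForPreTraining in TF_MODEL_FOR_PRETRAINING_MAPPING.values()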
@@ -243,6 +244,16 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
     test_head_masking = False
     test_onnx = False
 
+    # special case for ForPreTraining model
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+
+        if return_labels:
+            if model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.values():
+                inputs_dict["sentence_order_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
+
+        return inputs_dict
+
     def setUp(self):
         self.model_tester = TFAlbertModelTester(self)
         self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
@@ -295,10 +306,6 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
             name = model.get_bias()
             assert name is None
 
-    def test_mixed_precision(self):
-        # TODO JP: Make ALBERT float16 compliant
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
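Deleting the test_mixed_precision override means TFAlbertModelTest now falls back to the shared mixed-precision test in TFModelTesterMixin, which the model changes in this commit allow to pass. Roughly, such a test flips Keras to the mixed_float16 policy, runs the model once, and restores float32; a hedged sketch (the real common test lives in test_modeling_tf_common.py and differs in detail; the tiny config values are illustrative):

    # Hedged sketch of a mixed-precision smoke test for TF ALBERT.
    import tensorflow as tf
    from transformers import AlbertConfig, TFAlbertModel

    def mixed_precision_smoke_test():
        tf.keras.mixed_precision.set_global_policy("mixed_float16")
        try:
            config = AlbertConfig(
                hidden_size=36, num_attention_heads=6, intermediate_size=37, num_hidden_layers=2
            )
            model = TFAlbertModel(config)
            input_ids = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.int32)
            outputs = model(input_ids)  # must not raise dtype errors under the float16 policy
            assert outputs.last_hidden_state.shape == (1, 5, 36)
        finally:
            tf.keras.mixed_precision.set_global_policy("float32")

    mixed_precision_smoke_test()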