Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
1abd53b1
Commit
1abd53b1
authored
Feb 19, 2020
by
Lysandre
Browse files
Patch ALBERT with heads in TensorFlow
parent
e6767642
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
146 additions
and
136 deletions
+146
-136
src/transformers/modeling_tf_albert.py
src/transformers/modeling_tf_albert.py
+146
-136
No files found.
src/transformers/modeling_tf_albert.py
View file @
1abd53b1
...
...
@@ -29,14 +29,14 @@ from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
logger = logging.getLogger(__name__)

# Download URLs of the pretrained TF 2.0 ALBERT weight files, keyed by model shortcut name.
#
# Every shortcut name used to be listed twice in this literal: first with a plain
# ``-tf_model.h5`` URL and then with a ``-with-prefix-tf_model.h5`` URL. A dict display keeps
# the last value given for a repeated key, so only the ``with-prefix`` URLs are kept here,
# one entry per key, each written as a single well-formed string literal.
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-with-prefix-tf_model.h5",
    "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-with-prefix-tf_model.h5",
    "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-with-prefix-tf_model.h5",
    "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-with-prefix-tf_model.h5",
    "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-with-prefix-tf_model.h5",
    "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-with-prefix-tf_model.h5",
    "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-with-prefix-tf_model.h5",
    "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-with-prefix-tf_model.h5",
}
...
...
@@ -478,6 +478,115 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
return
hidden_states
class TFAlbertMainLayer(tf.keras.layers.Layer):
    """ALBERT backbone (embeddings, encoder and pooler) packaged as a single Keras layer.

    The layer owns three sub-layers named ``"embeddings"``, ``"encoder"`` and ``"pooler"`` and
    wires them together in :meth:`call`.
    """

    def __init__(self, config, **kwargs):
        """Create the embedding, encoder and pooler sub-layers.

        Args:
            config: Model configuration. This constructor reads ``num_hidden_layers``,
                ``hidden_size`` and ``initializer_range`` from it and hands the whole object
                to ``TFAlbertEmbeddings`` and ``TFAlbertTransformer``.
            **kwargs: Keyword arguments forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
        """
        # Forward only the keyword arguments. The first positional parameter of
        # ``tf.keras.layers.Layer.__init__`` is ``trainable``, so passing ``config``
        # positionally would bind the configuration object to ``trainable``.
        super().__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
        self.encoder = TFAlbertTransformer(config, name="encoder")
        self.pooler = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="pooler",
        )

    def get_input_embeddings(self):
        """Return the embeddings sub-layer."""
        return self.embeddings

    def _resize_token_embeddings(self, new_num_tokens):
        """Not implemented for this layer; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            See base class PreTrainedModel

            Not implemented for this layer; always raises ``NotImplementedError``.
        """
        raise NotImplementedError

    def call(
        self,
        inputs,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        training=False,
    ):
        """Run embeddings, encoder and pooler on the given inputs.

        Args:
            inputs: One of
                - a tuple/list ``(input_ids, attention_mask, token_type_ids, position_ids,
                  head_mask, inputs_embeds)`` of which only a leading prefix needs to be
                  given (at most 6 items);
                - a dict using those same names as keys (at most 6 items);
                - anything else, which is used directly as ``input_ids``.
                Values found in ``inputs`` take precedence over the keyword arguments below.
            attention_mask: Optional mask; it is indexed as a rank-2 tensor. Defaults to all
                ones with the input shape.
            token_type_ids: Optional; defaults to all zeros with the input shape.
            position_ids: Optional; forwarded untouched to the embeddings layer.
            head_mask: Must be ``None``; head masking is not implemented.
            inputs_embeds: Optional alternative to ``input_ids``; its shape without the last
                axis is used as the input shape.
            training: Forwarded to the embeddings and encoder sub-layers.

        Returns:
            A tuple ``(sequence_output, pooled_output) + encoder_outputs[1:]``, i.e.
            sequence_output, pooled_output, (hidden_states), (attentions).

        Raises:
            ValueError: If both or neither of ``input_ids`` and ``inputs_embeds`` are given.
            NotImplementedError: If ``head_mask`` is not ``None``.
            AssertionError: If ``inputs`` is a tuple/list/dict holding more than 6 items.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = shape_list(input_ids)
        elif inputs_embeds is not None:
            # Drop the trailing (embedding) axis to recover the shape of the ids.
            input_shape = shape_list(inputs_embeds)[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
            token_type_ids = tf.fill(input_shape, 0)

        # We create a broadcastable 4D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        else:
            # One "no mask" placeholder per hidden layer.
            head_mask = [None] * self.num_hidden_layers

        embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)

        sequence_output = encoder_outputs[0]
        # The pooler only sees the first position of every sequence.
        pooled_output = self.pooler(sequence_output[:, 0])

        # add hidden_states and attentions if they are here
        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
        # sequence_output, pooled_output, (hidden_states), (attentions)
        return outputs
ALBERT_START_DOCSTRING
=
r
"""
This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
Use it as a regular TF 2.0 Keras Model and
...
...
@@ -560,43 +669,12 @@ ALBERT_INPUTS_DOCSTRING = r"""
ALBERT_START_DOCSTRING
,
)
class
TFAlbertModel
(
TFAlbertPreTrainedModel
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
().
__init__
(
config
,
**
kwargs
)
self
.
num_hidden_layers
=
config
.
num_hidden_layers
self
.
embeddings
=
TFAlbertEmbeddings
(
config
,
name
=
"embeddings"
)
self
.
encoder
=
TFAlbertTransformer
(
config
,
name
=
"encoder"
)
self
.
pooler
=
tf
.
keras
.
layers
.
Dense
(
config
.
hidden_size
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
activation
=
"tanh"
,
name
=
"pooler"
,
)
def
get_input_embeddings
(
self
):
return
self
.
embeddings
def
_resize_token_embeddings
(
self
,
new_num_tokens
):
raise
NotImplementedError
def
_prune_heads
(
self
,
heads_to_prune
):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise
NotImplementedError
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
().
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
albert
=
TFAlbertMainLayer
(
config
,
name
=
"albert"
)
@
add_start_docstrings_to_callable
(
ALBERT_INPUTS_DOCSTRING
)
def
call
(
self
,
inputs
,
attention_mask
=
None
,
token_type_ids
=
None
,
position_ids
=
None
,
head_mask
=
None
,
inputs_embeds
=
None
,
training
=
False
,
):
def
call
(
self
,
inputs
,
**
kwargs
):
r
"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
...
...
@@ -632,75 +710,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
if
isinstance
(
inputs
,
(
tuple
,
list
)):
input_ids
=
inputs
[
0
]
attention_mask
=
inputs
[
1
]
if
len
(
inputs
)
>
1
else
attention_mask
token_type_ids
=
inputs
[
2
]
if
len
(
inputs
)
>
2
else
token_type_ids
position_ids
=
inputs
[
3
]
if
len
(
inputs
)
>
3
else
position_ids
head_mask
=
inputs
[
4
]
if
len
(
inputs
)
>
4
else
head_mask
inputs_embeds
=
inputs
[
5
]
if
len
(
inputs
)
>
5
else
inputs_embeds
assert
len
(
inputs
)
<=
6
,
"Too many inputs."
elif
isinstance
(
inputs
,
dict
):
input_ids
=
inputs
.
get
(
"input_ids"
)
attention_mask
=
inputs
.
get
(
"attention_mask"
,
attention_mask
)
token_type_ids
=
inputs
.
get
(
"token_type_ids"
,
token_type_ids
)
position_ids
=
inputs
.
get
(
"position_ids"
,
position_ids
)
head_mask
=
inputs
.
get
(
"head_mask"
,
head_mask
)
inputs_embeds
=
inputs
.
get
(
"inputs_embeds"
,
inputs_embeds
)
assert
len
(
inputs
)
<=
6
,
"Too many inputs."
else
:
input_ids
=
inputs
if
input_ids
is
not
None
and
inputs_embeds
is
not
None
:
raise
ValueError
(
"You cannot specify both input_ids and inputs_embeds at the same time"
)
elif
input_ids
is
not
None
:
input_shape
=
shape_list
(
input_ids
)
elif
inputs_embeds
is
not
None
:
input_shape
=
shape_list
(
inputs_embeds
)[:
-
1
]
else
:
raise
ValueError
(
"You have to specify either input_ids or inputs_embeds"
)
if
attention_mask
is
None
:
attention_mask
=
tf
.
fill
(
input_shape
,
1
)
if
token_type_ids
is
None
:
token_type_ids
=
tf
.
fill
(
input_shape
,
0
)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask
=
attention_mask
[:,
tf
.
newaxis
,
tf
.
newaxis
,
:]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask
=
tf
.
cast
(
extended_attention_mask
,
tf
.
float32
)
extended_attention_mask
=
(
1.0
-
extended_attention_mask
)
*
-
10000.0
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if
head_mask
is
not
None
:
raise
NotImplementedError
else
:
head_mask
=
[
None
]
*
self
.
num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
embedding_output
=
self
.
embeddings
([
input_ids
,
position_ids
,
token_type_ids
,
inputs_embeds
],
training
=
training
)
encoder_outputs
=
self
.
encoder
([
embedding_output
,
extended_attention_mask
,
head_mask
],
training
=
training
)
sequence_output
=
encoder_outputs
[
0
]
pooled_output
=
self
.
pooler
(
sequence_output
[:,
0
])
# add hidden_states and attentions if they are here
outputs
=
(
sequence_output
,
pooled_output
,)
+
encoder_outputs
[
1
:]
# sequence_output, pooled_output, (hidden_states), (attentions)
outputs
=
self
.
albert
(
inputs
,
**
kwargs
)
return
outputs
...
...
@@ -709,7 +719,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
    """Build the ALBERT backbone and the masked-language-modeling head on top of it.

    Args:
        config: Model configuration, passed to the parent constructor, to
            ``TFAlbertMainLayer`` and to ``TFAlbertMLMHead``.
        *inputs: Extra positional arguments forwarded to the parent constructor.
        **kwargs: Extra keyword arguments forwarded to the parent constructor.
    """
    super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)

    # ``self.albert`` used to be assigned a ``TFAlbertModel`` first and was then immediately
    # overwritten by the main layer below; that throw-away construction is removed.
    self.albert = TFAlbertMainLayer(config, name="albert")
    # The prediction head receives the backbone's embeddings layer.
    self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
def
get_output_embeddings
(
self
):
...
...
@@ -766,7 +776,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
super
(
TFAlbertForSequenceClassification
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
num_labels
=
config
.
num_labels
self
.
albert
=
TFAlbertM
odel
(
config
,
name
=
"albert"
)
self
.
albert
=
TFAlbertM
ainLayer
(
config
,
name
=
"albert"
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
hidden_dropout_prob
)
self
.
classifier
=
tf
.
keras
.
layers
.
Dense
(
config
.
num_labels
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"classifier"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment