chenpangpang / transformers > Commits > c0c20883

Commit c0c20883, authored Oct 29, 2019 by Lysandre, committed by Lysandre Debut on Nov 26, 2019
Parent: 8e5d84fc

ALBERT model

Showing 3 changed files with 447 additions and 0 deletions:
- transformers/configuration_albert.py (+72 / -0)
- transformers/convert_albert_original_tf_checkpoint_to_pytorch.py (+44 / -0)
- transformers/modeling_albert.py (+331 / -0)
transformers/configuration_albert.py (new file, mode 100644)

from .configuration_utils import PretrainedConfig


class AlbertConfig(PretrainedConfig):
    """Configuration for `AlbertModel`.

    The default settings match the configuration of model `albert_xxlarge`.
    """

    def __init__(self,
                 vocab_size_or_config_json_file,
                 embedding_size=128,
                 hidden_size=4096,
                 num_hidden_layers=12,
                 num_hidden_groups=1,
                 num_attention_heads=64,
                 intermediate_size=16384,
                 inner_group_num=1,
                 down_scale_factor=1,
                 hidden_act="gelu",
                 hidden_dropout_prob=0,
                 attention_probs_dropout_prob=0,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        """Constructs AlbertConfig.

        Args:
            vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `AlbertModel`.
            embedding_size: Size of the vocabulary embeddings.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_hidden_groups: Number of groups for the hidden layers; parameters in
                the same group are shared.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            inner_group_num: int, number of inner repetitions of attention and ffn.
            down_scale_factor: float, the scale to apply.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `AlbertModel`.
            initializer_range: The stdev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        super(AlbertConfig, self).__init__(**kwargs)

        self.vocab_size = vocab_size_or_config_json_file
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_hidden_groups = num_hidden_groups
        self.num_attention_heads = num_attention_heads
        self.inner_group_num = inner_group_num
        self.down_scale_factor = down_scale_factor
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
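A minimal usage sketch for the configuration class above, assuming the `transformers` package from this branch is importable; the smaller hyper-parameter values shown are illustrative overrides, not values taken from the file.

from transformers.configuration_albert import AlbertConfig

# Defaults correspond to albert_xxlarge: embedding_size=128, hidden_size=4096,
# 12 hidden layers, 64 attention heads, intermediate_size=16384.
xxlarge_config = AlbertConfig(vocab_size_or_config_json_file=30000)

# Any keyword argument can be overridden to describe a smaller model.
small_config = AlbertConfig(
    vocab_size_or_config_json_file=30000,  # illustrative vocabulary size
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)
print(small_config.hidden_size)  # 768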
transformers/convert_albert_original_tf_checkpoint_to_pytorch.py (new file, mode 100644)

import argparse

import torch

from transformers import BertForPreTraining, load_tf_weights_in_bert
from transformers.configuration_albert import AlbertConfig


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the TensorFlow checkpoint path.")
    parser.add_argument("--albert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained ALBERT model. \n"
                             "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.albert_config_file,
                                     args.pytorch_dump_path)
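The conversion can be driven from the command line or by calling the function directly; a sketch of the latter follows. The file names are placeholders (assumptions for illustration), not paths from the commit.

# Placeholder paths -- substitute a real TF checkpoint prefix, config JSON and output file.
from transformers.convert_albert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="albert_xxlarge/model.ckpt-best",
    albert_config_file="albert_xxlarge/albert_config.json",
    pytorch_dump_path="albert_xxlarge/pytorch_model.bin",
)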
transformers/modeling_albert.py (new file, mode 100644)

import os
import math
import logging

import torch
import torch.nn as nn

from transformers.configuration_albert import AlbertConfig

logger = logging.getLogger(__name__)
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model."""
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    print(model)
    for name, array in zip(names, arrays):
        og = name
        name = name.replace("transformer/group_0/inner_group_0", "transformer")
        name = name.replace("LayerNorm", "layer_norm")
        name = name.replace("ffn_1", "ffn")
        name = name.replace("ffn/intermediate/output", "ffn_output")
        name = name.replace("attention_1", "attention")
        name = name.replace("cls/predictions/transform", "predictions")
        name = name.replace("transformer/layer_norm_1", "transformer/attention/output/LayerNorm")
        name = name.split('/')
        print(name)

        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, l[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]

        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
            print("transposed")
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {} from {}".format(name, og))
        pointer.data = torch.from_numpy(array)

    return model
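# Illustration of the renaming above (hypothetical helper; the sample name is an
# assumed TensorFlow variable of the shape produced by the google-research
# ALBERT checkpoints): the chain of str.replace calls maps a checkpoint entry
# onto the attribute path of the PyTorch modules defined below before the path
# is walked with getattr.
def _demo_remap_tf_name(tf_name):
    tf_name = tf_name.replace("transformer/group_0/inner_group_0", "transformer")
    tf_name = tf_name.replace("LayerNorm", "layer_norm")
    tf_name = tf_name.replace("ffn_1", "ffn")
    tf_name = tf_name.replace("ffn/intermediate/output", "ffn_output")
    tf_name = tf_name.replace("attention_1", "attention")
    return tf_name.split("/")

# _demo_remap_tf_name("bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/kernel")
# -> ['bert', 'encoder', 'transformer', 'ffn', 'intermediate', 'dense', 'kernel']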
class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(AlbertEmbeddings, self).__init__()

        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
        self.layer_norm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        word_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = word_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def get_word_embeddings_table(self):
        return self.word_embeddings
class AlbertModel(nn.Module):
    def __init__(self, config):
        super(AlbertModel, self).__init__()

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertEncoder(config)
        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers

        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
        encoder_outputs = self.encoder(embedding_output,
                                       extended_attention_mask,
                                       head_mask=head_mask)

        sequence_output = encoder_outputs[0]

        print(sequence_output.shape, sequence_output[:, 0].shape, self.pooler(sequence_output[:, 0]).shape)
        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))

        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        return outputs
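# Illustrative helper (hypothetical, with assumed toy values): a quick check of
# the additive attention mask computed in AlbertModel.forward. Padded positions
# (mask == 0) become large negative biases that vanish after the softmax, while
# real tokens keep a bias of 0, and the tensor stays in the model's dtype for
# fp16 compatibility.
def _demo_additive_attention_mask():
    mask = torch.tensor([[1, 1, 1, 0]])                # 1 = real token, 0 = padding
    extended = mask.unsqueeze(1).unsqueeze(2).float()  # shape (1, 1, 1, 4), broadcastable over heads and queries
    additive = (1.0 - extended) * -10000.0             # 0 for real tokens, -10000 for padding, along the key dimension
    return additive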
class AlbertForMaskedLM(nn.Module):
    def __init__(self, config):
        super(AlbertForMaskedLM, self).__init__()

        self.config = config
        self.bert = AlbertModel(config)
        self.layer_norm = nn.LayerNorm(config.embedding_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        # Tie the output projection to the input word embeddings by sharing the parameter.
        self.word_embeddings.weight = self.bert.embeddings.word_embeddings.weight

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        hidden_states = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                                  position_ids=position_ids, head_mask=head_mask)[0]
        hidden_states = self.dense(hidden_states)
        hidden_states = gelu_new(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        logits = self.word_embeddings(hidden_states)
        return logits
class AlbertAttention(nn.Module):
    def __init__(self, config):
        super(AlbertAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = config.output_attentions
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, input_ids, attention_mask=None, head_mask=None):
        mixed_query_layer = self.query(input_ids)
        mixed_key_layer = self.key(input_ids)
        mixed_value_layer = self.value(input_ids)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in AlbertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        reshaped_context_layer = context_layer.view(*new_context_layer_shape)

        # NOTE: the reshape below hard-codes 16 heads of size 64 and a hidden size
        # of 1024; it only works for configurations matching those dimensions.
        w = self.dense.weight.T.view(16, 64, 1024)
        b = self.dense.bias

        projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
        projected_context_layer = self.dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer)

        return (layernormed_context_layer, projected_context_layer, reshaped_context_layer,
                context_layer, attention_scores, attention_probs, attention_mask)
class AlbertTransformer(nn.Module):
    def __init__(self, config):
        super(AlbertTransformer, self).__init__()

        self.config = config
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = AlbertAttention(config)
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        for i in range(self.config.num_hidden_layers):
            attention_output = self.attention(hidden_states, attention_mask)[0]
            ffn_output = self.ffn(attention_output)
            ffn_output = gelu_new(ffn_output)
            ffn_output = self.ffn_output(ffn_output)
            hidden_states = self.layer_norm(ffn_output + attention_output)
        return hidden_states
def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
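# Illustrative sanity check (hypothetical helper, assumed sample points): the
# tanh formula above closely approximates the exact GELU,
# x * 0.5 * (1 + erf(x / sqrt(2))); over [-3, 3] the two stay well within 1e-2
# of each other.
def _check_gelu_approximation():
    x = torch.linspace(-3.0, 3.0, steps=7)
    exact = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return torch.max(torch.abs(exact - gelu_new(x)))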
class AlbertEncoder(nn.Module):
    def __init__(self, config):
        super(AlbertEncoder, self).__init__()

        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.transformer = AlbertTransformer(config)

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
        hidden_states = self.transformer(hidden_states, attention_mask, head_mask)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
# config = AlbertConfig.from_json_file("config.json")
# # model = AlbertForMaskedLM(config)
# model = AlbertModel(config)
# model = load_tf_weights_in_albert(model, config, "albert/albert")
# print(model)
# input_ids = torch.tensor([[31, 51, 99], [15, 5, 0]])
# input_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])
# segment_ids = torch.tensor([[0, 0, 1], [0, 0, 0]])
# # sequence_output, pooled_outputs = model()
# logits = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids)[1]
# embeddings_output =
# print("pooled output", logits)
# # print("Pooled output", pooled_outputs)


if __name__ == "__main__":
    # Manual smoke test; the paths below are machine-specific.
    config = AlbertConfig.from_json_file("/home/hf/google-research/albert/config.json")
    model = AlbertModel(config)
    model = load_tf_weights_in_albert(model, config, "/home/hf/transformers/albert/albert")
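A minimal forward-pass sketch with randomly initialized weights, assuming the modules above are importable from this branch and that `PretrainedConfig` supplies the `output_attentions` / `output_hidden_states` defaults read by the encoder. The dimensions are chosen to satisfy the hard-coded 16 x 64 x 1024 reshape in AlbertAttention; the vocabulary size and token ids are illustrative.

import torch

from transformers.configuration_albert import AlbertConfig
from transformers.modeling_albert import AlbertModel

# 16 heads of size 64 over a 1024-dim hidden state, matching the reshape in AlbertAttention.
config = AlbertConfig(
    vocab_size_or_config_json_file=30000,
    embedding_size=128,
    hidden_size=1024,
    num_hidden_layers=2,
    num_attention_heads=16,
    intermediate_size=4096,
)

model = AlbertModel(config)
input_ids = torch.tensor([[31, 51, 99], [15, 5, 0]])
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])

sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)[:2]
print(sequence_output.shape)  # torch.Size([2, 3, 1024])
print(pooled_output.shape)    # torch.Size([2, 1024])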