chenpangpang / transformers
Commit c0c20883: "ALBERT model"
Authored Oct 29, 2019 by Lysandre; committed Nov 26, 2019 by Lysandre Debut
Parent: 8e5d84fc
Showing 3 changed files with 447 additions and 0 deletions:
transformers/configuration_albert.py                              +72   -0
transformers/convert_albert_original_tf_checkpoint_to_pytorch.py  +44   -0
transformers/modeling_albert.py                                   +331  -0
transformers/configuration_albert.py  (new file, 0 → 100644)
from .configuration_utils import PretrainedConfig


class AlbertConfig(PretrainedConfig):
    """Configuration for `AlbertModel`.

    The default settings match the configuration of model `albert_xxlarge`.
    """

    def __init__(self,
                 vocab_size_or_config_json_file,
                 embedding_size=128,
                 hidden_size=4096,
                 num_hidden_layers=12,
                 num_hidden_groups=1,
                 num_attention_heads=64,
                 intermediate_size=16384,
                 inner_group_num=1,
                 down_scale_factor=1,
                 hidden_act="gelu",
                 hidden_dropout_prob=0,
                 attention_probs_dropout_prob=0,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12,
                 **kwargs):
        """Constructs AlbertConfig.

        Args:
            vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
            embedding_size: Size of the vocabulary embeddings.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_hidden_groups: Number of groups for the hidden layers; parameters in
                the same group are shared.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            inner_group_num: int, number of inner repetitions of attention and FFN.
            down_scale_factor: float, the scale to apply.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512, 1024, or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `AlbertModel`.
            initializer_range: The stdev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        super(AlbertConfig, self).__init__(**kwargs)

        self.vocab_size = vocab_size_or_config_json_file
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_hidden_groups = num_hidden_groups
        self.num_attention_heads = num_attention_heads
        self.inner_group_num = inner_group_num
        self.down_scale_factor = down_scale_factor
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
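
For quick reference (not part of this diff), a minimal usage sketch. It assumes the module is importable as transformers.configuration_albert, the path used by modeling_albert.py below, and that 30000 is the size of the ALBERT SentencePiece vocabulary; both are assumptions, not values from this commit.

from transformers.configuration_albert import AlbertConfig

# vocab_size_or_config_json_file is the only required argument; 30000 is an assumed vocabulary size.
config = AlbertConfig(30000)
print(config.embedding_size, config.hidden_size)             # 128 4096 (albert_xxlarge defaults)
print(config.num_hidden_layers, config.num_attention_heads)  # 12 64
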
transformers/convert_albert_original_tf_checkpoint_to_pytorch.py  (new file, 0 → 100644)
import argparse

import torch

from transformers import AlbertConfig, BertForPreTraining, load_tf_weights_in_bert


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the TensorFlow checkpoint.")
    parser.add_argument("--albert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained ALBERT model.\n"
                             "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.albert_config_file,
                                     args.pytorch_dump_path)
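
A hypothetical invocation of the script; all paths and file names below are placeholders, not values from this commit.

python transformers/convert_albert_original_tf_checkpoint_to_pytorch.py \
    --tf_checkpoint_path /path/to/albert/model.ckpt-best \
    --albert_config_file /path/to/albert/albert_config.json \
    --pytorch_dump_path /path/to/output/pytorch_model.bin
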
transformers/modeling_albert.py  (new file, 0 → 100644)
import os
import math
import logging

import torch
import torch.nn as nn

from transformers.configuration_albert import AlbertConfig

logger = logging.getLogger(__name__)
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model."""
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    print(model)
    for name, array in zip(names, arrays):
        og = name
        name = name.replace("transformer/group_0/inner_group_0", "transformer")
        name = name.replace("LayerNorm", "layer_norm")
        name = name.replace("ffn_1", "ffn")
        name = name.replace("ffn/intermediate/output", "ffn_output")
        name = name.replace("attention_1", "attention")
        name = name.replace("cls/predictions/transform", "predictions")
        name = name.replace("transformer/layer_norm_1", "transformer/attention/output/LayerNorm")
        name = name.split('/')
        print(name)

        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, l[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]

        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
            print("transposed")

        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {} from {}".format(name, og))
        pointer.data = torch.from_numpy(array)
    return model
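
To make the renaming step above concrete, here is a standalone sketch that applies the same chain of replacements to one illustrative variable name. The input name is hypothetical (of the shape suggested by the replace patterns); names in a real ALBERT TF checkpoint may differ.

# Illustrative walk-through of the string rewriting above (hypothetical TF variable name).
name = "bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/kernel"
for old, new in [
    ("transformer/group_0/inner_group_0", "transformer"),
    ("LayerNorm", "layer_norm"),
    ("ffn_1", "ffn"),
    ("ffn/intermediate/output", "ffn_output"),
    ("attention_1", "attention"),
    ("cls/predictions/transform", "predictions"),
    ("transformer/layer_norm_1", "transformer/attention/output/LayerNorm"),
]:
    name = name.replace(old, new)
print(name.split("/"))
# ['bert', 'encoder', 'transformer', 'ffn_output', 'dense', 'kernel']
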
class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(AlbertEmbeddings, self).__init__()

        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        self.layer_norm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        word_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = word_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def get_word_embeddings_table(self):
        return self.word_embeddings
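
Note that all three lookups above use config.embedding_size (128 by default), not hidden_size; the projection up to hidden_size happens later, in AlbertEncoder.embedding_hidden_mapping_in. A torch-only shape sketch with the default sizes (the 30000 vocabulary size is an assumption):

import torch
import torch.nn as nn

vocab_size, embedding_size, max_positions, type_vocab = 30000, 128, 512, 2
word = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
position = nn.Embedding(max_positions, embedding_size)
token_type = nn.Embedding(type_vocab, embedding_size)

input_ids = torch.randint(0, vocab_size, (2, 16))                   # (batch, seq_len)
position_ids = torch.arange(16).unsqueeze(0).expand_as(input_ids)
token_type_ids = torch.zeros_like(input_ids)

embeddings = word(input_ids) + position(position_ids) + token_type(token_type_ids)
print(embeddings.shape)  # torch.Size([2, 16, 128]) -- embedding_size, not hidden_size
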
class AlbertModel(nn.Module):
    def __init__(self, config):
        super(AlbertModel, self).__init__()

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertEncoder(config)
        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
        self.pooler_activation = nn.Tanh()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers

        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
        sequence_output = encoder_outputs[0]
        print(sequence_output.shape, sequence_output[:, 0].shape, self.pooler(sequence_output[:, 0]).shape)
        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))

        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
        return outputs
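
The extended_attention_mask built above turns a 0/1 padding mask into an additive bias on the attention scores: real tokens contribute 0, padding contributes a large negative value, so padding receives near-zero weight after the softmax. A torch-only sketch of that transformation:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])             # 1 = real token, 0 = padding
extended = attention_mask.unsqueeze(1).unsqueeze(2).float()  # (batch, 1, 1, seq_len)
extended = (1.0 - extended) * -10000.0
print(extended)  # 0 for real tokens, -10000.0 for padding; added to scores before softmax
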
class AlbertForMaskedLM(nn.Module):
    def __init__(self, config):
        super(AlbertForMaskedLM, self).__init__()

        self.config = config
        self.bert = AlbertModel(config)
        self.layer_norm = nn.LayerNorm(config.embedding_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.classifier.word_embeddings,
                                   self.transformer.embeddings.word_embeddings)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        hidden_states = self.bert(input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids,
                                  position_ids=position_ids,
                                  head_mask=head_mask)[0]
        hidden_states = self.dense(hidden_states)
        hidden_states = gelu_new(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        logits = self.word_embeddings(hidden_states)

        return logits
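
The head above first down-projects hidden states to embedding_size before the vocabulary projection, mirroring ALBERT's factorized embedding parameterization. A rough, torch-only sketch of the parameter savings, using the albert_xxlarge defaults and an assumed 30000-token vocabulary:

import torch.nn as nn

hidden_size, embedding_size, vocab_size = 4096, 128, 30000  # vocab size is an assumption

factorized = nn.Sequential(nn.Linear(hidden_size, embedding_size),   # like self.dense
                           nn.Linear(embedding_size, vocab_size))    # like self.word_embeddings
direct = nn.Linear(hidden_size, vocab_size)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(factorized), count(direct))  # roughly 4.4M vs 123M parameters
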
class AlbertAttention(nn.Module):
    def __init__(self, config):
        super(AlbertAttention, self).__init__()

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))

        self.output_attentions = config.output_attentions
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, input_ids, attention_mask=None, head_mask=None):
        mixed_query_layer = self.query(input_ids)
        mixed_key_layer = self.key(input_ids)
        mixed_value_layer = self.value(input_ids)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in AlbertModel's forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        reshaped_context_layer = context_layer.view(*new_context_layer_shape)

        w = self.dense.weight.T.view(16, 64, 1024)
        b = self.dense.bias

        projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
        projected_context_layer = self.dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer)
        return (layernormed_context_layer, projected_context_layer, reshaped_context_layer,
                context_layer, attention_scores, attention_probs, attention_mask)
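
The output projection above is written as an einsum over the dense weight reshaped to (num_heads, head_size, hidden_size); note that the 16 x 64 x 1024 shape is hard-coded rather than derived from the config. A torch-only check, with small stand-in sizes, that this contraction matches applying self.dense to the flattened heads:

import torch
import torch.nn as nn

batch, seq_len, num_heads, head_size = 2, 5, 4, 8
hidden = num_heads * head_size

dense = nn.Linear(hidden, hidden)
context = torch.randn(batch, seq_len, num_heads, head_size)  # (b, f, n, d), as after the permute above

w = dense.weight.T.view(num_heads, head_size, hidden)        # (n, d, h), mirrors the hard-coded view
via_einsum = torch.einsum("bfnd,ndh->bfh", context, w) + dense.bias
via_linear = dense(context.reshape(batch, seq_len, hidden))

print(torch.allclose(via_einsum, via_linear, atol=1e-5))     # True (up to float rounding)
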
class AlbertTransformer(nn.Module):
    def __init__(self, config):
        super(AlbertTransformer, self).__init__()

        self.config = config
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = AlbertAttention(config)
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        for i in range(self.config.num_hidden_layers):
            attention_output = self.attention(hidden_states, attention_mask)[0]
            ffn_output = self.ffn(attention_output)
            ffn_output = gelu_new(ffn_output)
            ffn_output = self.ffn_output(ffn_output)
            hidden_states = self.layer_norm(ffn_output + attention_output)

        return hidden_states
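
The loop above reuses the single self.attention / self.ffn / self.ffn_output block for every one of the num_hidden_layers iterations, which is ALBERT's cross-layer parameter sharing. A torch-only illustration with toy sizes (not ALBERT's) of why the parameter count stays flat as the number of layers grows:

import torch.nn as nn

hidden, intermediate, num_layers = 64, 256, 12
shared_block = nn.Sequential(nn.Linear(hidden, intermediate), nn.Linear(intermediate, hidden))
unshared_blocks = nn.ModuleList(
    nn.Sequential(nn.Linear(hidden, intermediate), nn.Linear(intermediate, hidden))
    for _ in range(num_layers)
)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(shared_block), count(unshared_blocks))  # the unshared stack is num_layers times larger
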
def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
class AlbertEncoder(nn.Module):
    def __init__(self, config):
        super(AlbertEncoder, self).__init__()

        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.transformer = AlbertTransformer(config)

    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)
        hidden_states = self.transformer(hidden_states, attention_mask, head_mask)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)

# config = AlbertConfig.from_json_file("config.json")
# # model = AlbertForMaskedLM(config)
# model = AlbertModel(config)
# model = load_tf_weights_in_albert(model, config, "albert/albert")
# print(model)
# input_ids = torch.tensor([[31, 51, 99], [15, 5, 0]])
# input_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])
# segment_ids = torch.tensor([[0, 0, 1], [0, 0, 0]])
# # sequence_output, pooled_outputs = model()
# logits = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids)[1]
# embeddings_output =
# print("pooled output", logits)
# # print("Pooled output", pooled_outputs)
config = AlbertConfig.from_json_file("/home/hf/google-research/albert/config.json")
model = AlbertModel(config)
model = load_tf_weights_in_albert(model, config, "/home/hf/transformers/albert/albert")