chenpangpang / transformers / Commits

Commit 1b925643, authored Oct 29, 2019 by Lysandre, committed by Lysandre Debut on Nov 26, 2019
parent 12290c0d

Reorganize and cleanup

Showing 1 changed file with 67 additions and 90 deletions:
transformers/modeling_albert.py (+67, -90)
@@ -100,77 +100,6 @@ class AlbertEmbeddings(BertEmbeddings):
         self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
 
-
-class AlbertModel(BertModel):
-    def __init__(self, config):
-        super(AlbertModel, self).__init__(config)
-
-        self.config = config
-        self.embeddings = AlbertEmbeddings(config)
-        self.encoder = AlbertTransformer(config)
-        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
-        self.pooler_activation = nn.Tanh()
-
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(input_ids)
-
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
-
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
-
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
-        return outputs
-
-
-class AlbertForMaskedLM(nn.Module):
-    def __init__(self, config):
-        super(AlbertForMaskedLM, self).__init__()
-
-        self.config = config
-        self.bert = AlbertModel(config)
-        self.LayerNorm = nn.LayerNorm(config.embedding_size)
-        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
-        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)
-
-    def tie_weights(self):
-        """ Make sure we are sharing the input and output embeddings.
-            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
-        """
-        self._tie_or_clone_weights(self.classifier.word_embeddings, self.transformer.embeddings.word_embeddings)
-
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        hidden_states = self.bert(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None)[0]
-        hidden_states = self.dense(hidden_states)
-        hidden_states = gelu_new(hidden_states)
-        hidden_states = self.LayerNorm(hidden_states)
-        logits = self.word_embeddings(hidden_states)
-        return logits
-
-
 class AlbertAttention(BertSelfAttention):
     def __init__(self, config):
         super(AlbertAttention, self).__init__(config)
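The forward pass deleted here (and re-added lower in the file) builds an additive attention mask with (1.0 - mask) * -10000.0, so padded positions receive a large negative bias before the softmax. A minimal standalone sketch of that arithmetic, with made-up tensor values and no dependence on the rest of this file:

import torch

# Toy padding mask for 2 sequences of length 4 (1 = real token, 0 = padding).
attention_mask = torch.tensor([[1., 1., 1., 0.],
                               [1., 1., 0., 0.]])

# Broadcast over the head and query dimensions, as in AlbertModel.forward.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)   # shape (2, 1, 1, 4)

# Real tokens map to 0.0 and padded positions to -10000.0; adding this to the raw
# attention scores drives padded positions toward zero weight after the softmax.
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
print(extended_attention_mask[0, 0, 0])   # zeros for real tokens, -10000. for the padded slot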
@@ -238,7 +167,9 @@ class AlbertAttention(BertSelfAttention):
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         reshaped_context_layer = context_layer.view(*new_context_layer_shape)
 
-        print(self.dense.weight.T.shape)
+        # Should find a better way to do this
         w = self.dense.weight.T.view(self.num_attention_heads, self.attention_head_size, self.hidden_size)
         b = self.dense.bias
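The hunk above reshapes the output projection's weight into per-head blocks (w has shape (num_attention_heads, attention_head_size, hidden_size)) so the dense projection can be applied head by head, e.g. via an einsum. The sketch below is not from the commit; it only checks, on random tensors with illustrative sizes, that such a per-head einsum matches the usual "concatenate heads, then nn.Linear" formulation:

import torch
from torch import nn

# Illustrative sizes only, not taken from any ALBERT config.
batch, seq_len, num_heads, head_size = 2, 5, 4, 8
hidden_size = num_heads * head_size

dense = nn.Linear(hidden_size, hidden_size)
context_layer = torch.randn(batch, seq_len, num_heads, head_size)

# Reshape the projection weight per head, mirroring w and b in the hunk above.
w = dense.weight.T.view(num_heads, head_size, hidden_size)
b = dense.bias

# Per-head projection via einsum ...
projected = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b

# ... equals flattening the heads and applying the linear layer directly.
flat = context_layer.reshape(batch, seq_len, hidden_size)
assert torch.allclose(projected, dense(flat), atol=1e-5)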
@@ -301,26 +232,72 @@ class AlbertTransformer(nn.Module):
         return (hidden_states,)
 
-# model_size = 'base'
-# hidden_groups = 1
-# inner_groups = 2
-# config = AlbertConfig.from_json_file("/home/hf/google-research/albert/config_{}-{}-hg-{}-ig.json".format(model_size, hidden_groups, inner_groups))
-# model = AlbertModel(config)
-# # print(model)
-# model = load_tf_weights_in_albert(model, config, "/home/hf/transformers/albert-{}-{}-hg-{}-ig/albert-{}-{}-hg-{}-ig".format(model_size, hidden_groups, inner_groups, model_size, hidden_groups, inner_groups))
-# # model.eval()
-# # print(sum(p.numel() for p in model.parameters() if p.requires_grad))
-# input_ids = [[31, 51, 99, 88, 54, 34, 23, 23, 12], [15, 5, 0, 88, 54, 34, 23, 23, 12]]
-# input_mask = [[1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0]]
-# segment_ids = [[0, 0, 1, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0]]
-# pt_input_ids = torch.tensor(input_ids)
-# pt_input_mask = torch.tensor(input_mask)
-# pt_segment_ids = torch.tensor(segment_ids)
-# pt_dict = {"input_ids": pt_input_ids, "attention_mask": pt_input_mask, "token_type_ids": pt_segment_ids}
-# pt_output = model(**pt_dict)
-# print(pt_output)
\ No newline at end of file
+
+class AlbertModel(BertModel):
+    def __init__(self, config):
+        super(AlbertModel, self).__init__(config)
+
+        self.config = config
+        self.embeddings = AlbertEmbeddings(config)
+        self.encoder = AlbertTransformer(config)
+        self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
+        self.pooler_activation = nn.Tanh()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
+
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        return outputs
+
+
+class AlbertForMaskedLM(nn.Module):
+    def __init__(self, config):
+        super(AlbertForMaskedLM, self).__init__()
+
+        self.config = config
+        self.bert = AlbertModel(config)
+        self.LayerNorm = nn.LayerNorm(config.embedding_size)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
+        self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size)
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.classifier.word_embeddings, self.transformer.embeddings.word_embeddings)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        hidden_states = self.bert(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None)[0]
+        hidden_states = self.dense(hidden_states)
+        hidden_states = gelu_new(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        logits = self.word_embeddings(hidden_states)
+        return logits
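After the move, AlbertForMaskedLM's prediction head is a hidden_size -> embedding_size -> vocab_size chain with gelu_new and LayerNorm in between, and tie_weights is meant to share (or, for TorchScript export, clone) the output projection with the input word embeddings. A standalone shape-level sketch of that head, with illustrative sizes and torch.nn.functional.gelu standing in for gelu_new:

import torch
from torch import nn
import torch.nn.functional as F

# Illustrative sizes only; real ALBERT configs differ (ALBERT keeps embedding_size
# smaller than hidden_size, e.g. 128 vs. 768).
batch, seq_len, hidden_size, embedding_size, vocab_size = 2, 7, 64, 16, 100

dense = nn.Linear(hidden_size, embedding_size)
layer_norm = nn.LayerNorm(embedding_size)
word_embeddings = nn.Linear(embedding_size, vocab_size)

# Stand-in for AlbertModel's sequence_output.
hidden_states = torch.randn(batch, seq_len, hidden_size)

# Same chain as AlbertForMaskedLM.forward: dense -> activation -> LayerNorm -> vocab projection.
hidden_states = dense(hidden_states)       # (batch, seq_len, embedding_size)
hidden_states = F.gelu(hidden_states)      # the file uses gelu_new; F.gelu is a close stand-in
hidden_states = layer_norm(hidden_states)
logits = word_embeddings(hidden_states)    # (batch, seq_len, vocab_size)
print(logits.shape)                        # torch.Size([2, 7, 100])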