chenpangpang / transformers / Commits / 1d232400
"docs/source/en/vscode:/vscode.git/clone" did not exist on "6c5bd2a38d2d07603f17d60e87c009710d251990"
Commit 1d232400 authored Aug 27, 2019 by VictorSanh: "wip"
Parent: e08c01aa
Showing 1 changed file with 375 additions and 0 deletions.
pytorch_transformers/modeling_dilbert.py (new file, mode 100644)
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
PyTorch DilBERT model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import copy
import itertools
import json
import logging
import math
import sys
from io import open

import numpy as np
import torch
import torch.nn as nn

from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings

logger = logging.getLogger(__name__)
DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'dilbert-base-uncased': None,  # TODO(Victor)
}

DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'dilbert-base-uncased': None,  # TODO(Victor)
}
class DilBertConfig(PretrainedConfig):
    pretrained_config_archive_map = DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 max_position_embeddings=512,
                 sinusoidal_pos_embds=True,
                 n_layers=6,
                 n_heads=12,
                 dim=768,
                 hidden_dim=4*768,
                 dropout=0.1,
                 attention_dropout=0.1,
                 activation='gelu',
                 initializer_range=0.02,
                 tie_weights=True,
                 qa_dropout=0.1,
                 **kwargs):
        super(DilBertConfig, self).__init__(**kwargs)

        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.max_position_embeddings = max_position_embeddings
            self.sinusoidal_pos_embds = sinusoidal_pos_embds
            self.n_layers = n_layers
            self.n_heads = n_heads
            self.dim = dim
            self.hidden_dim = hidden_dim    # read by FFN below
            self.dropout = dropout
            self.attention_dropout = attention_dropout
            self.activation = activation
            self.initializer_range = initializer_range
            self.tie_weights = tie_weights
            self.qa_dropout = qa_dropout    # read by DilBertForQuestionAnswering below
        else:
            raise ValueError("First argument must be either a vocabulary size (int) "
                             "or the path to a pretrained model config file (str)")
def gelu(x):
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
def create_sinusoidal_embeddings(n_pos, dim, out):
    position_enc = np.array([
        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
        for pos in range(n_pos)
    ])
    # Freeze the table before filling it in place: in-place writes on a leaf
    # tensor that still requires grad would raise a RuntimeError.
    out.requires_grad = False
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
    out.detach_()
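# A minimal sketch of the layout produced above, assuming a small 10x16 table:
# even-indexed columns hold sin values and odd-indexed columns hold cos values.
# The helper name `_check_sinusoidal_table` is illustrative only and is not part
# of the module's API.
def _check_sinusoidal_table():
    weight = torch.zeros(10, 16)
    create_sinusoidal_embeddings(n_pos=10, dim=16, out=weight)
    # At position 0 the angles are all zero: sin(0) = 0, cos(0) = 1.
    assert torch.allclose(weight[0, 0::2], torch.zeros(8))
    assert torch.allclose(weight[0, 1::2], torch.ones(8))
    return weight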
class Embeddings(nn.Module):
    def __init__(self, config):
        super(Embeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
        if config.sinusoidal_pos_embds:
            create_sinusoidal_embeddings(n_pos=config.max_position_embeddings,
                                         dim=config.dim,
                                         out=self.position_embeddings.weight)

        self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, input_ids):
        """
        Parameters
        ----------
        input_ids: torch.tensor(bs, max_seq_length)
            The token ids to embed.
        """
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)                       # (bs, max_seq_length)

        word_embeddings = self.word_embeddings(input_ids)              # (bs, max_seq_length, dim)
        position_embeddings = self.position_embeddings(position_ids)   # (bs, max_seq_length, dim)

        embeddings = word_embeddings + position_embeddings   # (bs, max_seq_length, dim)
        embeddings = self.LayerNorm(embeddings)               # (bs, max_seq_length, dim)
        embeddings = self.dropout(embeddings)                 # (bs, max_seq_length, dim)
        return embeddings
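# A minimal usage sketch for the embedding layer above, assuming a stand-in
# config object: the SimpleNamespace fields mirror only the attributes that
# Embeddings reads, and the small sizes are illustrative, not the real
# pretraining configuration. The helper name `_embeddings_sketch` is hypothetical.
def _embeddings_sketch():
    from types import SimpleNamespace
    cfg = SimpleNamespace(vocab_size=100, dim=16, max_position_embeddings=32,
                          sinusoidal_pos_embds=False,  # set True to fill the table via create_sinusoidal_embeddings
                          dropout=0.1)
    embeddings = Embeddings(cfg)
    input_ids = torch.randint(0, 100, (2, 5))   # (bs=2, max_seq_length=5)
    return embeddings(input_ids)                 # (2, 5, 16)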
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, config):
        super(MultiHeadSelfAttention, self).__init__()

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = nn.Dropout(p=config.attention_dropout)
        self.output_attentions = config.output_attentions

        assert self.dim % self.n_heads == 0

        self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
        self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)

    def forward(self, query: torch.tensor, key: torch.tensor, value: torch.tensor, mask: torch.tensor):
        """
        Classic self-attention, implemented from scratch rather than with PyTorch's
        built-in attention module.

        Parameters
        ----------
        query: torch.tensor(bs, seq_length, dim)
        key: torch.tensor(bs, seq_length, dim)
        value: torch.tensor(bs, seq_length, dim)
        mask: torch.tensor(bs, seq_length)

        Returns
        -------
        weights: torch.tensor(bs, n_heads, seq_length, seq_length)
            Attention weights
        context: torch.tensor(bs, seq_length, dim)
            Contextualized layer
        """
        bs, q_length, dim = query.size()
        k_length = key.size(1)
        assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
        assert key.size() == value.size()

        dim_per_head = dim // self.n_heads
        assert 2 <= mask.dim() <= 3
        causal = (mask.dim() == 3)
        mask_reshp = (bs, 1, 1, k_length)

        def shape(x):
            """ separate heads """
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x):
            """ group heads """
            return x.transpose(1, 2).contiguous().view(bs, -1, dim)

        q = shape(self.q_lin(query))    # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))      # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))    # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)                          # (bs, n_heads, q_length, dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))               # (bs, n_heads, q_length, k_length)
        mask = (mask == 0).view(mask_reshp).expand_as(scores)     # (bs, n_heads, q_length, k_length)
        scores.masked_fill_(mask, -float('inf'))                  # (bs, n_heads, q_length, k_length)

        weights = nn.Softmax(dim=-1)(scores)   # (bs, n_heads, q_length, k_length)
        weights = self.dropout(weights)        # (bs, n_heads, q_length, k_length)

        context = torch.matmul(weights, v)     # (bs, n_heads, q_length, dim_per_head)
        context = unshape(context)             # (bs, q_length, dim)
        context = self.out_lin(context)        # (bs, q_length, dim)

        if self.output_attentions:
            return context, weights
        else:
            return context
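# A minimal smoke-test sketch for the attention module above, assuming a
# stand-in config object: only the attributes MultiHeadSelfAttention reads are
# set, with illustrative sizes. The helper name `_attention_sketch` is hypothetical.
def _attention_sketch():
    from types import SimpleNamespace
    cfg = SimpleNamespace(n_heads=4, dim=32, attention_dropout=0.1, output_attentions=True)
    attention = MultiHeadSelfAttention(cfg)
    x = torch.rand(2, 5, 32)    # (bs=2, seq_length=5, dim=32)
    mask = torch.ones(2, 5)     # attend to every position
    context, weights = attention(query=x, key=x, value=x, mask=mask)
    return context, weights     # (2, 5, 32), (2, 4, 5, 5)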
class FFN(nn.Module):
    def __init__(self, config):
        super(FFN, self).__init__()
        self.dropout = nn.Dropout(p=config.dropout)
        self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
        self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
        assert config.activation in ['relu', 'gelu'], \
            ValueError(f"activation ({config.activation}) must be in ['relu', 'gelu']")
        self.activation = gelu if config.activation == 'gelu' else nn.ReLU()

    def forward(self, input: torch.tensor):
        x = self.lin1(input)       # (bs, seq_length, hidden_dim)
        x = self.activation(x)     # (bs, seq_length, hidden_dim)
        x = self.lin2(x)           # (bs, seq_length, dim)
        x = self.dropout(x)        # (bs, seq_length, dim)
        return x
class TransformerBlock(nn.Module):
    def __init__(self, config):
        super(TransformerBlock, self).__init__()

        self.n_heads = config.n_heads
        self.dim = config.dim
        self.hidden_dim = config.hidden_dim
        self.dropout = nn.Dropout(p=config.dropout)
        self.activation = config.activation
        self.output_attentions = config.output_attentions

        assert config.dim % config.n_heads == 0

        self.attention = MultiHeadSelfAttention(config)
        self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)

        self.ffn = FFN(config)
        self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)

    def forward(self, x: torch.tensor, attn_mask: torch.tensor = None):
        """
        Parameters
        ----------
        x: torch.tensor(bs, seq_length, dim)
        attn_mask: torch.tensor(bs, seq_length)
        """
        # Self-Attention
        sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask)
        if self.output_attentions:
            sa_output, sa_weights = sa_output              # (bs, seq_length, dim)
        sa_output = self.sa_layer_norm(sa_output + x)      # (bs, seq_length, dim)

        # Feed Forward Network
        ffn_output = self.ffn(sa_output)                                # (bs, seq_length, dim)
        ffn_output = self.output_layer_norm(ffn_output + sa_output)    # (bs, seq_length, dim)

        if self.output_attentions:
            return sa_weights, ffn_output
        else:
            return ffn_output
class Transformer(nn.Module):
    def __init__(self, config):
        super(Transformer, self).__init__()
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions

        layer = TransformerBlock(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])

    def forward(self, x: torch.tensor, attn_mask: torch.tensor = None, output_all_encoded_layers: bool = True):
        """
        Parameters
        ----------
        x: torch.tensor(bs, seq_length, dim)
        attn_mask: torch.tensor(bs, seq_length)
        output_all_encoded_layers: bool
        """
        all_encoder_layers = []
        all_attentions = []
        for _, layer_module in enumerate(self.layer):
            x = layer_module(x=x, attn_mask=attn_mask)
            if self.output_attentions:
                attentions, x = x
                all_attentions.append(attentions)
            all_encoder_layers.append(x)

        if not output_all_encoded_layers:
            all_encoder_layers = all_encoder_layers[-1]

        if self.output_attentions:
            return all_attentions, all_encoder_layers
        else:
            return all_encoder_layers
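# A minimal end-to-end sketch of the encoder stack above, assuming a stand-in
# config object with illustrative sizes (two layers, four heads). The helper
# name `_transformer_sketch` is hypothetical and not part of the module API.
def _transformer_sketch():
    from types import SimpleNamespace
    cfg = SimpleNamespace(n_layers=2, n_heads=4, dim=32, hidden_dim=64,
                          dropout=0.1, attention_dropout=0.1,
                          activation='gelu', output_attentions=False)
    encoder = Transformer(cfg)
    x = torch.rand(2, 5, 32)    # already-embedded input: (bs, seq_length, dim)
    mask = torch.ones(2, 5)
    layers = encoder(x=x, attn_mask=mask, output_all_encoded_layers=True)
    return layers[-1]           # (2, 5, 32)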
# TODO(Victor)
# class DilBertWithLMHeadModel(DilBertPreTrainedModel):
# class DilBertForSequenceClassification(DilBertPreTrainedModel):
# NOTE: DilBertPreTrainedModel and DilBertModel are not yet defined in this
# work-in-progress file.
class DilBertForQuestionAnswering(DilBertPreTrainedModel):
    def __init__(self, config):
        super(DilBertForQuestionAnswering, self).__init__(config)

        self.dilbert = DilBertModel(config)
        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
        assert config.num_labels == 2
        self.dropout = nn.Dropout(config.qa_dropout)

        self.apply(self.init_weights)

    def forward(self,
                input_ids: torch.tensor,
                attention_mask: torch.tensor = None,
                start_positions: torch.tensor = None,
                end_positions: torch.tensor = None):
        _, _, hidden_states = self.dilbert(input_ids=input_ids,
                                           attention_mask=attention_mask)   # _, _, (bs, max_query_len, dim)

        hidden_states = self.dropout(hidden_states)   # (bs, max_query_len, dim)
        logits = self.qa_outputs(hidden_states)       # (bs, max_query_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)       # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1)           # (bs, max_query_len)

        outputs = (start_logits, end_logits,) + (hidden_states,)
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, splitting adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, hidden_states