Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
8aa22af0
Commit
8aa22af0
authored
Nov 03, 2018
by
thomwolf
Browse files
fixing model
parent
38f740a1
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
265 additions
and
161 deletions
+265
-161
Comparing TF and PT models.ipynb
Comparing TF and PT models.ipynb
+225
-132
extract_features_pytorch.py
extract_features_pytorch.py
+11
-9
modeling_pytorch.py
modeling_pytorch.py
+29
-20
No files found.
Comparing TF and PT models.ipynb
View file @
8aa22af0
This diff is collapsed.
Click to expand it.
extract_features_pytorch.py
View file @
8aa22af0
...
...
@@ -268,29 +268,31 @@ def main():
input_mask
=
input_mask
.
float
().
to
(
device
)
all_encoder_layers
,
_
=
model
(
input_ids
,
token_type_ids
=
None
,
attention_mask
=
input_mask
)
all_encoder_layers
=
all_encoder_layers
for
enc_layers
,
example_index
in
zip
(
all_encoder_layers
,
example_indices
):
for
b
,
example_index
in
enumerate
(
example_indices
):
feature
=
features
[
example_index
.
item
()]
unique_id
=
int
(
feature
.
unique_id
)
# feature = unique_id_to_feature[unique_id]
output_json
=
collections
.
OrderedDict
()
output_json
[
"linex_index"
]
=
unique_id
all_features
=
[]
all_
out_
features
=
[]
for
(
i
,
token
)
in
enumerate
(
feature
.
tokens
):
all_layers
=
[]
for
(
j
,
layer_index
)
in
enumerate
(
layer_indexes
):
layer_output
=
enc_layers
[
int
(
layer_index
)].
detach
().
cpu
().
numpy
()
layer_output
=
all_encoder_layers
[
int
(
layer_index
)].
detach
().
cpu
().
numpy
()
layer_output
=
layer_output
[
b
]
layers
=
collections
.
OrderedDict
()
layers
[
"index"
]
=
layer_index
layers
[
"values"
]
=
[
round
(
float
(
x
),
6
)
for
x
in
layer_output
[
i
:(
i
+
1
)].
flat
round
(
x
.
item
(
),
6
)
for
x
in
layer_output
[
i
]
]
all_layers
.
append
(
layers
)
features
=
collections
.
OrderedDict
()
features
[
"token"
]
=
token
features
[
"layers"
]
=
all_layers
all_features
.
append
(
features
)
output_json
[
"features"
]
=
all_features
out_
features
=
collections
.
OrderedDict
()
out_
features
[
"token"
]
=
token
out_
features
[
"layers"
]
=
all_layers
all_
out_
features
.
append
(
out_
features
)
output_json
[
"features"
]
=
all_
out_
features
writer
.
write
(
json
.
dumps
(
output_json
)
+
"
\n
"
)
...
...
modeling_pytorch.py
View file @
8aa22af0
...
...
@@ -27,8 +27,9 @@ import torch.nn as nn
from
torch.nn
import
CrossEntropyLoss
def
gelu
(
x
):
return
0.5
*
(
1.0
+
torch
.
erf
(
x
/
math
.
sqrt
(
2.0
)))
# OpenAI GPT gelu version was : 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
return
x
*
0.5
*
(
1.0
+
torch
.
erf
(
x
/
math
.
sqrt
(
2.0
)))
# OpenAI GPT gelu version :
# return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
class
BertConfig
(
object
):
...
...
@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
words_embeddings
=
self
.
word_embeddings
(
input_ids
)
position_embeddings
=
self
.
position_embeddings
(
position_ids
)
token_type_embeddings
=
self
.
token_type_embeddings
(
token_type_ids
)
embeddings
=
words_embeddings
+
position_embeddings
+
token_type_embeddings
embeddings
=
self
.
LayerNorm
(
embeddings
)
embeddings
=
self
.
dropout
(
embeddings
)
...
...
@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
# T = `to_tensor` sequence length
# N = `num_attention_heads`
# H = `size_per_head`
query_layer
=
self
.
query
(
hidden_states
)
key_layer
=
self
.
key
(
hidden_states
)
value_layer
=
self
.
value
(
hidden_states
)
mixed_
query_layer
=
self
.
query
(
hidden_states
)
mixed_
key_layer
=
self
.
key
(
hidden_states
)
mixed_
value_layer
=
self
.
value
(
hidden_states
)
query_layer
=
self
.
transpose_for_scores
(
query_layer
)
key_layer
=
self
.
transpose_for_scores
(
key_layer
,
is_key_tensor
=
True
)
value_layer
=
self
.
transpose_for_scores
(
value_layer
)
query_layer
=
self
.
transpose_for_scores
(
mixed_
query_layer
)
key_layer
=
self
.
transpose_for_scores
(
mixed_
key_layer
)
#
, is_key_tensor=True)
value_layer
=
self
.
transpose_for_scores
(
mixed_
value_layer
)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
# `attention_scores` = [B, N, F, T]
attention_scores
=
torch
.
matmul
(
query_layer
,
key_layer
)
attention_scores
=
attention_scores
/
math
.
sqrt
(
self
.
attention_head_size
)
attention_scores
_no_norm
=
torch
.
matmul
(
query_layer
,
key_layer
.
transpose
(
-
1
,
-
2
)
)
attention_scores
_no_mask
=
attention_scores
_no_norm
/
math
.
sqrt
(
self
.
attention_head_size
)
# TODO clean up this (precompute)
# MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
...
...
@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
# adder = (1.0 - attention_mask) * -10000.0
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_scores
+
=
attention_mask
attention_scores
=
attention_scores_no_mask
+
attention_mask
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, N, F, T]
attention_probs
=
nn
.
Softmax
(
dim
=-
1
)(
attention_scores
)
attention_probs
_no_drop
=
nn
.
Softmax
(
dim
=-
1
)(
attention_scores
)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs
=
self
.
dropout
(
attention_probs
)
attention_probs
=
self
.
dropout
(
attention_probs
_no_drop
)
context_layer
=
torch
.
matmul
(
attention_probs
,
value_layer
)
context_layer
=
context_layer
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
new_context_layer_shape
=
context_layer
.
size
()[:
-
2
]
+
(
self
.
all_head_size
,)
context_layer
=
context_layer
.
view
(
*
new_context_layer_shape
)
# aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
# aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
return
context_layer
...
...
@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
self
.
dropout
=
nn
.
Dropout
(
config
.
hidden_dropout_prob
)
def
forward
(
self
,
hidden_states
,
input_tensor
):
hidden_states
=
self
.
dense
(
input_tensor
)
hidden_states
=
self
.
dense
(
hidden_states
)
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
return
hidden_states
...
...
@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
self
.
output
=
BERTSelfOutput
(
config
)
def
forward
(
self
,
input_tensor
,
attention_mask
):
attention
_output
=
self
.
self
(
input_tensor
,
attention_mask
)
attention_output
=
self
.
output
(
attention
_output
,
input_tensor
)
self
_output
=
self
.
self
(
input_tensor
,
attention_mask
)
attention_output
=
self
.
output
(
self
_output
,
input_tensor
)
return
attention_output
...
...
@@ -388,13 +394,16 @@ class BertModel(nn.Module):
if
token_type_ids
is
None
:
token_type_ids
=
torch
.
zeros_like
(
input_ids
)
attention_mask
=
attention_mask
.
unsqueeze
(
1
).
unsqueeze
(
2
)
attention_mask
=
(
1.0
-
attention_mask
)
*
-
10000.0
extended_
attention_mask
=
attention_mask
.
unsqueeze
(
1
).
unsqueeze
(
2
)
extended_
attention_mask
=
(
1.0
-
extended_
attention_mask
)
*
-
10000.0
embedding_output
=
self
.
embeddings
(
input_ids
,
token_type_ids
)
all_encoder_layers
=
self
.
encoder
(
embedding_output
,
attention_mask
)
all_encoder_layers
=
self
.
encoder
(
embedding_output
,
extended_
attention_mask
)
sequence_output
=
all_encoder_layers
[
-
1
]
pooled_output
=
self
.
pooler
(
sequence_output
)
# TODO DEbugging
# all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
return
all_encoder_layers
,
pooled_output
class
BertForSequenceClassification
(
nn
.
Module
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment