Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
8aa22af0
Commit
8aa22af0
authored
Nov 03, 2018
by
thomwolf
Browse files
fixing model
parent
38f740a1
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
265 additions
and
161 deletions
+265
-161
Comparing TF and PT models.ipynb
Comparing TF and PT models.ipynb
+225
-132
extract_features_pytorch.py
extract_features_pytorch.py
+11
-9
modeling_pytorch.py
modeling_pytorch.py
+29
-20
No files found.
Comparing TF and PT models.ipynb
View file @
8aa22af0
This diff is collapsed.
Click to expand it.
extract_features_pytorch.py
View file @
8aa22af0
...
@@ -268,29 +268,31 @@ def main():
...
@@ -268,29 +268,31 @@ def main():
input_mask
=
input_mask
.
float
().
to
(
device
)
input_mask
=
input_mask
.
float
().
to
(
device
)
all_encoder_layers
,
_
=
model
(
input_ids
,
token_type_ids
=
None
,
attention_mask
=
input_mask
)
all_encoder_layers
,
_
=
model
(
input_ids
,
token_type_ids
=
None
,
attention_mask
=
input_mask
)
all_encoder_layers
=
all_encoder_layers
for
enc_layers
,
example_index
in
zip
(
all_encoder_layers
,
example_indices
):
for
b
,
example_index
in
enumerate
(
example_indices
):
feature
=
features
[
example_index
.
item
()]
feature
=
features
[
example_index
.
item
()]
unique_id
=
int
(
feature
.
unique_id
)
unique_id
=
int
(
feature
.
unique_id
)
# feature = unique_id_to_feature[unique_id]
# feature = unique_id_to_feature[unique_id]
output_json
=
collections
.
OrderedDict
()
output_json
=
collections
.
OrderedDict
()
output_json
[
"linex_index"
]
=
unique_id
output_json
[
"linex_index"
]
=
unique_id
all_features
=
[]
all_
out_
features
=
[]
for
(
i
,
token
)
in
enumerate
(
feature
.
tokens
):
for
(
i
,
token
)
in
enumerate
(
feature
.
tokens
):
all_layers
=
[]
all_layers
=
[]
for
(
j
,
layer_index
)
in
enumerate
(
layer_indexes
):
for
(
j
,
layer_index
)
in
enumerate
(
layer_indexes
):
layer_output
=
enc_layers
[
int
(
layer_index
)].
detach
().
cpu
().
numpy
()
layer_output
=
all_encoder_layers
[
int
(
layer_index
)].
detach
().
cpu
().
numpy
()
layer_output
=
layer_output
[
b
]
layers
=
collections
.
OrderedDict
()
layers
=
collections
.
OrderedDict
()
layers
[
"index"
]
=
layer_index
layers
[
"index"
]
=
layer_index
layers
[
"values"
]
=
[
layers
[
"values"
]
=
[
round
(
float
(
x
),
6
)
for
x
in
layer_output
[
i
:(
i
+
1
)].
flat
round
(
x
.
item
(
),
6
)
for
x
in
layer_output
[
i
]
]
]
all_layers
.
append
(
layers
)
all_layers
.
append
(
layers
)
features
=
collections
.
OrderedDict
()
out_
features
=
collections
.
OrderedDict
()
features
[
"token"
]
=
token
out_
features
[
"token"
]
=
token
features
[
"layers"
]
=
all_layers
out_
features
[
"layers"
]
=
all_layers
all_features
.
append
(
features
)
all_
out_
features
.
append
(
out_
features
)
output_json
[
"features"
]
=
all_features
output_json
[
"features"
]
=
all_
out_
features
writer
.
write
(
json
.
dumps
(
output_json
)
+
"
\n
"
)
writer
.
write
(
json
.
dumps
(
output_json
)
+
"
\n
"
)
...
...
modeling_pytorch.py
View file @
8aa22af0
...
@@ -27,8 +27,9 @@ import torch.nn as nn
...
@@ -27,8 +27,9 @@ import torch.nn as nn
from
torch.nn
import
CrossEntropyLoss
from
torch.nn
import
CrossEntropyLoss
def
gelu
(
x
):
def
gelu
(
x
):
return
0.5
*
(
1.0
+
torch
.
erf
(
x
/
math
.
sqrt
(
2.0
)))
return
x
*
0.5
*
(
1.0
+
torch
.
erf
(
x
/
math
.
sqrt
(
2.0
)))
# OpenAI GPT gelu version was : 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
# OpenAI GPT gelu version :
# return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
class
BertConfig
(
object
):
class
BertConfig
(
object
):
...
@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
...
@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
words_embeddings
=
self
.
word_embeddings
(
input_ids
)
words_embeddings
=
self
.
word_embeddings
(
input_ids
)
position_embeddings
=
self
.
position_embeddings
(
position_ids
)
position_embeddings
=
self
.
position_embeddings
(
position_ids
)
token_type_embeddings
=
self
.
token_type_embeddings
(
token_type_ids
)
token_type_embeddings
=
self
.
token_type_embeddings
(
token_type_ids
)
embeddings
=
words_embeddings
+
position_embeddings
+
token_type_embeddings
embeddings
=
words_embeddings
+
position_embeddings
+
token_type_embeddings
embeddings
=
self
.
LayerNorm
(
embeddings
)
embeddings
=
self
.
LayerNorm
(
embeddings
)
embeddings
=
self
.
dropout
(
embeddings
)
embeddings
=
self
.
dropout
(
embeddings
)
...
@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
...
@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
# T = `to_tensor` sequence length
# T = `to_tensor` sequence length
# N = `num_attention_heads`
# N = `num_attention_heads`
# H = `size_per_head`
# H = `size_per_head`
query_layer
=
self
.
query
(
hidden_states
)
mixed_
query_layer
=
self
.
query
(
hidden_states
)
key_layer
=
self
.
key
(
hidden_states
)
mixed_
key_layer
=
self
.
key
(
hidden_states
)
value_layer
=
self
.
value
(
hidden_states
)
mixed_
value_layer
=
self
.
value
(
hidden_states
)
query_layer
=
self
.
transpose_for_scores
(
query_layer
)
query_layer
=
self
.
transpose_for_scores
(
mixed_
query_layer
)
key_layer
=
self
.
transpose_for_scores
(
key_layer
,
is_key_tensor
=
True
)
key_layer
=
self
.
transpose_for_scores
(
mixed_
key_layer
)
#
, is_key_tensor=True)
value_layer
=
self
.
transpose_for_scores
(
value_layer
)
value_layer
=
self
.
transpose_for_scores
(
mixed_
value_layer
)
# Take the dot product between "query" and "key" to get the raw
# Take the dot product between "query" and "key" to get the raw
# attention scores.
# attention scores.
# `attention_scores` = [B, N, F, T]
# `attention_scores` = [B, N, F, T]
attention_scores
=
torch
.
matmul
(
query_layer
,
key_layer
)
attention_scores
_no_norm
=
torch
.
matmul
(
query_layer
,
key_layer
.
transpose
(
-
1
,
-
2
)
)
attention_scores
=
attention_scores
/
math
.
sqrt
(
self
.
attention_head_size
)
attention_scores
_no_mask
=
attention_scores
_no_norm
/
math
.
sqrt
(
self
.
attention_head_size
)
# TODO clean up this (precompute)
# TODO clean up this (precompute)
# MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
# MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
...
@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
...
@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
# adder = (1.0 - attention_mask) * -10000.0
# adder = (1.0 - attention_mask) * -10000.0
# Since we are adding it to the raw scores before the softmax, this is
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
# effectively the same as removing these entirely.
attention_scores
+
=
attention_mask
attention_scores
=
attention_scores_no_mask
+
attention_mask
# Normalize the attention scores to probabilities.
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, N, F, T]
# `attention_probs` = [B, N, F, T]
attention_probs
=
nn
.
Softmax
(
dim
=-
1
)(
attention_scores
)
attention_probs
_no_drop
=
nn
.
Softmax
(
dim
=-
1
)(
attention_scores
)
# This is actually dropping out entire tokens to attend to, which might
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs
=
self
.
dropout
(
attention_probs
)
attention_probs
=
self
.
dropout
(
attention_probs
_no_drop
)
context_layer
=
torch
.
matmul
(
attention_probs
,
value_layer
)
context_layer
=
torch
.
matmul
(
attention_probs
,
value_layer
)
context_layer
=
context_layer
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
context_layer
=
context_layer
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
new_context_layer_shape
=
context_layer
.
size
()[:
-
2
]
+
(
self
.
all_head_size
,)
new_context_layer_shape
=
context_layer
.
size
()[:
-
2
]
+
(
self
.
all_head_size
,)
context_layer
=
context_layer
.
view
(
*
new_context_layer_shape
)
context_layer
=
context_layer
.
view
(
*
new_context_layer_shape
)
# aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
# aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
return
context_layer
return
context_layer
...
@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
...
@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
self
.
dropout
=
nn
.
Dropout
(
config
.
hidden_dropout_prob
)
self
.
dropout
=
nn
.
Dropout
(
config
.
hidden_dropout_prob
)
def
forward
(
self
,
hidden_states
,
input_tensor
):
def
forward
(
self
,
hidden_states
,
input_tensor
):
hidden_states
=
self
.
dense
(
input_tensor
)
hidden_states
=
self
.
dense
(
hidden_states
)
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
hidden_states
=
self
.
LayerNorm
(
hidden_states
+
input_tensor
)
return
hidden_states
return
hidden_states
...
@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
...
@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
self
.
output
=
BERTSelfOutput
(
config
)
self
.
output
=
BERTSelfOutput
(
config
)
def
forward
(
self
,
input_tensor
,
attention_mask
):
def
forward
(
self
,
input_tensor
,
attention_mask
):
attention
_output
=
self
.
self
(
input_tensor
,
attention_mask
)
self
_output
=
self
.
self
(
input_tensor
,
attention_mask
)
attention_output
=
self
.
output
(
attention
_output
,
input_tensor
)
attention_output
=
self
.
output
(
self
_output
,
input_tensor
)
return
attention_output
return
attention_output
...
@@ -388,13 +394,16 @@ class BertModel(nn.Module):
...
@@ -388,13 +394,16 @@ class BertModel(nn.Module):
if
token_type_ids
is
None
:
if
token_type_ids
is
None
:
token_type_ids
=
torch
.
zeros_like
(
input_ids
)
token_type_ids
=
torch
.
zeros_like
(
input_ids
)
attention_mask
=
attention_mask
.
unsqueeze
(
1
).
unsqueeze
(
2
)
extended_
attention_mask
=
attention_mask
.
unsqueeze
(
1
).
unsqueeze
(
2
)
attention_mask
=
(
1.0
-
attention_mask
)
*
-
10000.0
extended_
attention_mask
=
(
1.0
-
extended_
attention_mask
)
*
-
10000.0
embedding_output
=
self
.
embeddings
(
input_ids
,
token_type_ids
)
embedding_output
=
self
.
embeddings
(
input_ids
,
token_type_ids
)
all_encoder_layers
=
self
.
encoder
(
embedding_output
,
attention_mask
)
all_encoder_layers
=
self
.
encoder
(
embedding_output
,
extended_
attention_mask
)
sequence_output
=
all_encoder_layers
[
-
1
]
sequence_output
=
all_encoder_layers
[
-
1
]
pooled_output
=
self
.
pooler
(
sequence_output
)
pooled_output
=
self
.
pooler
(
sequence_output
)
# TODO DEbugging
# all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
return
all_encoder_layers
,
pooled_output
return
all_encoder_layers
,
pooled_output
class
BertForSequenceClassification
(
nn
.
Module
):
class
BertForSequenceClassification
(
nn
.
Module
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment