Commit 8aa22af0 (chenpangpang/transformers)
Authored Nov 03, 2018 by thomwolf

fixing model

Parent 38f740a1
Showing 3 changed files with 265 additions and 161 deletions.
Comparing TF and PT models.ipynb   +225 -132
extract_features_pytorch.py        +11 -9
modeling_pytorch.py                +29 -20
Comparing TF and PT models.ipynb
This diff is collapsed.
extract_features_pytorch.py
@@ -268,29 +268,31 @@ def main():
             input_mask = input_mask.float().to(device)

             all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
+            all_encoder_layers = all_encoder_layers

-            for enc_layers, example_index in zip(all_encoder_layers, example_indices):
+            for b, example_index in enumerate(example_indices):
                 feature = features[example_index.item()]
                 unique_id = int(feature.unique_id)
                 # feature = unique_id_to_feature[unique_id]
                 output_json = collections.OrderedDict()
                 output_json["linex_index"] = unique_id
-                all_features = []
+                all_out_features = []
                 for (i, token) in enumerate(feature.tokens):
                     all_layers = []
                     for (j, layer_index) in enumerate(layer_indexes):
-                        layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
+                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
+                        layer_output = layer_output[b]
                         layers = collections.OrderedDict()
                         layers["index"] = layer_index
                         layers["values"] = [
-                            round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+                            round(x.item(), 6) for x in layer_output[i]
                         ]
                         all_layers.append(layers)
-                    features = collections.OrderedDict()
-                    features["token"] = token
-                    features["layers"] = all_layers
-                    all_features.append(features)
-                output_json["features"] = all_features
+                    out_features = collections.OrderedDict()
+                    out_features["token"] = token
+                    out_features["layers"] = all_layers
+                    all_out_features.append(out_features)
+                output_json["features"] = all_out_features
                 writer.write(json.dumps(output_json) + "\n")
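For readers following the change above, here is a small, self-contained sketch of the indexing pattern the new loop relies on: each encoder layer is assumed to be a [batch, seq_len, hidden] tensor, so an example's row is selected with the batch index b before slicing out token i. The tensors, tokens, sizes, and layer indexes below are made up for illustration; this is not the repository's code.

# Minimal sketch (illustrative only) of the per-example, per-token, per-layer collection.
import collections
import json

import torch

batch_size, seq_len, hidden = 2, 4, 8
num_layers = 3
# Hypothetical stand-in for the model's returned all_encoder_layers.
all_encoder_layers = [torch.randn(batch_size, seq_len, hidden) for _ in range(num_layers)]
layer_indexes = [-1, -2]
tokens = [["[CLS]", "hello", "world", "[SEP]"], ["[CLS]", "foo", "bar", "[SEP]"]]

for b in range(batch_size):
    output_json = collections.OrderedDict()
    output_json["linex_index"] = b
    all_out_features = []
    for i, token in enumerate(tokens[b]):
        all_layers = []
        for layer_index in layer_indexes:
            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
            layer_output = layer_output[b]  # select this example's row from the batch
            layers = collections.OrderedDict()
            layers["index"] = layer_index
            layers["values"] = [round(x.item(), 6) for x in layer_output[i]]
            all_layers.append(layers)
        out_features = collections.OrderedDict()
        out_features["token"] = token
        out_features["layers"] = all_layers
        all_out_features.append(out_features)
    output_json["features"] = all_out_features
    print(json.dumps(output_json)[:80] + "...")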
modeling_pytorch.py
@@ -27,8 +27,9 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss


 def gelu(x):
-    return 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
-    # OpenAI GPT gelu version was : 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+    # OpenAI GPT gelu version :
+    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


 class BertConfig(object):
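The fix above adds the missing factor of x, turning the activation into the exact erf-based GELU, x * Phi(x). The snippet below is not part of the commit; it simply contrasts that form with the tanh approximation quoted in the comment, using helper names of our own.

# Small check comparing the exact erf-based GELU with the tanh approximation.
import math

import torch

def gelu_erf(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF.
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    # OpenAI GPT-style tanh approximation.
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

x = torch.linspace(-3.0, 3.0, steps=7)
print(gelu_erf(x))
print(gelu_tanh(x))
# The two forms stay close over this range (difference on the order of 1e-3 or less).
print((gelu_erf(x) - gelu_tanh(x)).abs().max())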
@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
         words_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)

         embeddings = words_embeddings + position_embeddings + token_type_embeddings
         embeddings = self.LayerNorm(embeddings)
         embeddings = self.dropout(embeddings)
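As a reference for the context lines above, here is a stripped-down, self-contained sketch of the embedding sum: word, position, and token-type embeddings are added, then normalized and dropped out. The sizes are made up, and plain nn.LayerNorm stands in for whatever self.LayerNorm is in the module.

# Illustrative sketch of the BERT-style embedding sum (not the module's code).
import torch
import torch.nn as nn

vocab_size, max_position, type_vocab, hidden = 100, 16, 2, 8

word_embeddings = nn.Embedding(vocab_size, hidden)
position_embeddings = nn.Embedding(max_position, hidden)
token_type_embeddings = nn.Embedding(type_vocab, hidden)
layer_norm = nn.LayerNorm(hidden)
dropout = nn.Dropout(0.1)

input_ids = torch.tensor([[5, 7, 9, 2]])
token_type_ids = torch.zeros_like(input_ids)
position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)

embeddings = (word_embeddings(input_ids)
              + position_embeddings(position_ids)
              + token_type_embeddings(token_type_ids))
embeddings = dropout(layer_norm(embeddings))
print(embeddings.shape)  # torch.Size([1, 4, 8])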
@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
         # T = `to_tensor` sequence length
         # N = `num_attention_heads`
         # H = `size_per_head`
-        query_layer = self.query(hidden_states)
-        key_layer = self.key(hidden_states)
-        value_layer = self.value(hidden_states)
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)

-        query_layer = self.transpose_for_scores(query_layer)
-        key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True)
-        value_layer = self.transpose_for_scores(value_layer)
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer) #, is_key_tensor=True)
+        value_layer = self.transpose_for_scores(mixed_value_layer)

         # Take the dot product between "query" and "key" to get the raw
         # attention scores.
         # `attention_scores` = [B, N, F, T]
-        attention_scores = torch.matmul(query_layer, key_layer)
-        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        attention_scores_no_norm = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores_no_mask = attention_scores_no_norm / math.sqrt(self.attention_head_size)

         # TODO clean up this (precompute)
         # MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
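A shape-only sketch of why the key now needs .transpose(-1, -2): after the heads are split, query and key are both [B, N, F, H], so the key's last two dimensions must be swapped for the matmul to produce [B, N, F, T] scores. The sizes are assumed, and the split_heads helper below is a hypothetical stand-in for transpose_for_scores.

# Shape check (illustrative only), B = batch, F/T = sequence length, N = heads, H = size per head.
import torch

B, F, N, H = 2, 5, 4, 8
hidden = N * H

def split_heads(x):
    # [B, F, N*H] -> [B, N, F, H]
    return x.view(B, -1, N, H).permute(0, 2, 1, 3)

query_layer = split_heads(torch.randn(B, F, hidden))
key_layer = split_heads(torch.randn(B, F, hidden))

# Scores need shape [B, N, F, T], so the key is transposed on its last two dims.
scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
print(scores.shape)  # torch.Size([2, 4, 5, 5])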
@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
         # adder = (1.0 - attention_mask) * -10000.0
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        attention_scores += attention_mask
+        attention_scores = attention_scores_no_mask + attention_mask

         # Normalize the attention scores to probabilities.
         # `attention_probs` = [B, N, F, T]
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs_no_drop = nn.Softmax(dim=-1)(attention_scores)

         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
+        attention_probs = self.dropout(attention_probs_no_drop)

         context_layer = torch.matmul(attention_probs, value_layer)
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)
+        # aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
+        # aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
+        # aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
+        # aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
         return context_layer
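The following sketch (assumed shapes and values, not the commit's code) walks through the remaining steps: the additive mask is summed into the raw scores, softmax turns them into probabilities, dropout is applied, and the per-head context vectors are permuted back and flattened.

# Illustrative masking, softmax, dropout, and head-merge sequence.
import math

import torch
import torch.nn as nn

B, N, F, H = 1, 2, 4, 3
attention_scores_no_mask = torch.randn(B, N, F, F) / math.sqrt(H)
value_layer = torch.randn(B, N, F, H)

# Already-extended mask of shape [B, 1, 1, F]: 0.0 keeps a position, -10000.0 hides it.
attention_mask = torch.tensor([[[[0.0, 0.0, 0.0, -10000.0]]]])

attention_scores = attention_scores_no_mask + attention_mask
attention_probs = nn.Softmax(dim=-1)(attention_scores)
print(attention_probs[0, 0, 0])  # the masked last position gets ~0 weight
attention_probs = nn.Dropout(p=0.1)(attention_probs)

context_layer = torch.matmul(attention_probs, value_layer)      # [B, N, F, H]
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # [B, F, N, H]
context_layer = context_layer.view(B, F, N * H)                 # [B, F, N*H]
print(context_layer.shape)  # torch.Size([1, 4, 6])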
@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

     def forward(self, hidden_states, input_tensor):
-        hidden_states = self.dense(input_tensor)
+        hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)
         hidden_states = self.LayerNorm(hidden_states + input_tensor)
         return hidden_states
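A minimal sketch of the corrected output block (illustrative sizes; plain nn.Linear and nn.LayerNorm stand in for the module's layers): the dense projection is now applied to the attention output passed in as hidden_states, while input_tensor only enters through the residual connection inside the LayerNorm.

# Residual-plus-LayerNorm output block, illustrating the dense(hidden_states) fix.
import torch
import torch.nn as nn

hidden = 8
dense = nn.Linear(hidden, hidden)
layer_norm = nn.LayerNorm(hidden)
dropout = nn.Dropout(0.1)

input_tensor = torch.randn(1, 4, hidden)   # block input (residual branch)
hidden_states = torch.randn(1, 4, hidden)  # self-attention output

hidden_states = dense(hidden_states)       # was dense(input_tensor) before the fix
hidden_states = dropout(hidden_states)
hidden_states = layer_norm(hidden_states + input_tensor)
print(hidden_states.shape)  # torch.Size([1, 4, 8])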
@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
         self.output = BERTSelfOutput(config)

     def forward(self, input_tensor, attention_mask):
-        attention_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(attention_output, input_tensor)
+        self_output = self.self(input_tensor, attention_mask)
+        attention_output = self.output(self_output, input_tensor)
         return attention_output
@@ -388,13 +394,16 @@ class BertModel(nn.Module):
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)

-        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        attention_mask = (1.0 - attention_mask) * -10000.0
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

         embedding_output = self.embeddings(input_ids, token_type_ids)
-        all_encoder_layers = self.encoder(embedding_output, attention_mask)
+        all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
         sequence_output = all_encoder_layers[-1]
         pooled_output = self.pooler(sequence_output)
+        # TODO DEbugging
+        # all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
         return all_encoder_layers, pooled_output


 class BertForSequenceClassification(nn.Module):
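A tiny numeric check (ours, not part of the commit) of the extended_attention_mask computed above: two unsqueeze calls broadcast the [B, T] mask to [B, 1, 1, T], and (1.0 - mask) * -10000.0 maps kept positions to 0.0 and padded positions to -10000.0, which the softmax then drives to near-zero probability.

# Toy input; values chosen only to show the mask transformation.
import torch

attention_mask = torch.tensor([[1, 1, 0, 0]], dtype=torch.float)  # 1 = real token, 0 = padding
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
print(extended_attention_mask.shape)  # torch.Size([1, 1, 1, 4])
print(extended_attention_mask)        # values [-0., -0., -10000., -10000.]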