chenpangpang / transformers

Commit bd91ae65, authored Nov 06, 2018 by lukovnikov
moved bert to qelos-util
Parent: 4e521884
Showing 3 changed files, with 8 additions and 74 deletions.
hf_bert/__init__.py    +0   -0
modeling.py            +8   -3
tests/mytest.py        +0   -71
hf_bert/__init__.py (new file, 0 → 100644, empty)
modeling.py
@@ -34,6 +34,10 @@ def gelu(x):
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


+def swish(x):
+    return x * torch.sigmoid(x)
+
+
 class BertConfig(object):
     """Configuration class to store the configuration of a `BertModel`.
     """
@@ -60,7 +64,7 @@ class BertConfig(object):
         intermediate_size: The size of the "intermediate" (i.e., feed-forward)
             layer in the Transformer encoder.
         hidden_act: The non-linear activation function (function or string) in the
             encoder and pooler.
+            If string, "gelu", "relu" and "swish" supported.
         hidden_dropout_prob: The dropout probabilitiy for all fully connected
             layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob: The dropout ratio for the attention
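(Aside, not part of the diff: the docstring change above documents that hidden_act may be the string "swish" as well as "gelu" or "relu", or a callable. The sketch below is hypothetical; the keyword names are taken from the docstring and are assumed, not checked against the actual BertConfig constructor.)

# Hypothetical usage; argument names are assumed from the docstring above.
config = BertConfig(
    vocab_size=30522,                   # assumed constructor argument
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="swish",                 # string form; a callable such as swish also works
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)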
@@ -237,7 +241,8 @@ class BERTIntermediate(nn.Module):
     def __init__(self, config):
         super(BERTIntermediate, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        self.intermediate_act_fn = gelu
+        act2fn = {"gelu": gelu, "relu": torch.nn.ReLU, "swish": swish}
+        self.intermediate_act_fn = act2fn[config.hidden_act] if isinstance(config.hidden_act, str) else config.hidden_act

     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
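(Aside, not part of the diff: the new act2fn lookup lets config.hidden_act be either a string key or a callable. Below is a hedged, self-contained sketch of the same resolution pattern; it uses torch.nn.functional.relu so that every mapped entry is directly callable, whereas the committed mapping stores the torch.nn.ReLU class, which would still need to be instantiated before use.)

import torch
import torch.nn.functional as F

def swish(x):
    return x * torch.sigmoid(x)

def resolve_activation(hidden_act, act2fn):
    # Mirrors the committed logic: strings are looked up, callables pass through.
    if isinstance(hidden_act, str):
        return act2fn[hidden_act]
    return hidden_act

act2fn = {"relu": F.relu, "swish": swish}

print(resolve_activation("swish", act2fn))     # resolved by name
print(resolve_activation(torch.tanh, act2fn))  # callable passed through unchanged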
@@ -355,7 +360,7 @@ class BertModel(nn.Module):
         all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
         sequence_output = all_encoder_layers[-1]
         pooled_output = self.pooler(sequence_output)
-        return [embedding_output] + all_encoder_layers, pooled_output
+        return all_encoder_layers, pooled_output


 class BertForSequenceClassification(nn.Module):
     """BERT model for classification.
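(Aside, not part of the diff: after the change above, BertModel returns only the per-layer encoder outputs and the pooled output; the embedding output is no longer prepended to the list. A hedged sketch of a caller, assuming an already constructed BertModel instance named model and the same toy inputs used by the test below:)

import torch

# `model` is assumed to be a constructed and loaded BertModel.
input_ids      = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask     = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
sequence_output = all_encoder_layers[-1]  # hidden states of the last encoder layer
# Before this commit the returned list started with the embedding output;
# now it contains only the encoder layers.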
tests/mytest.py (deleted, 100644 → 0)
import unittest
import json
import random
import torch
import numpy as np

import modeling
import convert_tf_checkpoint_to_pytorch
import grouch


class MyTest(unittest.TestCase):

    def test_loading_and_running(self):
        bertpath = "../../grouch/data/bert/bert-base/"
        configpath = bertpath + "bert_config.json"
        ckptpath = bertpath + "bert_model.ckpt"

        m = convert_tf_checkpoint_to_pytorch.convert(configpath, ckptpath)
        m.eval()
        # print(m)
        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
        all_y, pool_y = m(input_ids, token_type_ids, input_mask)
        print(pool_y.shape)
        # np.save("_bert_ref_pool_out.npy", pool_y.detach().numpy())
        # np.save("_bert_ref_all_out.npy", torch.stack(all_y, 0).detach().numpy())

        config = grouch.TransformerBERT.load_config(configpath)
        gm = grouch.TransformerBERT.init_from_config(config)
        gm.load_weights_from_tf_checkpoint(ckptpath)
        gm.eval()
        g_all_y, g_pool_y = gm(input_ids, token_type_ids, input_mask)
        print(g_pool_y.shape)

        # check embeddings
        # print(m.embeddings)
        # print(gm.emb)
        # hugging_emb = m.embeddings(input_ids, token_type_ids)
        # grouch_emb = gm.emb(input_ids, token_type_ids)
        print((all_y[0] - g_all_y[0]).norm())
        # print(all_y[0][:, :, :10] - g_all_y[0][:, :, :10])
        self.assertTrue(np.allclose(all_y[0].detach().numpy(), g_all_y[0].detach().numpy(), atol=1e-7))
        print("embeddings good")

        print(m.encoder.layer[0])
        print(gm.encoder.layers[0])
        print("norm of diff at layer 1", (all_y[1] - g_all_y[1]).norm())
        # print(all_y[1][:, :, :10] - g_all_y[1][:, :, :10])
        self.assertTrue(np.allclose(all_y[1].detach().numpy(), g_all_y[1].detach().numpy(), atol=1e-6))

        # hugging_layer = m.encoder.layer[0]
        # grouch_layer = gm.encoder.layers[0]
        # print("comparing weights")
        # print((hugging_layer.attention.self.query.weight - grouch_layer.slf_attn.q_proj.weight).norm())
        # print((hugging_layer.attention.self.query.bias - grouch_layer.slf_attn.q_proj.bias).norm())
        # print((hugging_layer.attention.self.key.weight - grouch_layer.slf_attn.k_proj.weight).norm())
        # print((hugging_layer.attention.self.key.bias - grouch_layer.slf_attn.k_proj.bias).norm())
        # print((hugging_layer.attention.self.value.weight - grouch_layer.slf_attn.v_proj.weight).norm())
        # print((hugging_layer.attention.self.value.bias - grouch_layer.slf_attn.v_proj.bias).norm())
        # print((hugging_layer.attention.output.dense.weight - grouch_layer.slf_attn.vw_proj.weight).norm())
        # print((hugging_layer.attention.output.dense.bias - grouch_layer.slf_attn.vw_proj.bias).norm())

        print("norm of diff at last layer", (all_y[-1] - g_all_y[-1]).norm())
        # print(all_y[-1][:, :, :10] - g_all_y[-1][:, :, :10])
        self.assertTrue(np.allclose(all_y[-1].detach().numpy(), g_all_y[-1].detach().numpy(), atol=1e-4))
\ No newline at end of file
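(Aside, not part of the diff: the deleted test above compared the hugging-face-style modeling.py implementation with the grouch reimplementation layer by layer, loosening the tolerance with depth (atol=1e-7 for the embeddings, 1e-6 after one layer, 1e-4 at the last layer), since small floating-point differences accumulate as activations pass through more layers. A minimal, hypothetical distillation of that check:)

import numpy as np
import torch

def assert_layerwise_close(ref_layers, test_layers, atols):
    # Compare matching layer outputs with a per-layer absolute tolerance.
    for i, (ref, test, atol) in enumerate(zip(ref_layers, test_layers, atols)):
        diff = (ref - test).norm().item()
        ok = np.allclose(ref.detach().numpy(), test.detach().numpy(), atol=atol)
        print(f"layer {i}: norm of diff = {diff:.3e}, within atol={atol}: {ok}")
        assert ok

# Dummy tensors standing in for the two models' per-layer outputs.
ref  = [torch.zeros(2, 3, 4) for _ in range(3)]
test = [t + 1e-8 for t in ref]
assert_layerwise_close(ref, test, atols=[1e-7, 1e-6, 1e-4])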