Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
b9c77b98
"vscode:/vscode.git/clone" did not exist on "88efc65bac63bdc6b4123eb225235538a7096034"
Commit
b9c77b98
authored
Jan 17, 2019
by
thomwolf
Browse files
fix transposition in model conversion and memory initialization
parent
009101de
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
7 additions
and
8 deletions
+7
-8
pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
...etrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+1
-3
pytorch_pretrained_bert/modeling_transfo_xl.py
pytorch_pretrained_bert/modeling_transfo_xl.py
+6
-5
No files found.
pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
View file @
b9c77b98
...
@@ -93,8 +93,6 @@ def build_tf_to_pytorch_map(model, config):
...
@@ -93,8 +93,6 @@ def build_tf_to_pytorch_map(model, config):
# Relative positioning biases
# Relative positioning biases
if
config
.
untie_r
:
if
config
.
untie_r
:
layer_str
=
"transformer/r_r_bias"
layer_str_2
=
"transformer/r_w_bias"
r_r_list
=
[]
r_r_list
=
[]
r_w_list
=
[]
r_w_list
=
[]
for
b
in
model
.
layers
:
for
b
in
model
.
layers
:
...
@@ -158,7 +156,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
...
@@ -158,7 +156,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
array
=
tf_weights
[
name
]
array
=
tf_weights
[
name
]
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
# which are not required for using pretrained model
# which are not required for using pretrained model
if
'kernel'
in
name
or
'proj
_W
'
in
name
:
if
'kernel'
in
name
or
'proj'
in
name
:
array
=
np
.
transpose
(
array
)
array
=
np
.
transpose
(
array
)
if
(
'r_r_bias'
in
name
or
'r_w_bias'
in
name
)
and
len
(
pointer
)
>
1
:
if
(
'r_r_bias'
in
name
or
'r_w_bias'
in
name
)
and
len
(
pointer
)
>
1
:
# Here we will split the TF weights
# Here we will split the TF weights
...
...
pytorch_pretrained_bert/modeling_transfo_xl.py
View file @
b9c77b98
...
@@ -447,10 +447,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
...
@@ -447,10 +447,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
if
attn_mask
is
not
None
and
attn_mask
.
any
().
item
():
if
attn_mask
is
not
None
and
attn_mask
.
any
().
item
():
if
attn_mask
.
dim
()
==
2
:
if
attn_mask
.
dim
()
==
2
:
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[
None
,:,:,
None
],
-
float
(
'inf'
)
).
type_as
(
attn_score
)
attn_mask
[
None
,:,:,
None
],
-
1e30
).
type_as
(
attn_score
)
elif
attn_mask
.
dim
()
==
3
:
elif
attn_mask
.
dim
()
==
3
:
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[:,:,:,
None
],
-
float
(
'inf'
)
).
type_as
(
attn_score
)
attn_mask
[:,:,:,
None
],
-
1e30
).
type_as
(
attn_score
)
# [qlen x klen x bsz x n_head]
# [qlen x klen x bsz x n_head]
attn_prob
=
F
.
softmax
(
attn_score
,
dim
=
1
)
attn_prob
=
F
.
softmax
(
attn_score
,
dim
=
1
)
...
@@ -947,12 +947,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
...
@@ -947,12 +947,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
self
.
mem_len
=
mem_len
self
.
mem_len
=
mem_len
self
.
ext_len
=
ext_len
self
.
ext_len
=
ext_len
def
init_mems
(
self
):
def
init_mems
(
self
,
data
):
if
self
.
mem_len
>
0
:
if
self
.
mem_len
>
0
:
mems
=
[]
mems
=
[]
param
=
next
(
self
.
parameters
())
param
=
next
(
self
.
parameters
())
for
i
in
range
(
self
.
n_layer
+
1
):
for
i
in
range
(
self
.
n_layer
+
1
):
empty
=
torch
.
empty
(
0
,
dtype
=
param
.
dtype
,
device
=
param
.
device
)
empty
=
torch
.
zeros
(
self
.
mem_len
,
data
.
size
(
1
),
self
.
config
.
d_model
,
dtype
=
param
.
dtype
,
device
=
param
.
device
)
mems
.
append
(
empty
)
mems
.
append
(
empty
)
return
mems
return
mems
...
@@ -1081,7 +1082,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
...
@@ -1081,7 +1082,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
# So, have to initialize size(0) mems inside the model forward.
# So, have to initialize size(0) mems inside the model forward.
# Moreover, have to return new_mems to allow nn.DataParallel to piece
# Moreover, have to return new_mems to allow nn.DataParallel to piece
# them together.
# them together.
if
not
mems
:
mems
=
self
.
init_mems
()
if
not
mems
:
mems
=
self
.
init_mems
(
data
)
hidden
,
new_mems
=
self
.
_forward
(
data
,
mems
=
mems
)
hidden
,
new_mems
=
self
.
_forward
(
data
,
mems
=
mems
)
if
target
is
None
:
if
target
is
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment