Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
91caf246
Unverified
Commit
91caf246
authored
Nov 28, 2019
by
Thomas Wolf
Committed by
GitHub
Nov 28, 2019
Browse files
Merge pull request #1770 from huggingface/initi-encoder-mask
Only init encoder_attention_mask if stack is decoder
parents
49a69d5b
cd286c21
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
8 deletions
+12
-8
transformers/modeling_bert.py
transformers/modeling_bert.py
+12
-8
No files found.
transformers/modeling_bert.py
View file @
91caf246
...
@@ -660,8 +660,6 @@ class BertModel(BertPreTrainedModel):
...
@@ -660,8 +660,6 @@ class BertModel(BertPreTrainedModel):
if
attention_mask
is
None
:
if
attention_mask
is
None
:
attention_mask
=
torch
.
ones
(
input_shape
,
device
=
device
)
attention_mask
=
torch
.
ones
(
input_shape
,
device
=
device
)
if
encoder_attention_mask
is
None
:
encoder_attention_mask
=
torch
.
ones
(
input_shape
,
device
=
device
)
if
token_type_ids
is
None
:
if
token_type_ids
is
None
:
token_type_ids
=
torch
.
zeros
(
input_shape
,
dtype
=
torch
.
long
,
device
=
device
)
token_type_ids
=
torch
.
zeros
(
input_shape
,
dtype
=
torch
.
long
,
device
=
device
)
...
@@ -692,13 +690,19 @@ class BertModel(BertPreTrainedModel):
...
@@ -692,13 +690,19 @@ class BertModel(BertPreTrainedModel):
# If a 2D or 3D attention mask is provided for the cross-attention
# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
if
encoder_attention_mask
.
dim
()
==
3
:
if
self
.
config
.
is_decoder
:
encoder_extended_attention_mask
=
encoder_attention_mask
[:,
None
,
:,
:]
if
encoder_attention_mask
is
None
:
if
encoder_attention_mask
.
dim
()
==
2
:
encoder_attention_mask
=
torch
.
ones
(
input_shape
,
device
=
device
)
encoder_extended_attention_mask
=
encoder_attention_mask
[:,
None
,
None
,
:]
encoder_extended_attention_mask
=
encoder_extended_attention_mask
.
to
(
dtype
=
next
(
self
.
parameters
()).
dtype
)
# fp16 compatibility
if
encoder_attention_mask
.
dim
()
==
3
:
encoder_extended_attention_mask
=
(
1.0
-
encoder_extended_attention_mask
)
*
-
10000.0
encoder_extended_attention_mask
=
encoder_attention_mask
[:,
None
,
:,
:]
if
encoder_attention_mask
.
dim
()
==
2
:
encoder_extended_attention_mask
=
encoder_attention_mask
[:,
None
,
None
,
:]
encoder_extended_attention_mask
=
encoder_extended_attention_mask
.
to
(
dtype
=
next
(
self
.
parameters
()).
dtype
)
# fp16 compatibility
encoder_extended_attention_mask
=
(
1.0
-
encoder_extended_attention_mask
)
*
-
10000.0
else
:
encoder_extended_attention_mask
=
None
# Prepare head mask if needed
# Prepare head mask if needed
# 1.0 in head_mask indicates we keep the head
# 1.0 in head_mask indicates we keep the head
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment