chenpangpang / transformers · Commits

Unverified commit 39f8eafc, authored May 03, 2022 by Pavel Belevich, committed by GitHub on May 03, 2022
Remove device parameter from create_extended_attention_mask_for_decoder (#16894)
parent dd739f70
Showing 20 of 31 changed files (page 1 of 2), with 36 additions and 30 deletions on this page:
examples/research_projects/longform-qa/eli5_utils.py  +1 -1
src/transformers/modeling_utils.py  +16 -4
src/transformers/models/bert/modeling_bert.py  +1 -1
src/transformers/models/bert_generation/modeling_bert_generation.py  +1 -3
src/transformers/models/big_bird/modeling_big_bird.py  +1 -3
src/transformers/models/canine/modeling_canine.py  +2 -2
src/transformers/models/convbert/modeling_convbert.py  +1 -1
src/transformers/models/data2vec/modeling_data2vec_text.py  +1 -1
src/transformers/models/electra/modeling_electra.py  +1 -1
src/transformers/models/ibert/modeling_ibert.py  +1 -1
src/transformers/models/longformer/modeling_longformer.py  +1 -1
src/transformers/models/megatron_bert/modeling_megatron_bert.py  +1 -1
src/transformers/models/mmbt/modeling_mmbt.py  +1 -1
src/transformers/models/mobilebert/modeling_mobilebert.py  +1 -3
src/transformers/models/mpnet/modeling_mpnet.py  +1 -1
src/transformers/models/nystromformer/modeling_nystromformer.py  +1 -1
src/transformers/models/qdqbert/modeling_qdqbert.py  +1 -1
src/transformers/models/realm/modeling_realm.py  +1 -1
src/transformers/models/rembert/modeling_rembert.py  +1 -1
src/transformers/models/retribert/modeling_retribert.py  +1 -1
examples/research_projects/longform-qa/eli5_utils.py

@@ -137,7 +137,7 @@ class RetrievalQAEmbedder(nn.Module):
         token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
         head_mask = [None] * self.sent_encoder.config.num_hidden_layers
         extended_attention_mask: torch.Tensor = self.sent_encoder.get_extended_attention_mask(
-            attention_mask, input_shape, device
+            attention_mask, input_shape
         )
         # define function for checkpointing
src/transformers/modeling_utils.py

@@ -651,7 +651,13 @@ class ModuleUtilsMixin:
         return encoder_extended_attention_mask
 
     @staticmethod
-    def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device):
+    def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device=None):
+        if device is not None:
+            warnings.warn(
+                "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+            )
+        else:
+            device = attention_mask.device
         batch_size, seq_length = input_shape
         seq_ids = torch.arange(seq_length, device=device)
         causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]

@@ -672,7 +678,9 @@ class ModuleUtilsMixin:
         extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
         return extended_attention_mask
 
-    def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device) -> Tensor:
+    def get_extended_attention_mask(
+        self, attention_mask: Tensor, input_shape: Tuple[int], device: device = None
+    ) -> Tensor:
         """
         Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

@@ -681,12 +689,16 @@ class ModuleUtilsMixin:
                 Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
             input_shape (`Tuple[int]`):
                 The shape of the input to the model.
-            device: (`torch.device`):
-                The device of the input to the model.
 
         Returns:
             `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
         """
+        if not (attention_mask.dim() == 2 and self.config.is_decoder):
+            # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
+            if device is not None:
+                warnings.warn(
+                    "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+                )
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
         if attention_mask.dim() == 3:
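For context (not part of the diff): after this change, callers let `create_extended_attention_mask_for_decoder` pick the device up from `attention_mask` itself. A minimal sketch of the new call, assuming a transformers release that contains this commit (v4.19 or later); the tensor sizes are illustrative:

import torch
from transformers.modeling_utils import ModuleUtilsMixin

attention_mask = torch.ones(2, 5, dtype=torch.long)  # [batch_size, seq_length]

# New call: no `device` argument; it is inferred from `attention_mask.device`.
mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(attention_mask.shape, attention_mask)
print(mask.shape)  # torch.Size([2, 1, 5, 5]), a broadcastable causal mask

# Passing `device` explicitly still works for now but emits the FutureWarning added above.
mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder(
    attention_mask.shape, attention_mask, torch.device("cpu")
)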
src/transformers/models/bert/modeling_bert.py

@@ -982,7 +982,7 @@ class BertModel(BertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
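The same one-line call-site edit repeats in the remaining model files below. As a hedged sanity check (not from the commit; the tiny config is an illustrative assumption), the mask BertModel builds without the `device` argument matches what the old call produced, since the device now comes from `attention_mask`:

import torch
from transformers import BertConfig, BertModel

# Small randomly initialized model, just to have a ModuleUtilsMixin instance to call.
model = BertModel(BertConfig(hidden_size=32, num_hidden_layers=1, num_attention_heads=2, intermediate_size=64))
attention_mask = torch.tensor([[1, 1, 1, 0]])

new_mask = model.get_extended_attention_mask(attention_mask, attention_mask.shape)
old_mask = model.get_extended_attention_mask(attention_mask, attention_mask.shape, attention_mask.device)  # FutureWarning
assert torch.equal(new_mask, old_mask)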
src/transformers/models/bert_generation/modeling_bert_generation.py

@@ -364,9 +364,7 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
         # ourselves in which case we just need to make it broadcastable to all heads.
         extended_attention_mask = None
         if not use_cache:
-            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                attention_mask, input_shape, device
-            )
+            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/big_bird/modeling_big_bird.py

@@ -2112,9 +2112,7 @@ class BigBirdModel(BigBirdPreTrainedModel):
             to_mask = None
             # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
             # ourselves in which case we just need to make it broadcastable to all heads.
-            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                attention_mask, input_shape, device
-            )
+            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         else:
             raise ValueError(
                 f"attention_type can either be original_full or block_sparse, but is {self.attention_type}"
src/transformers/models/canine/modeling_canine.py

@@ -1130,12 +1130,12 @@ class CanineModel(CaninePreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         molecule_attention_mask = self._downsample_attention_mask(
             attention_mask, downsampling_rate=self.config.downsampling_rate
         )
         extended_molecule_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-            molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1]), device
+            molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1])
         )
 
         # Prepare head mask if needed
src/transformers/models/convbert/modeling_convbert.py

@@ -833,7 +833,7 @@ class ConvBertModel(ConvBertPreTrainedModel):
         else:
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
 
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
 
         hidden_states = self.embeddings(
src/transformers/models/data2vec/modeling_data2vec_text.py

@@ -820,7 +820,7 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/electra/modeling_electra.py

@@ -882,7 +882,7 @@ class ElectraModel(ElectraPreTrainedModel):
         else:
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
 
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/ibert/modeling_ibert.py

@@ -814,7 +814,7 @@ class IBertModel(IBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
src/transformers/models/longformer/modeling_longformer.py

@@ -1692,7 +1692,7 @@ class LongformerModel(LongformerPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)[
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)[
             :, 0, 0, :
         ]
src/transformers/models/megatron_bert/modeling_megatron_bert.py

@@ -940,7 +940,7 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/mmbt/modeling_mmbt.py

@@ -268,7 +268,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
                 [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
             )
 
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, self.device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
src/transformers/models/mobilebert/modeling_mobilebert.py

@@ -875,9 +875,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-            attention_mask, input_shape, self.device
-        )
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
src/transformers/models/mpnet/modeling_mpnet.py

@@ -547,7 +547,7 @@ class MPNetModel(MPNetPreTrainedModel):
         if attention_mask is None:
             attention_mask = torch.ones(input_shape, device=device)
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
         embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds)
src/transformers/models/nystromformer/modeling_nystromformer.py

@@ -624,7 +624,7 @@ class NystromformerModel(NystromformerPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
src/transformers/models/qdqbert/modeling_qdqbert.py

@@ -952,7 +952,7 @@ class QDQBertModel(QDQBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/realm/modeling_realm.py

@@ -1078,7 +1078,7 @@ class RealmBertModel(RealmPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/rembert/modeling_rembert.py

@@ -857,7 +857,7 @@ class RemBertModel(RemBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/retribert/modeling_retribert.py

@@ -117,7 +117,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
         token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
         head_mask = [None] * sent_encoder.config.num_hidden_layers
         extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask(
-            attention_mask, input_shape, device
+            attention_mask, input_shape
         )
         # define function for checkpointing
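To illustrate the deprecation path end to end (again a hedged sketch under the same assumptions, not part of the diff), the old-style call can be shown to emit the FutureWarning while still returning a usable mask:

import warnings

import torch
from transformers import BertConfig, BertModel

# Illustrative tiny config; any ModuleUtilsMixin subclass instance would do.
model = BertModel(BertConfig(hidden_size=32, num_hidden_layers=1, num_attention_heads=2, intermediate_size=64))
attention_mask = torch.tensor([[1, 1, 0]])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    model.get_extended_attention_mask(attention_mask, attention_mask.shape, torch.device("cpu"))

# The deprecation added in modeling_utils.py fires for this non-decoder, 2D-mask call.
assert any(issubclass(w.category, FutureWarning) for w in caught)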