Commit 39f8eafc (unverified)
Remove device parameter from create_extended_attention_mask_for_decoder (#16894)

Authored May 03, 2022 by Pavel Belevich; committed by GitHub on May 03, 2022
Parent: dd739f70
Changes: 31 files in this commit. Showing 20 changed files with 36 additions and 30 deletions (+36 -30) on this page (1 of 2).
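In practice the change is mechanical: call sites stop threading a `device` argument through `get_extended_attention_mask`, and the helper falls back to `attention_mask.device`. A minimal sketch of the before/after calling pattern (the tiny randomly initialized BERT below is illustrative only, not part of the diff):

```python
import torch
from transformers import BertConfig, BertModel

# Illustrative only: a small randomly initialized BERT, not a pretrained checkpoint.
model = BertModel(BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2, intermediate_size=64))

attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1],
                               [1, 1, 1, 0, 0, 0]])  # second row is padded
input_shape = attention_mask.size()

# Before this commit, call sites passed the device explicitly:
#     model.get_extended_attention_mask(attention_mask, input_shape, device)
# Afterwards the device is inferred from the mask itself (passing `device` now triggers a deprecation warning):
extended_mask = model.get_extended_attention_mask(attention_mask, input_shape)

print(extended_mask.shape)                            # torch.Size([2, 1, 1, 6])
print(extended_mask.device == attention_mask.device)  # True
```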
examples/research_projects/longform-qa/eli5_utils.py  +1 -1
src/transformers/modeling_utils.py  +16 -4
src/transformers/models/bert/modeling_bert.py  +1 -1
src/transformers/models/bert_generation/modeling_bert_generation.py  +1 -3
src/transformers/models/big_bird/modeling_big_bird.py  +1 -3
src/transformers/models/canine/modeling_canine.py  +2 -2
src/transformers/models/convbert/modeling_convbert.py  +1 -1
src/transformers/models/data2vec/modeling_data2vec_text.py  +1 -1
src/transformers/models/electra/modeling_electra.py  +1 -1
src/transformers/models/ibert/modeling_ibert.py  +1 -1
src/transformers/models/longformer/modeling_longformer.py  +1 -1
src/transformers/models/megatron_bert/modeling_megatron_bert.py  +1 -1
src/transformers/models/mmbt/modeling_mmbt.py  +1 -1
src/transformers/models/mobilebert/modeling_mobilebert.py  +1 -3
src/transformers/models/mpnet/modeling_mpnet.py  +1 -1
src/transformers/models/nystromformer/modeling_nystromformer.py  +1 -1
src/transformers/models/qdqbert/modeling_qdqbert.py  +1 -1
src/transformers/models/realm/modeling_realm.py  +1 -1
src/transformers/models/rembert/modeling_rembert.py  +1 -1
src/transformers/models/retribert/modeling_retribert.py  +1 -1

examples/research_projects/longform-qa/eli5_utils.py

@@ -137,7 +137,7 @@ class RetrievalQAEmbedder(nn.Module):
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
             head_mask = [None] * self.sent_encoder.config.num_hidden_layers
             extended_attention_mask: torch.Tensor = self.sent_encoder.get_extended_attention_mask(
-                attention_mask, input_shape, device
+                attention_mask, input_shape
             )
             # define function for checkpointing

src/transformers/modeling_utils.py

@@ -651,7 +651,13 @@ class ModuleUtilsMixin:
         return encoder_extended_attention_mask

     @staticmethod
-    def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device):
+    def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device=None):
+        if device is not None:
+            warnings.warn(
+                "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+            )
+        else:
+            device = attention_mask.device
         batch_size, seq_length = input_shape
         seq_ids = torch.arange(seq_length, device=device)
         causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]

@@ -672,7 +678,9 @@ class ModuleUtilsMixin:
         extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
         return extended_attention_mask

-    def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device) -> Tensor:
+    def get_extended_attention_mask(
+        self, attention_mask: Tensor, input_shape: Tuple[int], device: device = None
+    ) -> Tensor:
         """
         Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

@@ -681,12 +689,16 @@ class ModuleUtilsMixin:
                 Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
             input_shape (`Tuple[int]`):
                 The shape of the input to the model.
-            device: (`torch.device`):
-                The device of the input to the model.

         Returns:
             `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
         """
+        if not (attention_mask.dim() == 2 and self.config.is_decoder):
+            # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
+            if device is not None:
+                warnings.warn(
+                    "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning
+                )
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
         if attention_mask.dim() == 3:

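For the decoder path, the causal-mask construction in the context lines above is unchanged; only the source of `device` moves from an argument to `attention_mask.device`. A small sketch of calling the static helper without a device (shapes and values follow from the `repeat`/broadcast lines shown in the hunks; the toy mask is made up for illustration):

```python
import torch
from transformers.modeling_utils import ModuleUtilsMixin

attention_mask = torch.tensor([[1, 1, 1, 0]])  # batch of 1, last key position is padding

# No `device` argument: it now defaults to attention_mask.device.
extended = ModuleUtilsMixin.create_extended_attention_mask_for_decoder((1, 4), attention_mask)

print(extended.shape)  # torch.Size([1, 1, 4, 4])
print(extended[0, 0])
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 0]])
# Lower-triangular causal pattern with the padded key column zeroed out by
# attention_mask[:, None, None, :]; a 0/1 mask that callers later turn additive.
```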
src/transformers/models/bert/modeling_bert.py

@@ -982,7 +982,7 @@ class BertModel(BertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/bert_generation/modeling_bert_generation.py

@@ -364,9 +364,7 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
         # ourselves in which case we just need to make it broadcastable to all heads.
         extended_attention_mask = None
         if not use_cache:
-            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                attention_mask, input_shape, device
-            )
+            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/big_bird/modeling_big_bird.py

@@ -2112,9 +2112,7 @@ class BigBirdModel(BigBirdPreTrainedModel):
             to_mask = None
             # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
             # ourselves in which case we just need to make it broadcastable to all heads.
-            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                attention_mask, input_shape, device
-            )
+            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         else:
             raise ValueError(
                 f"attention_type can either be original_full or block_sparse, but is {self.attention_type}"

src/transformers/models/canine/modeling_canine.py

@@ -1130,12 +1130,12 @@ class CanineModel(CaninePreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         molecule_attention_mask = self._downsample_attention_mask(
             attention_mask, downsampling_rate=self.config.downsampling_rate
         )
         extended_molecule_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-            molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1]), device
+            molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1])
         )

         # Prepare head mask if needed

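CANINE is the one model on this page with two call sites, because it builds a second, downsampled "molecule" mask; that is why the second call passes an explicit `(batch_size, molecule_attention_mask.shape[-1])` shape rather than the character-level `input_shape`. A rough shape sketch, using max-pooling as a stand-in for `_downsample_attention_mask` (the actual downsampling logic may differ):

```python
import torch
import torch.nn.functional as F

batch_size, char_seq_len, downsampling_rate = 2, 16, 4
char_attention_mask = torch.ones(batch_size, char_seq_len)
char_attention_mask[1, 10:] = 0  # second example is padded after 10 characters

# Stand-in: a molecule position counts as valid if any character in its window is valid.
molecule_attention_mask = F.max_pool1d(
    char_attention_mask.unsqueeze(1), kernel_size=downsampling_rate, stride=downsampling_rate
).squeeze(1)

print(molecule_attention_mask.shape)  # torch.Size([2, 4]): char_seq_len // downsampling_rate positions
# Hence the second call in the hunk supplies the molecule-level shape explicitly:
#     self.get_extended_attention_mask(molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1]))
```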
src/transformers/models/convbert/modeling_convbert.py

@@ -833,7 +833,7 @@ class ConvBertModel(ConvBertPreTrainedModel):
         else:
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

         hidden_states = self.embeddings(

src/transformers/models/data2vec/modeling_data2vec_text.py

@@ -820,7 +820,7 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/electra/modeling_electra.py

@@ -882,7 +882,7 @@ class ElectraModel(ElectraPreTrainedModel):
         else:
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/ibert/modeling_ibert.py

@@ -814,7 +814,7 @@ class IBertModel(IBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head

src/transformers/models/longformer/modeling_longformer.py

@@ -1692,7 +1692,7 @@ class LongformerModel(LongformerPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)[
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)[
             :, 0, 0, :
         ]

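Longformer is the only call site on this page that immediately slices the result: for a 2-D mask on a non-decoder model the extended mask has shape `[batch_size, 1, 1, seq_len]`, so `[:, 0, 0, :]` recovers a per-token additive mask (0 where attended, a large negative value where masked). A small sketch with a tiny randomly initialized Longformer (configuration values chosen only to keep it small, not taken from the diff):

```python
import torch
from transformers import LongformerConfig, LongformerModel

config = LongformerConfig(
    hidden_size=32, num_hidden_layers=1, num_attention_heads=2,
    intermediate_size=64, attention_window=4,
)
model = LongformerModel(config)

attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0]])
input_shape = attention_mask.size()

extended = model.get_extended_attention_mask(attention_mask, input_shape)
print(extended.shape)             # torch.Size([1, 1, 1, 6])

per_token = extended[:, 0, 0, :]  # back to [batch_size, seq_len], as in the hunk above
print(per_token.shape)            # torch.Size([1, 6])
```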
src/transformers/models/megatron_bert/modeling_megatron_bert.py

@@ -940,7 +940,7 @@ class MegatronBertModel(MegatronBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/mmbt/modeling_mmbt.py

@@ -268,7 +268,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
                 [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
             )

-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, self.device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

src/transformers/models/mobilebert/modeling_mobilebert.py

@@ -875,9 +875,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-            attention_mask, input_shape, self.device
-        )
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head

src/transformers/models/mpnet/modeling_mpnet.py

@@ -547,7 +547,7 @@ class MPNetModel(MPNetPreTrainedModel):
         if attention_mask is None:
             attention_mask = torch.ones(input_shape, device=device)
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
         embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds)

src/transformers/models/nystromformer/modeling_nystromformer.py

@@ -624,7 +624,7 @@ class NystromformerModel(NystromformerPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head

src/transformers/models/qdqbert/modeling_qdqbert.py

@@ -952,7 +952,7 @@ class QDQBertModel(QDQBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/realm/modeling_realm.py

@@ -1078,7 +1078,7 @@ class RealmBertModel(RealmPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/rembert/modeling_rembert.py

@@ -857,7 +857,7 @@ class RemBertModel(RemBertPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]

src/transformers/models/retribert/modeling_retribert.py

@@ -117,7 +117,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
             head_mask = [None] * sent_encoder.config.num_hidden_layers
             extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask(
-                attention_mask, input_shape, device
+                attention_mask, input_shape
             )
             # define function for checkpointing