chenpangpang / transformers · Commit 39f8eafc (unverified)

Remove device parameter from create_extended_attention_mask_for_decoder (#16894)

Authored May 03, 2022 by Pavel Belevich; committed via GitHub on May 03, 2022
Parent: dd739f70
Changes: 31 · Showing 11 changed files with 12 additions and 12 deletions (+12 / -12):
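Every hunk on this page makes essentially the same one-line change: the trailing device argument is dropped from calls to get_extended_attention_mask(attention_mask, input_shape, device), leaving get_extended_attention_mask(attention_mask, input_shape). The explicit device is no longer needed at the call sites because the extended mask can be built from attention_mask, which already carries its device. A minimal standalone sketch of that idea (illustrative helper name, not the library's implementation):

import torch

def build_extended_attention_mask(attention_mask: torch.Tensor, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # [batch_size, seq_length] -> [batch_size, 1, 1, seq_length], broadcastable over all attention heads.
    extended = attention_mask[:, None, None, :].to(dtype=dtype)
    # Turn 1/0 keep/mask flags into additive biases (0 for keep, large negative for mask).
    # The result stays on attention_mask.device automatically, so no explicit device argument is required.
    return (1.0 - extended) * torch.finfo(dtype).min

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # toy padded batch
print(build_extended_attention_mask(mask).shape)   # torch.Size([2, 1, 1, 4])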
src/transformers/models/roberta/modeling_roberta.py (+1 / -1)
src/transformers/models/roformer/modeling_roformer.py (+1 / -1)
src/transformers/models/splinter/modeling_splinter.py (+1 / -1)
src/transformers/models/squeezebert/modeling_squeezebert.py (+1 / -1)
src/transformers/models/t5/modeling_t5.py (+1 / -1)
src/transformers/models/tapas/modeling_tapas.py (+1 / -1)
src/transformers/models/vilt/modeling_vilt.py (+1 / -1)
src/transformers/models/visual_bert/modeling_visual_bert.py (+2 / -2)
src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py (+1 / -1)
src/transformers/models/yoso/modeling_yoso.py (+1 / -1)
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py (+1 / -1)
src/transformers/models/roberta/modeling_roberta.py

@@ -817,7 +817,7 @@ class RobertaModel(RobertaPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/roformer/modeling_roformer.py

@@ -900,7 +900,7 @@ class RoFormerModel(RoFormerPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/splinter/modeling_splinter.py

@@ -710,7 +710,7 @@ class SplinterModel(SplinterPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/squeezebert/modeling_squeezebert.py

@@ -612,7 +612,7 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel):
         if token_type_ids is None:
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
src/transformers/models/t5/modeling_t5.py

@@ -957,7 +957,7 @@ class T5Stack(T5PreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/tapas/modeling_tapas.py

@@ -954,7 +954,7 @@ class TapasModel(TapasPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D ou 3D attention mask is provided for the cross-attention
         # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/vilt/modeling_vilt.py

@@ -843,7 +843,7 @@ class ViltModel(ViltPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

         encoder_outputs = self.encoder(
             embedding_output,
src/transformers/models/visual_bert/modeling_visual_bert.py

@@ -794,12 +794,12 @@ class VisualBertModel(VisualBertPreTrainedModel):
         if visual_embeds is not None:
             combined_attention_mask = torch.cat((attention_mask, visual_attention_mask), dim=-1)
             extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                combined_attention_mask, [batch_size, input_shape + visual_input_shape], device
+                combined_attention_mask, (batch_size, input_shape + visual_input_shape)
             )
         else:
             extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                attention_mask, [batch_size, input_shape], device
+                attention_mask, (batch_size, input_shape)
             )

         # Prepare head mask if needed
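The VisualBert hunks above make one extra adjustment: besides dropping device, the combined text-plus-visual shape is now passed as a tuple (batch_size, input_shape + visual_input_shape) rather than a list. A short standalone sketch of how that combined mask is formed before being extended (toy sizes, not the library's code):

import torch

batch_size, text_len, visual_len = 2, 5, 3
attention_mask = torch.ones(batch_size, text_len)            # mask over text tokens
visual_attention_mask = torch.ones(batch_size, visual_len)   # mask over visual tokens

# Concatenate along the sequence dimension, as in the hunk above.
combined_attention_mask = torch.cat((attention_mask, visual_attention_mask), dim=-1)
print(combined_attention_mask.shape)          # torch.Size([2, 8])
print((batch_size, text_len + visual_len))    # shape argument now passed as a tuple: (2, 8)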
src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py

@@ -788,7 +788,7 @@ class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/yoso/modeling_yoso.py

@@ -816,7 +816,7 @@ class YosoModel(YosoPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py

@@ -876,7 +876,7 @@ class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelna
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]