chenpangpang / transformers · Commits · 39f8eafc
"src/sdk/vscode:/vscode.git/clone" did not exist on "7e35d32e2987493838779826155f7434bc30b81c"
Unverified Commit 39f8eafc authored May 03, 2022 by Pavel Belevich, committed by GitHub on May 03, 2022
Remove device parameter from create_extended_attention_mask_for_decoder (#16894)
parent dd739f70
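The change itself is mechanical: every call site drops the trailing device argument, because the mask utilities can read the device from the attention mask tensor they already receive. Below is a minimal sketch of that idea as a standalone helper, not the actual Transformers implementation; the name make_causal_extended_mask and its exact shape handling are illustrative assumptions.

import torch


def make_causal_extended_mask(attention_mask: torch.Tensor, input_shape) -> torch.Tensor:
    # Hypothetical helper: the device is inferred from the mask itself,
    # so callers no longer need to pass a separate `device` argument.
    batch_size, seq_length = input_shape
    device = attention_mask.device

    # Lower-triangular causal mask: position i may attend to positions <= i.
    seq_ids = torch.arange(seq_length, device=device)
    causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
    causal_mask = causal_mask.to(attention_mask.dtype)

    # Combine the causal mask with the padding mask; the result broadcasts
    # to [batch_size, num_heads, seq_length, seq_length] inside attention.
    return causal_mask[:, None, :, :] * attention_mask[:, None, None, :]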
Changes: 31
Showing 11 changed files with 12 additions and 12 deletions
src/transformers/models/roberta/modeling_roberta.py  +1 -1
src/transformers/models/roformer/modeling_roformer.py  +1 -1
src/transformers/models/splinter/modeling_splinter.py  +1 -1
src/transformers/models/squeezebert/modeling_squeezebert.py  +1 -1
src/transformers/models/t5/modeling_t5.py  +1 -1
src/transformers/models/tapas/modeling_tapas.py  +1 -1
src/transformers/models/vilt/modeling_vilt.py  +1 -1
src/transformers/models/visual_bert/modeling_visual_bert.py  +2 -2
src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py  +1 -1
src/transformers/models/yoso/modeling_yoso.py  +1 -1
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py  +1 -1
src/transformers/models/roberta/modeling_roberta.py

@@ -817,7 +817,7 @@ class RobertaModel(RobertaPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/roformer/modeling_roformer.py

@@ -900,7 +900,7 @@ class RoFormerModel(RoFormerPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/splinter/modeling_splinter.py

@@ -710,7 +710,7 @@ class SplinterModel(SplinterPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/squeezebert/modeling_squeezebert.py

@@ -612,7 +612,7 @@ class SqueezeBertModel(SqueezeBertPreTrainedModel):
         if token_type_ids is None:
             token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
src/transformers/models/t5/modeling_t5.py

@@ -957,7 +957,7 @@ class T5Stack(T5PreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/tapas/modeling_tapas.py

@@ -954,7 +954,7 @@ class TapasModel(TapasPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D ou 3D attention mask is provided for the cross-attention
         # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/vilt/modeling_vilt.py

@@ -843,7 +843,7 @@ class ViltModel(ViltPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         encoder_outputs = self.encoder(
             embedding_output,
src/transformers/models/visual_bert/modeling_visual_bert.py

@@ -794,12 +794,12 @@ class VisualBertModel(VisualBertPreTrainedModel):
         if visual_embeds is not None:
             combined_attention_mask = torch.cat((attention_mask, visual_attention_mask), dim=-1)
             extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                combined_attention_mask, [batch_size, input_shape + visual_input_shape], device
+                combined_attention_mask, (batch_size, input_shape + visual_input_shape)
             )
         else:
             extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
-                attention_mask, [batch_size, input_shape], device
+                attention_mask, (batch_size, input_shape)
             )
         # Prepare head mask if needed
src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py

@@ -788,7 +788,7 @@ class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
src/transformers/models/yoso/modeling_yoso.py

@@ -816,7 +816,7 @@ class YosoModel(YosoPreTrainedModel):
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py

@@ -876,7 +876,7 @@ class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelna
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
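Every hunk above follows the same pattern: get_extended_attention_mask(attention_mask, input_shape, device) becomes get_extended_attention_mask(attention_mask, input_shape). As a quick check of the inferred-device behavior, the illustrative helper sketched near the top of this page can be exercised directly; make_causal_extended_mask is that hypothetical function, not a Transformers API.

import torch

# Assumes make_causal_extended_mask from the sketch above the diff listing.
attention_mask = torch.ones(2, 5, dtype=torch.long)           # [batch_size, seq_length] padding mask
extended = make_causal_extended_mask(attention_mask, (2, 5))  # no device argument needed
assert extended.shape == (2, 1, 5, 5)
assert extended.device == attention_mask.device               # device comes from the mask itself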