Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
934d0b8b
Unverified
Commit
934d0b8b
authored
Mar 06, 2023
by
saswatmeher
Committed by
GitHub
Mar 06, 2023
Browse files
Fix bert issue (#21963)
Co-authored-by:
saswatmeher
<
saswatmeher@cse.iitb.ac.in
>
parent
0bb17295
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
140 additions
and
100 deletions
+140
-100
src/transformers/models/align/modeling_align.py
src/transformers/models/align/modeling_align.py
+7
-5
src/transformers/models/altclip/modeling_altclip.py
src/transformers/models/altclip/modeling_altclip.py
+7
-5
src/transformers/models/bert/modeling_bert.py
src/transformers/models/bert/modeling_bert.py
+7
-5
src/transformers/models/bert_generation/modeling_bert_generation.py
...ormers/models/bert_generation/modeling_bert_generation.py
+7
-5
src/transformers/models/bridgetower/modeling_bridgetower.py
src/transformers/models/bridgetower/modeling_bridgetower.py
+7
-5
src/transformers/models/camembert/modeling_camembert.py
src/transformers/models/camembert/modeling_camembert.py
+7
-5
src/transformers/models/chinese_clip/modeling_chinese_clip.py
...transformers/models/chinese_clip/modeling_chinese_clip.py
+7
-5
src/transformers/models/clap/modeling_clap.py
src/transformers/models/clap/modeling_clap.py
+7
-5
src/transformers/models/data2vec/modeling_data2vec_text.py
src/transformers/models/data2vec/modeling_data2vec_text.py
+7
-5
src/transformers/models/electra/modeling_electra.py
src/transformers/models/electra/modeling_electra.py
+7
-5
src/transformers/models/ernie/modeling_ernie.py
src/transformers/models/ernie/modeling_ernie.py
+7
-5
src/transformers/models/layoutlm/modeling_layoutlm.py
src/transformers/models/layoutlm/modeling_layoutlm.py
+7
-5
src/transformers/models/markuplm/modeling_markuplm.py
src/transformers/models/markuplm/modeling_markuplm.py
+7
-5
src/transformers/models/nezha/modeling_nezha.py
src/transformers/models/nezha/modeling_nezha.py
+7
-5
src/transformers/models/realm/modeling_realm.py
src/transformers/models/realm/modeling_realm.py
+7
-5
src/transformers/models/roberta/modeling_roberta.py
src/transformers/models/roberta/modeling_roberta.py
+7
-5
src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
...els/roberta_prelayernorm/modeling_roberta_prelayernorm.py
+7
-5
src/transformers/models/roc_bert/modeling_roc_bert.py
src/transformers/models/roc_bert/modeling_roc_bert.py
+7
-5
src/transformers/models/splinter/modeling_splinter.py
src/transformers/models/splinter/modeling_splinter.py
+7
-5
src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
+7
-5
No files found.
src/transformers/models/align/modeling_align.py
View file @
934d0b8b
...
...
@@ -1077,6 +1077,13 @@ class AlignTextEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -1086,11 +1093,6 @@ class AlignTextEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/altclip/modeling_altclip.py
View file @
934d0b8b
...
...
@@ -628,6 +628,13 @@ class AltRobertaEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -637,11 +644,6 @@ class AltRobertaEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/bert/modeling_bert.py
View file @
934d0b8b
...
...
@@ -575,6 +575,13 @@ class BertEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -584,11 +591,6 @@ class BertEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/bert_generation/modeling_bert_generation.py
View file @
934d0b8b
...
...
@@ -385,6 +385,13 @@ class BertEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -394,11 +401,6 @@ class BertEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/bridgetower/modeling_bridgetower.py
View file @
934d0b8b
...
...
@@ -760,6 +760,13 @@ class BridgeTowerTextEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -769,11 +776,6 @@ class BridgeTowerTextEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/camembert/modeling_camembert.py
View file @
934d0b8b
...
...
@@ -506,6 +506,13 @@ class CamembertEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -515,11 +522,6 @@ class CamembertEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/chinese_clip/modeling_chinese_clip.py
View file @
934d0b8b
...
...
@@ -891,6 +891,13 @@ class ChineseCLIPTextEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -900,11 +907,6 @@ class ChineseCLIPTextEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/clap/modeling_clap.py
View file @
934d0b8b
...
...
@@ -1578,6 +1578,13 @@ class ClapTextEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -1587,11 +1594,6 @@ class ClapTextEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/data2vec/modeling_data2vec_text.py
View file @
934d0b8b
...
...
@@ -492,6 +492,13 @@ class Data2VecTextEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -501,11 +508,6 @@ class Data2VecTextEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/electra/modeling_electra.py
View file @
934d0b8b
...
...
@@ -553,6 +553,13 @@ class ElectraEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -562,11 +569,6 @@ class ElectraEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/ernie/modeling_ernie.py
View file @
934d0b8b
...
...
@@ -488,6 +488,13 @@ class ErnieEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -497,11 +504,6 @@ class ErnieEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/layoutlm/modeling_layoutlm.py
View file @
934d0b8b
...
...
@@ -469,6 +469,13 @@ class LayoutLMEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -478,11 +485,6 @@ class LayoutLMEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/markuplm/modeling_markuplm.py
View file @
934d0b8b
...
...
@@ -630,6 +630,13 @@ class MarkupLMEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -639,11 +646,6 @@ class MarkupLMEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/nezha/modeling_nezha.py
View file @
934d0b8b
...
...
@@ -561,6 +561,13 @@ class NezhaEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -570,11 +577,6 @@ class NezhaEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/realm/modeling_realm.py
View file @
934d0b8b
...
...
@@ -568,6 +568,13 @@ class RealmEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -577,11 +584,6 @@ class RealmEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/roberta/modeling_roberta.py
View file @
934d0b8b
...
...
@@ -492,6 +492,13 @@ class RobertaEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -501,11 +508,6 @@ class RobertaEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
View file @
934d0b8b
...
...
@@ -494,6 +494,13 @@ class RobertaPreLayerNormEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -503,11 +510,6 @@ class RobertaPreLayerNormEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/roc_bert/modeling_roc_bert.py
View file @
934d0b8b
...
...
@@ -626,6 +626,13 @@ class RoCBertEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -635,11 +642,6 @@ class RoCBertEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/splinter/modeling_splinter.py
View file @
934d0b8b
...
...
@@ -441,6 +441,13 @@ class SplinterEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -450,11 +457,6 @@ class SplinterEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
View file @
934d0b8b
...
...
@@ -493,6 +493,13 @@ class XLMRobertaEncoder(nn.Module):
all_self_attentions
=
()
if
output_attentions
else
None
all_cross_attentions
=
()
if
output_attentions
and
self
.
config
.
add_cross_attention
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
next_decoder_cache
=
()
if
use_cache
else
None
for
i
,
layer_module
in
enumerate
(
self
.
layer
):
if
output_hidden_states
:
...
...
@@ -502,11 +509,6 @@ class XLMRobertaEncoder(nn.Module):
past_key_value
=
past_key_values
[
i
]
if
past_key_values
is
not
None
else
None
if
self
.
gradient_checkpointing
and
self
.
training
:
if
use_cache
:
logger
.
warning_once
(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache
=
False
def
create_custom_forward
(
module
):
def
custom_forward
(
*
inputs
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment