a2a3afbc
Unverified
Commit
a2a3afbc
authored
Sep 14, 2022
by
Sylvain Gugger
Committed by
GitHub
Sep 14, 2022
Browse files
PyTorch >= 1.7.0 and TensorFlow >= 2.4.0 (#19016)
parent
9f4acd05
Changes
30
Show whitespace changes
Inline
Side-by-side
Showing 20 changed files with 73 additions and 216 deletions
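Most of the diff below deletes runtime version gates that become dead code once torch >= 1.7 and TensorFlow >= 2.4 are the minimum. For orientation, here is a minimal sketch (not part of the commit) of how such a gate is typically computed; the flag names are the ones removed in the files below, and the computation mirrors the `packaging.version` pattern visible in the removed activations.py lines.

# Illustration only: the kind of version gate this commit makes unnecessary.
import torch
from packaging import version

# base_version strips local/dev suffixes such as "+cu117" before comparing
parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version)
is_torch_greater_or_equal_than_1_6 = parsed_torch_version_base >= version.parse("1.6")

if is_torch_greater_or_equal_than_1_6:
    # e.g. register non-persistent buffers, use torch.cuda.amp.autocast, nn.functional.silu, ...
    pass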
setup.py (+2, -2)
src/transformers/activations.py (+2, -12)
src/transformers/dependency_versions_table.py (+2, -2)
src/transformers/models/albert/modeling_albert.py (+4, -12)
src/transformers/models/bert/modeling_bert.py (+4, -12)
src/transformers/models/big_bird/modeling_big_bird.py (+4, -7)
src/transformers/models/convbert/modeling_convbert.py (+4, -12)
src/transformers/models/data2vec/modeling_data2vec_text.py (+4, -12)
src/transformers/models/decision_transformer/modeling_decision_transformer.py (+4, -21)
src/transformers/models/distilbert/modeling_distilbert.py (+4, -10)
src/transformers/models/electra/modeling_electra.py (+4, -12)
src/transformers/models/ernie/modeling_ernie.py (+4, -12)
src/transformers/models/flaubert/modeling_flaubert.py (+3, -5)
src/transformers/models/flava/modeling_flava.py (+3, -7)
src/transformers/models/fnet/modeling_fnet.py (+4, -7)
src/transformers/models/gpt2/modeling_gpt2.py (+3, -20)
src/transformers/models/imagegpt/modeling_imagegpt.py (+3, -20)
src/transformers/models/mctct/modeling_mctct.py (+5, -7)
src/transformers/models/nezha/modeling_nezha.py (+4, -12)
src/transformers/models/nystromformer/modeling_nystromformer.py (+6, -12)

setup.py
@@ -155,13 +155,13 @@ _deps = [
     "librosa",
     "starlette",
     "tensorflow-cpu>=2.3",
-    "tensorflow>=2.3",
+    "tensorflow>=2.4",
     "tensorflow-text",
     "tf2onnx",
     "timeout-decorator",
     "timm",
     "tokenizers>=0.11.1,!=0.11.3,<0.13",
-    "torch>=1.0,!=0.12.0",
+    "torch>=1.7,!=1.12.0",
     "torchaudio",
     "pyctcdecode>=0.3.0",
     "tqdm>=4.27",

src/transformers/activations.py
@@ -44,7 +44,7 @@ class GELUActivation(nn.Module):

     def __init__(self, use_gelu_python: bool = False):
         super().__init__()
-        if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.4") or use_gelu_python:
+        if use_gelu_python:
             self.act = self._gelu_python
         else:
             self.act = nn.functional.gelu
@@ -108,18 +108,8 @@ class SiLUActivation(nn.Module):
     later.
     """

-    def __init__(self):
-        super().__init__()
-        if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.7"):
-            self.act = self._silu_python
-        else:
-            self.act = nn.functional.silu
-
-    def _silu_python(self, input: Tensor) -> Tensor:
-        return input * torch.sigmoid(input)
-
     def forward(self, input: Tensor) -> Tensor:
-        return self.act(input)
+        return nn.functional.silu(input)


 class MishActivation(nn.Module):
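The SiLU simplification above relies on `nn.functional.silu`, which ships natively with torch >= 1.7 and matches the removed `_silu_python` fallback. A small sanity check, assuming any torch >= 1.7 install (illustration only, not part of the commit):

import torch
import torch.nn.functional as F

x = torch.randn(4, 8)
manual = x * torch.sigmoid(x)   # what the removed _silu_python fallback computed
native = F.silu(x)              # what forward() now calls directly
print(torch.allclose(manual, native, atol=1e-6))  # True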

src/transformers/dependency_versions_table.py
@@ -61,13 +61,13 @@ deps = {
     "librosa": "librosa",
     "starlette": "starlette",
     "tensorflow-cpu": "tensorflow-cpu>=2.3",
-    "tensorflow": "tensorflow>=2.3",
+    "tensorflow": "tensorflow>=2.4",
     "tensorflow-text": "tensorflow-text",
     "tf2onnx": "tf2onnx",
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
     "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.13",
-    "torch": "torch>=1.0,!=0.12.0",
+    "torch": "torch>=1.7,!=1.12.0",
     "torchaudio": "torchaudio",
     "pyctcdecode": "pyctcdecode>=0.3.0",
     "tqdm": "tqdm>=4.27",
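For reference, the new `torch>=1.7,!=1.12.0` pin behaves like any pip specifier set. A quick illustration using `packaging` (which pip vendors); the candidate versions are arbitrary examples, not from the commit:

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=1.7,!=1.12.0")
for candidate in ["1.6.0", "1.7.1", "1.12.0", "1.12.1"]:
    # 1.6.0 and 1.12.0 are rejected, 1.7.1 and 1.12.1 are accepted
    print(candidate, candidate in spec)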

src/transformers/models/albert/modeling_albert.py
@@ -34,12 +34,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -216,11 +211,8 @@ class AlbertEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )

     # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward

src/transformers/models/bert/modeling_bert.py
@@ -40,12 +40,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -199,11 +194,8 @@ class BertEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )

     def forward(
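This embedding change, repeated across most model files below, works because `register_buffer(..., persistent=False)` has been available since torch 1.6, so the guard is no longer needed. A minimal sketch of the behavior, using a hypothetical `Toy` module rather than the real `BertEmbeddings`:

import torch
from torch import nn

class Toy(nn.Module):
    def __init__(self, max_len=4):
        super().__init__()
        # non-persistent buffer: moves with the module, excluded from state_dict
        self.register_buffer("token_type_ids", torch.zeros((1, max_len), dtype=torch.long), persistent=False)

m = Toy()
print(m.token_type_ids.shape)               # torch.Size([1, 4]); m.to(device) would move it too
print("token_type_ids" in m.state_dict())   # False: it stays out of checkpoints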

src/transformers/models/big_bird/modeling_big_bird.py
@@ -37,7 +37,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, is_torch_greater_than_1_6
+from ...pytorch_utils import apply_chunking_to_forward
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -259,11 +259,8 @@ class BigBirdEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
         # End copy

src/transformers/models/convbert/modeling_convbert.py
@@ -35,12 +35,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, SequenceSummary
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_convbert import ConvBertConfig
@@ -198,11 +193,8 @@ class ConvBertEmbeddings(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )

     def forward(

src/transformers/models/data2vec/modeling_data2vec_text.py
@@ -34,12 +34,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -87,11 +82,8 @@ class Data2VecTextForTextEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
         # End copy

src/transformers/models/decision_transformer/modeling_decision_transformer.py
@@ -22,15 +22,12 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
+from torch.cuda.amp import autocast

 from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    Conv1D,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_or_equal_than_1_6,
-    prune_conv1d_layer,
-)
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
 from ...utils import (
     ModelOutput,
     add_start_docstrings,
@@ -38,15 +35,6 @@ from ...utils import (
     logging,
     replace_return_docstrings,
 )
-
-
-if is_torch_greater_or_equal_than_1_6:
-    is_amp_available = True
-    from torch.cuda.amp import autocast
-else:
-    is_amp_available = False
-
-from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
 from .configuration_decision_transformer import DecisionTransformerConfig
@@ -235,15 +223,10 @@ class DecisionTransformerGPT2Attention(nn.Module):
             scale_factor /= float(self.layer_idx + 1)

         # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
-        if is_amp_available:
-            with autocast(enabled=False):
-                q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
-                attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
-                attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
-        else:
-            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
-            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
-            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+        with autocast(enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

         if not self.is_cross_attention:
             # if only "normal" attention layer implements causal mask
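The attention hunk above keeps only the `torch.cuda.amp.autocast(enabled=False)` path, since `torch.cuda.amp` exists in every torch >= 1.7. A minimal sketch of the upcast-under-autocast pattern with toy shapes; `upcast_matmul` is a made-up helper, not code from the commit:

import torch
from torch.cuda.amp import autocast

def upcast_matmul(q, k, scale):
    # Disable autocast locally so the product runs in fp32 even inside a mixed-precision region.
    with autocast(enabled=False):
        return torch.bmm(q.float(), k.float()) * scale

q = torch.randn(2, 5, 8)
k = torch.randn(2, 8, 5)
print(upcast_matmul(q, k, 0.125).shape)  # torch.Size([2, 5, 5])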

src/transformers/models/distilbert/modeling_distilbert.py
@@ -39,12 +39,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -106,7 +101,6 @@ class Embeddings(nn.Module):
         self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
         self.dropout = nn.Dropout(config.dropout)

-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-            )
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )

src/transformers/models/electra/modeling_electra.py
@@ -36,12 +36,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, SequenceSummary
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -169,11 +164,8 @@ class ElectraEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )

     # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward

src/transformers/models/ernie/modeling_ernie.py
@@ -38,12 +38,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -96,11 +91,8 @@ class ErnieEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )

     def forward(

src/transformers/models/flaubert/modeling_flaubert.py
@@ -22,7 +22,6 @@ import torch
 from torch import nn

 from ...modeling_outputs import BaseModelOutput
-from ...pytorch_utils import is_torch_greater_than_1_6
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from ..xlm.modeling_xlm import (
     XLMForMultipleChoice,
@@ -139,7 +138,6 @@ class FlaubertModel(XLMModel):
         super().__init__(config)
         self.layerdrop = getattr(config, "layerdrop", 0.0)
         self.pre_norm = getattr(config, "pre_norm", False)
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
-            )
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )

src/transformers/models/flava/modeling_flava.py
@@ -29,7 +29,6 @@ from transformers.utils.doc import add_code_sample_docstrings
 from ...activations import ACT2FN
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...pytorch_utils import is_torch_greater_than_1_6
 from ...utils import (
     ModelOutput,
     add_start_docstrings,
@@ -392,11 +391,8 @@ class FlavaTextEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )

     def forward(

src/transformers/models/fnet/modeling_fnet.py
@@ -43,7 +43,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, is_torch_greater_than_1_6
+from ...pytorch_utils import apply_chunking_to_forward
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -117,11 +117,8 @@ class FNetEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )

     def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):

src/transformers/models/gpt2/modeling_gpt2.py
@@ -23,22 +23,9 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
+from torch.cuda.amp import autocast
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

-from ...pytorch_utils import (
-    Conv1D,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_or_equal_than_1_6,
-    prune_conv1d_layer,
-)
-
-
-if is_torch_greater_or_equal_than_1_6:
-    is_amp_available = True
-    from torch.cuda.amp import autocast
-else:
-    is_amp_available = False
-
 from ...activations import ACT2FN
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -47,6 +34,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, SequenceSummary
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -247,15 +235,10 @@ class GPT2Attention(nn.Module):
             scale_factor /= float(self.layer_idx + 1)

         # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
-        if is_amp_available:
-            with autocast(enabled=False):
-                q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
-                attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
-                attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
-        else:
-            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
-            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
-            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+        with autocast(enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

         if not self.is_cross_attention:
             # if only "normal" attention layer implements causal mask
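As a reminder of what the retained `torch.baddbmm` call computes: `out = beta * input + alpha * (batch1 @ batch2)`, so with `beta=0` the first argument only supplies shape and dtype. A toy check, not from the commit:

import torch

bsz_heads, q_len, k_len, dk = 6, 4, 4, 8
buf = torch.zeros(bsz_heads, q_len, k_len)   # ignored because beta=0
q = torch.randn(bsz_heads, q_len, dk)
k = torch.randn(bsz_heads, dk, k_len)

attn = torch.baddbmm(buf, q, k, beta=0, alpha=0.125)
print(torch.allclose(attn, 0.125 * torch.bmm(q, k)))  # True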

src/transformers/models/imagegpt/modeling_imagegpt.py
@@ -22,22 +22,9 @@ from typing import Any, Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
+from torch.cuda.amp import autocast
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

-from ...pytorch_utils import (
-    Conv1D,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_or_equal_than_1_6,
-    prune_conv1d_layer,
-)
-
-
-if is_torch_greater_or_equal_than_1_6:
-    is_amp_available = True
-    from torch.cuda.amp import autocast
-else:
-    is_amp_available = False
-
 from ...activations import ACT2FN
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
@@ -45,6 +32,7 @@ from ...modeling_outputs import (
     SequenceClassifierOutputWithPast,
 )
 from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_imagegpt import ImageGPTConfig
@@ -299,15 +287,10 @@ class ImageGPTAttention(nn.Module):
             scale_factor /= float(self.layer_idx + 1)

         # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
-        if is_amp_available:
-            with autocast(enabled=False):
-                q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
-                attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
-                attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
-        else:
-            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
-            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
-            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+        with autocast(enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

         if not self.is_cross_attention:
             # if only "normal" attention layer implements causal mask

src/transformers/models/mctct/modeling_mctct.py
@@ -33,7 +33,6 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...pytorch_utils import is_torch_greater_than_1_6
 from ...utils import logging
 from .configuration_mctct import MCTCTConfig
@@ -153,7 +152,6 @@ class MCTCTEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids",
+            torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
+            persistent=False,
+        )

src/transformers/models/nezha/modeling_nezha.py
@@ -38,12 +38,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
@@ -187,11 +182,8 @@ class NezhaEmbeddings(nn.Module):
         # any TensorFlow checkpoint file
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros((1, config.max_position_embeddings), dtype=torch.long),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids", torch.zeros((1, config.max_position_embeddings), dtype=torch.long), persistent=False
+        )

     def forward(

src/transformers/models/nystromformer/modeling_nystromformer.py
@@ -33,12 +33,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import (
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    is_torch_greater_than_1_6,
-    prune_linear_layer,
-)
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_nystromformer import NystromformerConfig
@@ -72,7 +67,6 @@ class NystromformerEmbeddings(nn.Module):
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2)
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        if is_torch_greater_than_1_6:
-            self.register_buffer(
-                "token_type_ids",
-                torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
-                persistent=False,
-            )
+        self.register_buffer(
+            "token_type_ids",
+            torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
+            persistent=False,
+        )