Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
1ed2ebf6
Unverified
Commit
1ed2ebf6
authored
Jun 14, 2021
by
Stas Bekman
Committed by
GitHub
Jun 14, 2021
Browse files
[style] consistent nn. and nn.functional (#12124)
* consistent nn. and nn.functional * fix glitch * fix glitch #2
parent
ff7c8168
Changes
63
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
207 additions
and
201 deletions
+207
-201
src/transformers/activations.py
src/transformers/activations.py
+7
-7
src/transformers/generation_utils.py
src/transformers/generation_utils.py
+15
-9
src/transformers/modeling_fx_utils.py
src/transformers/modeling_fx_utils.py
+2
-1
src/transformers/modeling_utils.py
src/transformers/modeling_utils.py
+14
-17
src/transformers/models/albert/modeling_albert.py
src/transformers/models/albert/modeling_albert.py
+1
-1
src/transformers/models/bart/modeling_bart.py
src/transformers/models/bart/modeling_bart.py
+13
-14
src/transformers/models/bert_generation/modeling_bert_generation.py
...ormers/models/bert_generation/modeling_bert_generation.py
+1
-1
src/transformers/models/big_bird/modeling_big_bird.py
src/transformers/models/big_bird/modeling_big_bird.py
+14
-11
src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
...ormers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+23
-20
src/transformers/models/blenderbot/modeling_blenderbot.py
src/transformers/models/blenderbot/modeling_blenderbot.py
+13
-14
src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
...mers/models/blenderbot_small/modeling_blenderbot_small.py
+13
-14
src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
...t/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
+3
-2
src/transformers/models/clip/modeling_clip.py
src/transformers/models/clip/modeling_clip.py
+4
-5
src/transformers/models/convbert/modeling_convbert.py
src/transformers/models/convbert/modeling_convbert.py
+1
-1
src/transformers/models/ctrl/modeling_ctrl.py
src/transformers/models/ctrl/modeling_ctrl.py
+12
-12
src/transformers/models/deberta/modeling_deberta.py
src/transformers/models/deberta/modeling_deberta.py
+12
-14
src/transformers/models/deberta_v2/modeling_deberta_v2.py
src/transformers/models/deberta_v2/modeling_deberta_v2.py
+6
-6
src/transformers/models/detr/feature_extraction_detr.py
src/transformers/models/detr/feature_extraction_detr.py
+8
-6
src/transformers/models/detr/modeling_detr.py
src/transformers/models/detr/modeling_detr.py
+44
-45
src/transformers/models/distilbert/modeling_distilbert.py
src/transformers/models/distilbert/modeling_distilbert.py
+1
-1
No files found.
src/transformers/activations.py
View file @
1ed2ebf6
...
...
@@ -15,8 +15,8 @@
import
math
import
torch
import
torch.nn.functional
as
F
from
packaging
import
version
from
torch
import
nn
from
.utils
import
logging
...
...
@@ -28,8 +28,8 @@ def _gelu_python(x):
"""
Original Implementation of the GELU activation function in Google BERT repo when initially created. For
information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
torch.nn.functional
Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
nn.functional
Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
return
x
*
0.5
*
(
1.0
+
torch
.
erf
(
x
/
math
.
sqrt
(
2.0
)))
...
...
@@ -45,7 +45,7 @@ def gelu_new(x):
if
version
.
parse
(
torch
.
__version__
)
<
version
.
parse
(
"1.4"
):
gelu
=
_gelu_python
else
:
gelu
=
F
.
gelu
gelu
=
nn
.
functional
.
gelu
def
gelu_fast
(
x
):
...
...
@@ -70,11 +70,11 @@ def _silu_python(x):
if
version
.
parse
(
torch
.
__version__
)
<
version
.
parse
(
"1.7"
):
silu
=
_silu_python
else
:
silu
=
F
.
silu
silu
=
nn
.
functional
.
silu
def
mish
(
x
):
return
x
*
torch
.
tanh
(
torch
.
nn
.
functional
.
softplus
(
x
))
return
x
*
torch
.
tanh
(
nn
.
functional
.
softplus
(
x
))
def
linear_act
(
x
):
...
...
@@ -82,7 +82,7 @@ def linear_act(x):
ACT2FN
=
{
"relu"
:
F
.
relu
,
"relu"
:
nn
.
functional
.
relu
,
"silu"
:
silu
,
"swish"
:
silu
,
"gelu"
:
gelu
,
...
...
src/transformers/generation_utils.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import
torch
import
torch.distributed
as
dist
from
torch
.nn
import
functional
as
F
from
torch
import
nn
from
.file_utils
import
ModelOutput
from
.generation_beam_search
import
BeamScorer
,
BeamSearchScorer
...
...
@@ -1564,7 +1564,7 @@ class GenerationMixin:
)
# sample
probs
=
F
.
softmax
(
next_token_scores
,
dim
=-
1
)
probs
=
nn
.
functional
.
softmax
(
next_token_scores
,
dim
=-
1
)
next_tokens
=
torch
.
multinomial
(
probs
,
num_samples
=
1
).
squeeze
(
1
)
# finished sentences should have their next token be a padding token
...
...
@@ -1801,9 +1801,11 @@ class GenerationMixin:
next_token_logits
=
outputs
.
logits
[:,
-
1
,
:]
# hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
# cannot be generated both before and after the `
F
.log_softmax` operation.
# cannot be generated both before and after the `
nn.functional
.log_softmax` operation.
next_token_logits
=
self
.
adjust_logits_during_generation
(
next_token_logits
,
cur_len
=
cur_len
)
next_token_scores
=
F
.
log_softmax
(
next_token_logits
,
dim
=-
1
)
# (batch_size * num_beams, vocab_size)
next_token_scores
=
nn
.
functional
.
log_softmax
(
next_token_logits
,
dim
=-
1
)
# (batch_size * num_beams, vocab_size)
next_token_scores
=
logits_processor
(
input_ids
,
next_token_scores
)
next_token_scores
=
next_token_scores
+
beam_scores
[:,
None
].
expand_as
(
next_token_scores
)
...
...
@@ -2098,9 +2100,11 @@ class GenerationMixin:
next_token_logits
=
outputs
.
logits
[:,
-
1
,
:]
# hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
# cannot be generated both before and after the `
F
.log_softmax` operation.
# cannot be generated both before and after the `
nn.functional
.log_softmax` operation.
next_token_logits
=
self
.
adjust_logits_during_generation
(
next_token_logits
,
cur_len
=
cur_len
)
next_token_scores
=
F
.
log_softmax
(
next_token_logits
,
dim
=-
1
)
# (batch_size * num_beams, vocab_size)
next_token_scores
=
nn
.
functional
.
log_softmax
(
next_token_logits
,
dim
=-
1
)
# (batch_size * num_beams, vocab_size)
next_token_scores
=
logits_processor
(
input_ids
,
next_token_scores
)
next_token_scores
=
next_token_scores
+
beam_scores
[:,
None
].
expand_as
(
next_token_scores
)
...
...
@@ -2128,7 +2132,7 @@ class GenerationMixin:
vocab_size
=
next_token_scores
.
shape
[
-
1
]
next_token_scores
=
next_token_scores
.
view
(
batch_size
,
num_beams
*
vocab_size
)
probs
=
F
.
softmax
(
next_token_scores
,
dim
=-
1
)
probs
=
nn
.
functional
.
softmax
(
next_token_scores
,
dim
=-
1
)
next_tokens
=
torch
.
multinomial
(
probs
,
num_samples
=
2
*
num_beams
)
next_token_scores
=
torch
.
gather
(
next_token_scores
,
-
1
,
next_tokens
)
...
...
@@ -2426,9 +2430,11 @@ class GenerationMixin:
next_token_logits
=
outputs
.
logits
[
batch_group_indices
,
-
1
,
:]
# hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
# cannot be generated both before and after the `
F
.log_softmax` operation.
# cannot be generated both before and after the `
nn.functional
.log_softmax` operation.
next_token_logits
=
self
.
adjust_logits_during_generation
(
next_token_logits
,
cur_len
=
cur_len
)
next_token_scores
=
F
.
log_softmax
(
next_token_logits
,
dim
=-
1
)
# (batch_size * group_size, vocab_size)
next_token_scores
=
nn
.
functional
.
log_softmax
(
next_token_logits
,
dim
=-
1
)
# (batch_size * group_size, vocab_size)
vocab_size
=
next_token_scores
.
shape
[
-
1
]
next_token_scores
=
logits_processor
(
...
...
src/transformers/modeling_fx_utils.py
View file @
1ed2ebf6
...
...
@@ -4,6 +4,7 @@ import inspect
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Union
import
torch
from
torch
import
nn
from
torch.fx
import
Graph
,
GraphModule
,
Node
,
Proxy
,
Tracer
from
torch.fx.node
import
Argument
...
...
@@ -277,7 +278,7 @@ class HFTracer(Tracer):
return
path
def
path_of_module
(
self
,
mod
:
torch
.
nn
.
Module
)
->
str
:
def
path_of_module
(
self
,
mod
:
nn
.
Module
)
->
str
:
"""
Helper method to find the qualified name of ``mod`` in the Module hierarchy of ``root``. For example, if
``root`` has a submodule named ``foo``, which has a submodule named ``bar``, passing ``bar`` into this function
...
...
src/transformers/modeling_utils.py
View file @
1ed2ebf6
...
...
@@ -25,7 +25,6 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
import
torch
from
torch
import
Tensor
,
device
,
dtype
,
nn
from
torch.nn
import
CrossEntropyLoss
from
torch.nn
import
functional
as
F
from
.activations
import
get_activation
from
.configuration_utils
import
PretrainedConfig
...
...
@@ -355,9 +354,7 @@ class ModuleUtilsMixin:
"""
def
parameter_filter
(
x
):
return
(
x
.
requires_grad
or
not
only_trainable
)
and
not
(
isinstance
(
x
,
torch
.
nn
.
Embedding
)
and
exclude_embeddings
)
return
(
x
.
requires_grad
or
not
only_trainable
)
and
not
(
isinstance
(
x
,
nn
.
Embedding
)
and
exclude_embeddings
)
params
=
filter
(
parameter_filter
,
self
.
parameters
())
if
only_trainable
else
self
.
parameters
()
return
sum
(
p
.
numel
()
for
p
in
params
)
...
...
@@ -549,7 +546,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
):
assert
isinstance
(
decoder_pointer
,
nn
.
Module
)
and
isinstance
(
encoder_pointer
,
nn
.
Module
),
f
"
{
decoder_pointer
}
and
{
encoder_pointer
}
have to be of type
torch.
nn.Module"
),
f
"
{
decoder_pointer
}
and
{
encoder_pointer
}
have to be of type nn.Module"
if
hasattr
(
decoder_pointer
,
"weight"
):
assert
hasattr
(
encoder_pointer
,
"weight"
)
encoder_pointer
.
weight
=
decoder_pointer
.
weight
...
...
@@ -613,7 +610,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
output_embeddings
.
weight
=
input_embeddings
.
weight
if
getattr
(
output_embeddings
,
"bias"
,
None
)
is
not
None
:
output_embeddings
.
bias
.
data
=
torch
.
nn
.
functional
.
pad
(
output_embeddings
.
bias
.
data
=
nn
.
functional
.
pad
(
output_embeddings
.
bias
.
data
,
(
0
,
...
...
@@ -625,7 +622,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if
hasattr
(
output_embeddings
,
"out_features"
)
and
hasattr
(
input_embeddings
,
"num_embeddings"
):
output_embeddings
.
out_features
=
input_embeddings
.
num_embeddings
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
torch
.
nn
.
Embedding
:
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
nn
.
Embedding
:
"""
Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
...
...
@@ -668,8 +665,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return
self
.
get_input_embeddings
()
def
_get_resized_embeddings
(
self
,
old_embeddings
:
torch
.
nn
.
Embedding
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
torch
.
nn
.
Embedding
:
self
,
old_embeddings
:
nn
.
Embedding
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
nn
.
Embedding
:
"""
Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
initialized vectors at the end. Reducing the size will remove vectors from the end
...
...
@@ -732,8 +729,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return
new_embeddings
def
_get_resized_lm_head
(
self
,
old_lm_head
:
torch
.
nn
.
Linear
,
new_num_tokens
:
Optional
[
int
]
=
None
,
transposed
:
Optional
[
bool
]
=
False
)
->
torch
.
nn
.
Linear
:
self
,
old_lm_head
:
nn
.
Linear
,
new_num_tokens
:
Optional
[
int
]
=
None
,
transposed
:
Optional
[
bool
]
=
False
)
->
nn
.
Linear
:
"""
Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end
...
...
@@ -1681,7 +1678,7 @@ class SQuADHead(nn.Module):
else
:
# during inference, compute the end logits based on beam search
bsz
,
slen
,
hsz
=
hidden_states
.
size
()
start_log_probs
=
F
.
softmax
(
start_logits
,
dim
=-
1
)
# shape (bsz, slen)
start_log_probs
=
nn
.
functional
.
softmax
(
start_logits
,
dim
=-
1
)
# shape (bsz, slen)
start_top_log_probs
,
start_top_index
=
torch
.
topk
(
start_log_probs
,
self
.
start_n_top
,
dim
=-
1
...
...
@@ -1695,7 +1692,7 @@ class SQuADHead(nn.Module):
)
# shape (bsz, slen, start_n_top, hsz)
p_mask
=
p_mask
.
unsqueeze
(
-
1
)
if
p_mask
is
not
None
else
None
end_logits
=
self
.
end_logits
(
hidden_states_expanded
,
start_states
=
start_states
,
p_mask
=
p_mask
)
end_log_probs
=
F
.
softmax
(
end_logits
,
dim
=
1
)
# shape (bsz, slen, start_n_top)
end_log_probs
=
nn
.
functional
.
softmax
(
end_logits
,
dim
=
1
)
# shape (bsz, slen, start_n_top)
end_top_log_probs
,
end_top_index
=
torch
.
topk
(
end_log_probs
,
self
.
end_n_top
,
dim
=
1
...
...
@@ -1820,7 +1817,7 @@ class SequenceSummary(nn.Module):
return
output
def
unwrap_model
(
model
:
torch
.
nn
.
Module
)
->
torch
.
nn
.
Module
:
def
unwrap_model
(
model
:
nn
.
Module
)
->
nn
.
Module
:
"""
Recursively unwraps a model from potential containers (as used in distributed training).
...
...
@@ -1834,7 +1831,7 @@ def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
return
model
def
prune_linear_layer
(
layer
:
torch
.
nn
.
Linear
,
index
:
torch
.
LongTensor
,
dim
:
int
=
0
)
->
torch
.
nn
.
Linear
:
def
prune_linear_layer
(
layer
:
nn
.
Linear
,
index
:
torch
.
LongTensor
,
dim
:
int
=
0
)
->
nn
.
Linear
:
"""
Prune a linear layer to keep only entries in index.
...
...
@@ -1902,8 +1899,8 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) ->
def
prune_layer
(
layer
:
Union
[
torch
.
nn
.
Linear
,
Conv1D
],
index
:
torch
.
LongTensor
,
dim
:
Optional
[
int
]
=
None
)
->
Union
[
torch
.
nn
.
Linear
,
Conv1D
]:
layer
:
Union
[
nn
.
Linear
,
Conv1D
],
index
:
torch
.
LongTensor
,
dim
:
Optional
[
int
]
=
None
)
->
Union
[
nn
.
Linear
,
Conv1D
]:
"""
Prune a Conv1D or linear layer to keep only entries in index.
...
...
src/transformers/models/albert/modeling_albert.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,7 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
...
...
src/transformers/models/bart/modeling_bart.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,6 @@ import warnings
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
...
...
@@ -223,7 +222,7 @@ class BartAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -243,7 +242,7 @@ class BartAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -303,15 +302,15 @@ class BartEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -398,7 +397,7 @@ class BartDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
...
...
@@ -418,7 +417,7 @@ class BartDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
encoder_attn_layer_norm
(
hidden_states
)
...
...
@@ -428,9 +427,9 @@ class BartDecoderLayer(nn.Module):
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -661,7 +660,7 @@ class BartEncoder(BartPretrainedModel):
Args:
config: BartConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BartConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -760,7 +759,7 @@ class BartEncoder(BartPretrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -826,7 +825,7 @@ class BartDecoder(BartPretrainedModel):
Args:
config: BartConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BartConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -997,7 +996,7 @@ class BartDecoder(BartPretrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/bert_generation/modeling_bert_generation.py
View file @
1ed2ebf6
...
...
@@ -139,7 +139,7 @@ class BertGenerationEmbeddings(nn.Module):
self
.
position_embeddings
=
nn
.
Embedding
(
config
.
max_position_embeddings
,
config
.
hidden_size
)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self
.
LayerNorm
=
torch
.
nn
.
LayerNorm
(
config
.
hidden_size
,
eps
=
config
.
layer_norm_eps
)
self
.
LayerNorm
=
nn
.
LayerNorm
(
config
.
hidden_size
,
eps
=
config
.
layer_norm_eps
)
self
.
dropout
=
nn
.
Dropout
(
config
.
hidden_dropout_prob
)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
...
...
src/transformers/models/big_bird/modeling_big_bird.py
View file @
1ed2ebf6
...
...
@@ -22,7 +22,6 @@ from typing import Optional, Tuple
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
...
...
@@ -379,7 +378,7 @@ class BigBirdSelfAttention(nn.Module):
attention_scores
=
attention_scores
+
attention_mask
# Normalize the attention scores to probabilities.
attention_probs
=
F
.
softmax
(
attention_scores
,
dim
=-
1
)
attention_probs
=
nn
.
functional
.
softmax
(
attention_scores
,
dim
=-
1
)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
...
...
@@ -608,7 +607,9 @@ class BigBirdBlockSparseAttention(nn.Module):
first_product
=
first_product
*
rsqrt_d
first_product
+=
(
1.0
-
to_mask
)
*
attn_mask_penalty
first_attn_weights
=
F
.
softmax
(
first_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, to_seq_len]
first_attn_weights
=
nn
.
functional
.
softmax
(
first_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, to_seq_len]
# [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
first_context_layer
=
self
.
torch_bmm_nd
(
first_attn_weights
,
value_layer
,
ndim
=
4
)
...
...
@@ -660,7 +661,7 @@ class BigBirdBlockSparseAttention(nn.Module):
)
second_product
=
second_product
*
rsqrt_d
second_product
+=
(
1.0
-
torch
.
minimum
(
second_seq_pad
,
second_rand_pad
))
*
attn_mask_penalty
second_attn_weights
=
F
.
softmax
(
second_attn_weights
=
nn
.
functional
.
softmax
(
second_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
...
...
@@ -721,7 +722,7 @@ class BigBirdBlockSparseAttention(nn.Module):
)
# [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
# safely doing softmax since attention matrix is completed
attn_weights
=
F
.
softmax
(
attn_weights
=
nn
.
functional
.
softmax
(
band_product
,
dim
=-
1
)
# [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
...
...
@@ -794,7 +795,7 @@ class BigBirdBlockSparseAttention(nn.Module):
)
second_last_product
=
second_last_product
*
rsqrt_d
second_last_product
+=
(
1.0
-
torch
.
minimum
(
second_last_seq_pad
,
second_last_rand_pad
))
*
attn_mask_penalty
second_last_attn_weights
=
F
.
softmax
(
second_last_attn_weights
=
nn
.
functional
.
softmax
(
second_last_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
...
...
@@ -810,7 +811,7 @@ class BigBirdBlockSparseAttention(nn.Module):
last_product
=
self
.
torch_bmm_nd_transpose
(
blocked_query_matrix
[:,
:,
-
1
],
key_layer
,
ndim
=
4
)
last_product
=
last_product
*
rsqrt_d
last_product
+=
(
1.0
-
to_mask
)
*
attn_mask_penalty
last_attn_weights
=
F
.
softmax
(
last_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, n]
last_attn_weights
=
nn
.
functional
.
softmax
(
last_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, n]
# [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
last_context_layer
=
self
.
torch_bmm_nd
(
last_attn_weights
,
value_layer
,
ndim
=
4
)
...
...
@@ -2210,10 +2211,10 @@ class BigBirdModel(BigBirdPreTrainedModel):
f
"`config.block_size`:
{
block_size
}
"
)
if
input_ids
is
not
None
:
input_ids
=
F
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
input_ids
=
nn
.
functional
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
position_ids
is
not
None
:
# pad with position_id = pad_token_id as in modeling_bigbird.BigBirdEmbeddings
position_ids
=
F
.
pad
(
position_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
position_ids
=
nn
.
functional
.
pad
(
position_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
inputs_embeds
is
not
None
:
input_ids_padding
=
inputs_embeds
.
new_full
(
(
batch_size
,
padding_len
),
...
...
@@ -2223,8 +2224,10 @@ class BigBirdModel(BigBirdPreTrainedModel):
inputs_embeds_padding
=
self
.
embeddings
(
input_ids_padding
)
inputs_embeds
=
torch
.
cat
([
inputs_embeds
,
inputs_embeds_padding
],
dim
=-
2
)
attention_mask
=
F
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
token_type_ids
=
F
.
pad
(
token_type_ids
,
(
0
,
padding_len
),
value
=
0
)
# pad with token_type_id = 0
attention_mask
=
nn
.
functional
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
token_type_ids
=
nn
.
functional
.
pad
(
token_type_ids
,
(
0
,
padding_len
),
value
=
0
)
# pad with token_type_id = 0
return
padding_len
,
input_ids
,
attention_mask
,
token_type_ids
,
position_ids
,
inputs_embeds
...
...
src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
View file @
1ed2ebf6
...
...
@@ -22,7 +22,6 @@ from typing import Optional, Tuple
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
...
...
@@ -206,7 +205,7 @@ class BigBirdPegasusSelfAttention(nn.Module):
attention_scores
=
attention_scores
+
attention_mask
# Normalize the attention scores to probabilities.
attention_probs
=
F
.
softmax
(
attention_scores
,
dim
=-
1
)
attention_probs
=
nn
.
functional
.
softmax
(
attention_scores
,
dim
=-
1
)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
...
...
@@ -436,7 +435,9 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
first_product
=
first_product
*
rsqrt_d
first_product
+=
(
1.0
-
to_mask
)
*
attn_mask_penalty
first_attn_weights
=
F
.
softmax
(
first_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, to_seq_len]
first_attn_weights
=
nn
.
functional
.
softmax
(
first_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, to_seq_len]
# [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
first_context_layer
=
self
.
torch_bmm_nd
(
first_attn_weights
,
value_layer
,
ndim
=
4
)
...
...
@@ -488,7 +489,7 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
)
second_product
=
second_product
*
rsqrt_d
second_product
+=
(
1.0
-
torch
.
minimum
(
second_seq_pad
,
second_rand_pad
))
*
attn_mask_penalty
second_attn_weights
=
F
.
softmax
(
second_attn_weights
=
nn
.
functional
.
softmax
(
second_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
...
...
@@ -549,7 +550,7 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
)
# [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
# safely doing softmax since attention matrix is completed
attn_weights
=
F
.
softmax
(
attn_weights
=
nn
.
functional
.
softmax
(
band_product
,
dim
=-
1
)
# [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
...
...
@@ -622,7 +623,7 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
)
second_last_product
=
second_last_product
*
rsqrt_d
second_last_product
+=
(
1.0
-
torch
.
minimum
(
second_last_seq_pad
,
second_last_rand_pad
))
*
attn_mask_penalty
second_last_attn_weights
=
F
.
softmax
(
second_last_attn_weights
=
nn
.
functional
.
softmax
(
second_last_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
...
...
@@ -638,7 +639,7 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
last_product
=
self
.
torch_bmm_nd_transpose
(
blocked_query_matrix
[:,
:,
-
1
],
key_layer
,
ndim
=
4
)
last_product
=
last_product
*
rsqrt_d
last_product
+=
(
1.0
-
to_mask
)
*
attn_mask_penalty
last_attn_weights
=
F
.
softmax
(
last_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, n]
last_attn_weights
=
nn
.
functional
.
softmax
(
last_product
,
dim
=-
1
)
# [bsz, n_heads, from_block_size, n]
# [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
last_context_layer
=
self
.
torch_bmm_nd
(
last_attn_weights
,
value_layer
,
ndim
=
4
)
...
...
@@ -1295,7 +1296,7 @@ class BigBirdPegasusDecoderAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -1315,7 +1316,7 @@ class BigBirdPegasusDecoderAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -1384,7 +1385,7 @@ class BigBirdPegasusEncoderLayer(nn.Module):
)
hidden_states
=
self_attention_outputs
[
0
]
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
...
...
@@ -1392,7 +1393,7 @@ class BigBirdPegasusEncoderLayer(nn.Module):
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -1492,7 +1493,7 @@ class BigBirdPegasusDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -1512,7 +1513,7 @@ class BigBirdPegasusDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -1522,9 +1523,9 @@ class BigBirdPegasusDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -1733,7 +1734,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
Args:
config: BigBirdPegasusConfig
embed_tokens (torch.nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BigBirdPegasusConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -1829,7 +1830,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
embed_pos
=
self
.
embed_positions
(
input_shape
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
if
attention_mask
is
None
:
attention_mask
=
torch
.
ones
(
input_shape
,
device
=
hidden_states
.
device
)
...
...
@@ -2015,7 +2016,9 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
inputs_embeds_padding
=
self
.
embed_tokens
(
input_ids_padding
)
hidden_states
=
torch
.
cat
([
hidden_states
,
inputs_embeds_padding
],
dim
=-
2
)
attention_mask
=
F
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
0
)
# no attention on the padding tokens
attention_mask
=
nn
.
functional
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
0
)
# no attention on the padding tokens
return
padding_len
,
hidden_states
,
attention_mask
...
...
@@ -2027,7 +2030,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
Args:
config: BigBirdPegasusConfig
embed_tokens (torch.nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BigBirdPegasusConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -2198,7 +2201,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/blenderbot/modeling_blenderbot.py
View file @
1ed2ebf6
...
...
@@ -23,7 +23,6 @@ import warnings
from
typing
import
Optional
,
Tuple
,
Union
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -224,7 +223,7 @@ class BlenderbotAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -244,7 +243,7 @@ class BlenderbotAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -306,15 +305,15 @@ class BlenderbotEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -402,7 +401,7 @@ class BlenderbotDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -422,7 +421,7 @@ class BlenderbotDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -432,9 +431,9 @@ class BlenderbotDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -617,7 +616,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
Args:
config: BlenderbotConfig
embed_tokens (torch.nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BlenderbotConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -715,7 +714,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
embed_pos
=
self
.
embed_positions
(
input_shape
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -784,7 +783,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
Args:
config: BlenderbotConfig
embed_tokens (torch.nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BlenderbotConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -956,7 +955,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ import random
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -222,7 +221,7 @@ class BlenderbotSmallAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -242,7 +241,7 @@ class BlenderbotSmallAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -303,15 +302,15 @@ class BlenderbotSmallEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -399,7 +398,7 @@ class BlenderbotSmallDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
...
...
@@ -419,7 +418,7 @@ class BlenderbotSmallDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
encoder_attn_layer_norm
(
hidden_states
)
...
...
@@ -429,9 +428,9 @@ class BlenderbotSmallDecoderLayer(nn.Module):
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -618,7 +617,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
Args:
config: BlenderbotSmallConfig
embed_tokens (torch.nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BlenderbotSmallConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -717,7 +716,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -784,7 +783,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
Args:
config: BlenderbotSmallConfig
embed_tokens (torch.nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
BlenderbotSmallConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -957,7 +956,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
inputs_embeds
=
self
.
layernorm_embedding
(
inputs_embeds
)
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -21,6 +21,7 @@ import os
import
numpy
as
np
import
torch
from
packaging
import
version
from
torch
import
nn
import
gluonnlp
as
nlp
import
mxnet
as
mx
...
...
@@ -170,8 +171,8 @@ def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_f
# | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight`
# Helper function to convert MXNET Arrays to PyTorch
def
to_torch
(
mx_array
)
->
torch
.
nn
.
Parameter
:
return
torch
.
nn
.
Parameter
(
torch
.
FloatTensor
(
mx_array
.
data
().
asnumpy
()))
def
to_torch
(
mx_array
)
->
nn
.
Parameter
:
return
nn
.
Parameter
(
torch
.
FloatTensor
(
mx_array
.
data
().
asnumpy
()))
# Check param shapes and map new HF param back
def
check_and_map_params
(
hf_param
,
gluon_param
):
...
...
src/transformers/models/clip/modeling_clip.py
View file @
1ed2ebf6
...
...
@@ -18,7 +18,6 @@
from
typing
import
Any
,
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
...
...
@@ -62,7 +61,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def
contrastive_loss
(
logits
:
torch
.
Tensor
,
dim
:
int
)
->
torch
.
Tensor
:
neg_ce
=
torch
.
diag
(
F
.
log_softmax
(
logits
,
dim
=
dim
))
neg_ce
=
torch
.
diag
(
nn
.
functional
.
log_softmax
(
logits
,
dim
=
dim
))
return
-
neg_ce
.
mean
()
...
...
@@ -235,7 +234,7 @@ class CLIPAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
output_attentions
:
# this operation is a bit awkward, but it's required to
...
...
@@ -247,7 +246,7 @@ class CLIPAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -493,7 +492,7 @@ class CLIPEncoder(nn.Module):
Args:
config: CLIPConfig
embed_tokens (torch.nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
CLIPConfig
):
...
...
src/transformers/models/convbert/modeling_convbert.py
View file @
1ed2ebf6
...
...
@@ -383,7 +383,7 @@ class ConvBertSelfAttention(nn.Module):
attention_scores
=
attention_scores
+
attention_mask
# Normalize the attention scores to probabilities.
attention_probs
=
torch
.
nn
.
functional
.
softmax
(
attention_scores
,
dim
=-
1
)
attention_probs
=
nn
.
functional
.
softmax
(
attention_scores
,
dim
=-
1
)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
...
...
src/transformers/models/ctrl/modeling_ctrl.py
View file @
1ed2ebf6
...
...
@@ -19,7 +19,7 @@ from typing import Tuple
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...file_utils
import
add_code_sample_docstrings
,
add_start_docstrings
,
add_start_docstrings_to_model_forward
...
...
@@ -87,7 +87,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
return
output
,
attention_weights
class
MultiHeadAttention
(
torch
.
nn
.
Module
):
class
MultiHeadAttention
(
nn
.
Module
):
def
__init__
(
self
,
d_model_size
,
num_heads
):
super
().
__init__
()
self
.
num_heads
=
num_heads
...
...
@@ -95,11 +95,11 @@ class MultiHeadAttention(torch.nn.Module):
self
.
depth
=
int
(
d_model_size
/
self
.
num_heads
)
self
.
Wq
=
torch
.
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
Wk
=
torch
.
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
Wv
=
torch
.
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
Wq
=
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
Wk
=
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
Wv
=
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
dense
=
torch
.
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
dense
=
nn
.
Linear
(
d_model_size
,
d_model_size
)
self
.
pruned_heads
=
set
()
def
prune_heads
(
self
,
heads
):
...
...
@@ -167,21 +167,21 @@ class MultiHeadAttention(torch.nn.Module):
def
point_wise_feed_forward_network
(
d_model_size
,
dff
):
return
torch
.
nn
.
Sequential
(
torch
.
nn
.
Linear
(
d_model_size
,
dff
),
torch
.
nn
.
ReLU
(),
torch
.
nn
.
Linear
(
dff
,
d_model_size
))
return
nn
.
Sequential
(
nn
.
Linear
(
d_model_size
,
dff
),
nn
.
ReLU
(),
nn
.
Linear
(
dff
,
d_model_size
))
class
EncoderLayer
(
torch
.
nn
.
Module
):
class
EncoderLayer
(
nn
.
Module
):
def
__init__
(
self
,
d_model_size
,
num_heads
,
dff
,
rate
=
0.1
):
super
().
__init__
()
self
.
multi_head_attention
=
MultiHeadAttention
(
d_model_size
,
num_heads
)
self
.
ffn
=
point_wise_feed_forward_network
(
d_model_size
,
dff
)
self
.
layernorm1
=
torch
.
nn
.
LayerNorm
(
d_model_size
,
eps
=
1e-6
)
self
.
layernorm2
=
torch
.
nn
.
LayerNorm
(
d_model_size
,
eps
=
1e-6
)
self
.
layernorm1
=
nn
.
LayerNorm
(
d_model_size
,
eps
=
1e-6
)
self
.
layernorm2
=
nn
.
LayerNorm
(
d_model_size
,
eps
=
1e-6
)
self
.
dropout1
=
torch
.
nn
.
Dropout
(
rate
)
self
.
dropout2
=
torch
.
nn
.
Dropout
(
rate
)
self
.
dropout1
=
nn
.
Dropout
(
rate
)
self
.
dropout2
=
nn
.
Dropout
(
rate
)
def
forward
(
self
,
x
,
mask
,
layer_past
=
None
,
attention_mask
=
None
,
head_mask
=
None
,
use_cache
=
False
,
output_attentions
=
False
...
...
src/transformers/models/deberta/modeling_deberta.py
View file @
1ed2ebf6
...
...
@@ -163,7 +163,7 @@ class XDropout(torch.autograd.Function):
return
grad_output
,
None
class
StableDropout
(
torch
.
nn
.
Module
):
class
StableDropout
(
nn
.
Module
):
"""
Optimized dropout module for stabilizing the training
...
...
@@ -477,7 +477,7 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer):
return
pos_index
.
expand
(
p2c_att
.
size
()[:
2
]
+
(
pos_index
.
size
(
-
2
),
key_layer
.
size
(
-
2
)))
class
DisentangledSelfAttention
(
torch
.
nn
.
Module
):
class
DisentangledSelfAttention
(
nn
.
Module
):
"""
Disentangled self-attention module
...
...
@@ -498,19 +498,17 @@ class DisentangledSelfAttention(torch.nn.Module):
self
.
num_attention_heads
=
config
.
num_attention_heads
self
.
attention_head_size
=
int
(
config
.
hidden_size
/
config
.
num_attention_heads
)
self
.
all_head_size
=
self
.
num_attention_heads
*
self
.
attention_head_size
self
.
in_proj
=
torch
.
nn
.
Linear
(
config
.
hidden_size
,
self
.
all_head_size
*
3
,
bias
=
False
)
self
.
q_bias
=
torch
.
nn
.
Parameter
(
torch
.
zeros
((
self
.
all_head_size
),
dtype
=
torch
.
float
))
self
.
v_bias
=
torch
.
nn
.
Parameter
(
torch
.
zeros
((
self
.
all_head_size
),
dtype
=
torch
.
float
))
self
.
in_proj
=
nn
.
Linear
(
config
.
hidden_size
,
self
.
all_head_size
*
3
,
bias
=
False
)
self
.
q_bias
=
nn
.
Parameter
(
torch
.
zeros
((
self
.
all_head_size
),
dtype
=
torch
.
float
))
self
.
v_bias
=
nn
.
Parameter
(
torch
.
zeros
((
self
.
all_head_size
),
dtype
=
torch
.
float
))
self
.
pos_att_type
=
config
.
pos_att_type
if
config
.
pos_att_type
is
not
None
else
[]
self
.
relative_attention
=
getattr
(
config
,
"relative_attention"
,
False
)
self
.
talking_head
=
getattr
(
config
,
"talking_head"
,
False
)
if
self
.
talking_head
:
self
.
head_logits_proj
=
torch
.
nn
.
Linear
(
config
.
num_attention_heads
,
config
.
num_attention_heads
,
bias
=
False
)
self
.
head_weights_proj
=
torch
.
nn
.
Linear
(
config
.
num_attention_heads
,
config
.
num_attention_heads
,
bias
=
False
)
self
.
head_logits_proj
=
nn
.
Linear
(
config
.
num_attention_heads
,
config
.
num_attention_heads
,
bias
=
False
)
self
.
head_weights_proj
=
nn
.
Linear
(
config
.
num_attention_heads
,
config
.
num_attention_heads
,
bias
=
False
)
if
self
.
relative_attention
:
self
.
max_relative_positions
=
getattr
(
config
,
"max_relative_positions"
,
-
1
)
...
...
@@ -519,9 +517,9 @@ class DisentangledSelfAttention(torch.nn.Module):
self
.
pos_dropout
=
StableDropout
(
config
.
hidden_dropout_prob
)
if
"c2p"
in
self
.
pos_att_type
or
"p2p"
in
self
.
pos_att_type
:
self
.
pos_proj
=
torch
.
nn
.
Linear
(
config
.
hidden_size
,
self
.
all_head_size
,
bias
=
False
)
self
.
pos_proj
=
nn
.
Linear
(
config
.
hidden_size
,
self
.
all_head_size
,
bias
=
False
)
if
"p2c"
in
self
.
pos_att_type
or
"p2p"
in
self
.
pos_att_type
:
self
.
pos_q_proj
=
torch
.
nn
.
Linear
(
config
.
hidden_size
,
self
.
all_head_size
)
self
.
pos_q_proj
=
nn
.
Linear
(
config
.
hidden_size
,
self
.
all_head_size
)
self
.
dropout
=
StableDropout
(
config
.
attention_probs_dropout_prob
)
...
...
@@ -1122,7 +1120,7 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
self
.
pooler
=
ContextPooler
(
config
)
output_dim
=
self
.
pooler
.
output_dim
self
.
classifier
=
torch
.
nn
.
Linear
(
output_dim
,
num_labels
)
self
.
classifier
=
nn
.
Linear
(
output_dim
,
num_labels
)
drop_out
=
getattr
(
config
,
"cls_dropout"
,
None
)
drop_out
=
self
.
config
.
hidden_dropout_prob
if
drop_out
is
None
else
drop_out
self
.
dropout
=
StableDropout
(
drop_out
)
...
...
@@ -1182,7 +1180,7 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
if
labels
is
not
None
:
if
self
.
num_labels
==
1
:
# regression task
loss_fn
=
torch
.
nn
.
MSELoss
()
loss_fn
=
nn
.
MSELoss
()
logits
=
logits
.
view
(
-
1
).
to
(
labels
.
dtype
)
loss
=
loss_fn
(
logits
,
labels
.
view
(
-
1
))
elif
labels
.
dim
()
==
1
or
labels
.
size
(
-
1
)
==
1
:
...
...
@@ -1196,7 +1194,7 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
else
:
loss
=
torch
.
tensor
(
0
).
to
(
logits
)
else
:
log_softmax
=
torch
.
nn
.
LogSoftmax
(
-
1
)
log_softmax
=
nn
.
LogSoftmax
(
-
1
)
loss
=
-
((
log_softmax
(
logits
)
*
labels
).
sum
(
-
1
)).
mean
()
if
not
return_dict
:
output
=
(
logits
,)
+
outputs
[
1
:]
...
...
src/transformers/models/deberta_v2/modeling_deberta_v2.py
View file @
1ed2ebf6
...
...
@@ -168,7 +168,7 @@ class XDropout(torch.autograd.Function):
# Copied from transformers.models.deberta.modeling_deberta.StableDropout
class
StableDropout
(
torch
.
nn
.
Module
):
class
StableDropout
(
nn
.
Module
):
"""
Optimized dropout module for stabilizing the training
...
...
@@ -342,7 +342,7 @@ class ConvLayer(nn.Module):
kernel_size
=
getattr
(
config
,
"conv_kernel_size"
,
3
)
groups
=
getattr
(
config
,
"conv_groups"
,
1
)
self
.
conv_act
=
getattr
(
config
,
"conv_act"
,
"tanh"
)
self
.
conv
=
torch
.
nn
.
Conv1d
(
self
.
conv
=
nn
.
Conv1d
(
config
.
hidden_size
,
config
.
hidden_size
,
kernel_size
,
padding
=
(
kernel_size
-
1
)
//
2
,
groups
=
groups
)
self
.
LayerNorm
=
LayerNorm
(
config
.
hidden_size
,
config
.
layer_norm_eps
)
...
...
@@ -546,7 +546,7 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer):
return
pos_index
.
expand
(
p2c_att
.
size
()[:
2
]
+
(
pos_index
.
size
(
-
2
),
key_layer
.
size
(
-
2
)))
class
DisentangledSelfAttention
(
torch
.
nn
.
Module
):
class
DisentangledSelfAttention
(
nn
.
Module
):
"""
Disentangled self-attention module
...
...
@@ -1244,7 +1244,7 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
self
.
pooler
=
ContextPooler
(
config
)
output_dim
=
self
.
pooler
.
output_dim
self
.
classifier
=
torch
.
nn
.
Linear
(
output_dim
,
num_labels
)
self
.
classifier
=
nn
.
Linear
(
output_dim
,
num_labels
)
drop_out
=
getattr
(
config
,
"cls_dropout"
,
None
)
drop_out
=
self
.
config
.
hidden_dropout_prob
if
drop_out
is
None
else
drop_out
self
.
dropout
=
StableDropout
(
drop_out
)
...
...
@@ -1304,7 +1304,7 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
if
labels
is
not
None
:
if
self
.
num_labels
==
1
:
# regression task
loss_fn
=
torch
.
nn
.
MSELoss
()
loss_fn
=
nn
.
MSELoss
()
logits
=
logits
.
view
(
-
1
).
to
(
labels
.
dtype
)
loss
=
loss_fn
(
logits
,
labels
.
view
(
-
1
))
elif
labels
.
dim
()
==
1
or
labels
.
size
(
-
1
)
==
1
:
...
...
@@ -1318,7 +1318,7 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
else
:
loss
=
torch
.
tensor
(
0
).
to
(
logits
)
else
:
log_softmax
=
torch
.
nn
.
LogSoftmax
(
-
1
)
log_softmax
=
nn
.
LogSoftmax
(
-
1
)
loss
=
-
((
log_softmax
(
logits
)
*
labels
).
sum
(
-
1
)).
mean
()
if
not
return_dict
:
output
=
(
logits
,)
+
outputs
[
1
:]
...
...
src/transformers/models/detr/feature_extraction_detr.py
View file @
1ed2ebf6
...
...
@@ -30,7 +30,7 @@ from ...utils import logging
if
is_torch_available
():
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
logger
=
logging
.
get_logger
(
__name__
)
...
...
@@ -374,7 +374,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
# use PyTorch as current workaround
# TODO replace by self.resize
masks
=
torch
.
from_numpy
(
target
[
"masks"
][:,
None
]).
float
()
interpolated_masks
=
F
.
interpolate
(
masks
,
size
=
(
h
,
w
),
mode
=
"nearest"
)[:,
0
]
>
0.5
interpolated_masks
=
nn
.
functional
.
interpolate
(
masks
,
size
=
(
h
,
w
),
mode
=
"nearest"
)[:,
0
]
>
0.5
target
[
"masks"
]
=
interpolated_masks
.
numpy
()
return
rescaled_image
,
target
...
...
@@ -697,7 +697,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
target_sizes
.
shape
[
1
]
==
2
),
"Each element of target_sizes must contain the size (h, w) of each image of the batch"
prob
=
F
.
softmax
(
out_logits
,
-
1
)
prob
=
nn
.
functional
.
softmax
(
out_logits
,
-
1
)
scores
,
labels
=
prob
[...,
:
-
1
].
max
(
-
1
)
# convert to [x0, y0, x1, y1] format
...
...
@@ -742,13 +742,15 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
),
"Make sure to pass in as many orig_target_sizes as max_target_sizes"
max_h
,
max_w
=
max_target_sizes
.
max
(
0
)[
0
].
tolist
()
outputs_masks
=
outputs
.
pred_masks
.
squeeze
(
2
)
outputs_masks
=
F
.
interpolate
(
outputs_masks
,
size
=
(
max_h
,
max_w
),
mode
=
"bilinear"
,
align_corners
=
False
)
outputs_masks
=
nn
.
functional
.
interpolate
(
outputs_masks
,
size
=
(
max_h
,
max_w
),
mode
=
"bilinear"
,
align_corners
=
False
)
outputs_masks
=
(
outputs_masks
.
sigmoid
()
>
threshold
).
cpu
()
for
i
,
(
cur_mask
,
t
,
tt
)
in
enumerate
(
zip
(
outputs_masks
,
max_target_sizes
,
orig_target_sizes
)):
img_h
,
img_w
=
t
[
0
],
t
[
1
]
results
[
i
][
"masks"
]
=
cur_mask
[:,
:
img_h
,
:
img_w
].
unsqueeze
(
1
)
results
[
i
][
"masks"
]
=
F
.
interpolate
(
results
[
i
][
"masks"
]
=
nn
.
functional
.
interpolate
(
results
[
i
][
"masks"
].
float
(),
size
=
tuple
(
tt
.
tolist
()),
mode
=
"nearest"
).
byte
()
...
...
@@ -810,7 +812,7 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
cur_scores
=
cur_scores
[
keep
]
cur_classes
=
cur_classes
[
keep
]
cur_masks
=
cur_masks
[
keep
]
cur_masks
=
F
.
interpolate
(
cur_masks
[:,
None
],
to_tuple
(
size
),
mode
=
"bilinear"
).
squeeze
(
1
)
cur_masks
=
nn
.
functional
.
interpolate
(
cur_masks
[:,
None
],
to_tuple
(
size
),
mode
=
"bilinear"
).
squeeze
(
1
)
cur_boxes
=
center_to_corners_format
(
cur_boxes
[
keep
])
h
,
w
=
cur_masks
.
shape
[
-
2
:]
...
...
src/transformers/models/detr/modeling_detr.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ from dataclasses import dataclass
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
Tensor
,
nn
from
...activations
import
ACT2FN
...
...
@@ -314,7 +313,7 @@ class DetrFrozenBatchNorm2d(nn.Module):
def
replace_batch_norm
(
m
,
name
=
""
):
for
attr_str
in
dir
(
m
):
target_attr
=
getattr
(
m
,
attr_str
)
if
isinstance
(
target_attr
,
torch
.
nn
.
BatchNorm2d
):
if
isinstance
(
target_attr
,
nn
.
BatchNorm2d
):
frozen
=
DetrFrozenBatchNorm2d
(
target_attr
.
num_features
)
bn
=
getattr
(
m
,
attr_str
)
frozen
.
weight
.
data
.
copy_
(
bn
.
weight
)
...
...
@@ -362,7 +361,7 @@ class DetrTimmConvEncoder(nn.Module):
out
=
[]
for
feature_map
in
features
:
# downsample pixel_mask to match shape of corresponding feature_map
mask
=
F
.
interpolate
(
pixel_mask
[
None
].
float
(),
size
=
feature_map
.
shape
[
-
2
:]).
to
(
torch
.
bool
)[
0
]
mask
=
nn
.
functional
.
interpolate
(
pixel_mask
[
None
].
float
(),
size
=
feature_map
.
shape
[
-
2
:]).
to
(
torch
.
bool
)[
0
]
out
.
append
((
feature_map
,
mask
))
return
out
...
...
@@ -570,7 +569,7 @@ class DetrAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
output_attentions
:
# this operation is a bit awkward, but it's required to
...
...
@@ -582,7 +581,7 @@ class DetrAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -642,16 +641,16 @@ class DetrEncoderLayer(nn.Module):
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -731,7 +730,7 @@ class DetrDecoderLayer(nn.Module):
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
...
...
@@ -749,16 +748,16 @@ class DetrDecoderLayer(nn.Module):
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
encoder_attn_layer_norm
(
hidden_states
)
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -885,7 +884,7 @@ class DetrEncoder(DetrPreTrainedModel):
Args:
config: DetrConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
DetrConfig
):
...
...
@@ -946,7 +945,7 @@ class DetrEncoder(DetrPreTrainedModel):
return_dict
=
return_dict
if
return_dict
is
not
None
else
self
.
config
.
use_return_dict
hidden_states
=
inputs_embeds
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -999,7 +998,7 @@ class DetrDecoder(DetrPreTrainedModel):
Args:
config: DetrConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
DetrConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -1717,23 +1716,23 @@ class DetrMaskHeadSmallConv(nn.Module):
inter_dims
=
[
dim
,
context_dim
//
2
,
context_dim
//
4
,
context_dim
//
8
,
context_dim
//
16
,
context_dim
//
64
]
self
.
lay1
=
torch
.
nn
.
Conv2d
(
dim
,
dim
,
3
,
padding
=
1
)
self
.
gn1
=
torch
.
nn
.
GroupNorm
(
8
,
dim
)
self
.
lay2
=
torch
.
nn
.
Conv2d
(
dim
,
inter_dims
[
1
],
3
,
padding
=
1
)
self
.
gn2
=
torch
.
nn
.
GroupNorm
(
8
,
inter_dims
[
1
])
self
.
lay3
=
torch
.
nn
.
Conv2d
(
inter_dims
[
1
],
inter_dims
[
2
],
3
,
padding
=
1
)
self
.
gn3
=
torch
.
nn
.
GroupNorm
(
8
,
inter_dims
[
2
])
self
.
lay4
=
torch
.
nn
.
Conv2d
(
inter_dims
[
2
],
inter_dims
[
3
],
3
,
padding
=
1
)
self
.
gn4
=
torch
.
nn
.
GroupNorm
(
8
,
inter_dims
[
3
])
self
.
lay5
=
torch
.
nn
.
Conv2d
(
inter_dims
[
3
],
inter_dims
[
4
],
3
,
padding
=
1
)
self
.
gn5
=
torch
.
nn
.
GroupNorm
(
8
,
inter_dims
[
4
])
self
.
out_lay
=
torch
.
nn
.
Conv2d
(
inter_dims
[
4
],
1
,
3
,
padding
=
1
)
self
.
lay1
=
nn
.
Conv2d
(
dim
,
dim
,
3
,
padding
=
1
)
self
.
gn1
=
nn
.
GroupNorm
(
8
,
dim
)
self
.
lay2
=
nn
.
Conv2d
(
dim
,
inter_dims
[
1
],
3
,
padding
=
1
)
self
.
gn2
=
nn
.
GroupNorm
(
8
,
inter_dims
[
1
])
self
.
lay3
=
nn
.
Conv2d
(
inter_dims
[
1
],
inter_dims
[
2
],
3
,
padding
=
1
)
self
.
gn3
=
nn
.
GroupNorm
(
8
,
inter_dims
[
2
])
self
.
lay4
=
nn
.
Conv2d
(
inter_dims
[
2
],
inter_dims
[
3
],
3
,
padding
=
1
)
self
.
gn4
=
nn
.
GroupNorm
(
8
,
inter_dims
[
3
])
self
.
lay5
=
nn
.
Conv2d
(
inter_dims
[
3
],
inter_dims
[
4
],
3
,
padding
=
1
)
self
.
gn5
=
nn
.
GroupNorm
(
8
,
inter_dims
[
4
])
self
.
out_lay
=
nn
.
Conv2d
(
inter_dims
[
4
],
1
,
3
,
padding
=
1
)
self
.
dim
=
dim
self
.
adapter1
=
torch
.
nn
.
Conv2d
(
fpn_dims
[
0
],
inter_dims
[
1
],
1
)
self
.
adapter2
=
torch
.
nn
.
Conv2d
(
fpn_dims
[
1
],
inter_dims
[
2
],
1
)
self
.
adapter3
=
torch
.
nn
.
Conv2d
(
fpn_dims
[
2
],
inter_dims
[
3
],
1
)
self
.
adapter1
=
nn
.
Conv2d
(
fpn_dims
[
0
],
inter_dims
[
1
],
1
)
self
.
adapter2
=
nn
.
Conv2d
(
fpn_dims
[
1
],
inter_dims
[
2
],
1
)
self
.
adapter3
=
nn
.
Conv2d
(
fpn_dims
[
2
],
inter_dims
[
3
],
1
)
for
m
in
self
.
modules
():
if
isinstance
(
m
,
nn
.
Conv2d
):
...
...
@@ -1748,34 +1747,34 @@ class DetrMaskHeadSmallConv(nn.Module):
x
=
self
.
lay1
(
x
)
x
=
self
.
gn1
(
x
)
x
=
F
.
relu
(
x
)
x
=
nn
.
functional
.
relu
(
x
)
x
=
self
.
lay2
(
x
)
x
=
self
.
gn2
(
x
)
x
=
F
.
relu
(
x
)
x
=
nn
.
functional
.
relu
(
x
)
cur_fpn
=
self
.
adapter1
(
fpns
[
0
])
if
cur_fpn
.
size
(
0
)
!=
x
.
size
(
0
):
cur_fpn
=
_expand
(
cur_fpn
,
x
.
size
(
0
)
//
cur_fpn
.
size
(
0
))
x
=
cur_fpn
+
F
.
interpolate
(
x
,
size
=
cur_fpn
.
shape
[
-
2
:],
mode
=
"nearest"
)
x
=
cur_fpn
+
nn
.
functional
.
interpolate
(
x
,
size
=
cur_fpn
.
shape
[
-
2
:],
mode
=
"nearest"
)
x
=
self
.
lay3
(
x
)
x
=
self
.
gn3
(
x
)
x
=
F
.
relu
(
x
)
x
=
nn
.
functional
.
relu
(
x
)
cur_fpn
=
self
.
adapter2
(
fpns
[
1
])
if
cur_fpn
.
size
(
0
)
!=
x
.
size
(
0
):
cur_fpn
=
_expand
(
cur_fpn
,
x
.
size
(
0
)
//
cur_fpn
.
size
(
0
))
x
=
cur_fpn
+
F
.
interpolate
(
x
,
size
=
cur_fpn
.
shape
[
-
2
:],
mode
=
"nearest"
)
x
=
cur_fpn
+
nn
.
functional
.
interpolate
(
x
,
size
=
cur_fpn
.
shape
[
-
2
:],
mode
=
"nearest"
)
x
=
self
.
lay4
(
x
)
x
=
self
.
gn4
(
x
)
x
=
F
.
relu
(
x
)
x
=
nn
.
functional
.
relu
(
x
)
cur_fpn
=
self
.
adapter3
(
fpns
[
2
])
if
cur_fpn
.
size
(
0
)
!=
x
.
size
(
0
):
cur_fpn
=
_expand
(
cur_fpn
,
x
.
size
(
0
)
//
cur_fpn
.
size
(
0
))
x
=
cur_fpn
+
F
.
interpolate
(
x
,
size
=
cur_fpn
.
shape
[
-
2
:],
mode
=
"nearest"
)
x
=
cur_fpn
+
nn
.
functional
.
interpolate
(
x
,
size
=
cur_fpn
.
shape
[
-
2
:],
mode
=
"nearest"
)
x
=
self
.
lay5
(
x
)
x
=
self
.
gn5
(
x
)
x
=
F
.
relu
(
x
)
x
=
nn
.
functional
.
relu
(
x
)
x
=
self
.
out_lay
(
x
)
return
x
...
...
@@ -1797,14 +1796,14 @@ class DetrMHAttentionMap(nn.Module):
def
forward
(
self
,
q
,
k
,
mask
:
Optional
[
Tensor
]
=
None
):
q
=
self
.
q_linear
(
q
)
k
=
F
.
conv2d
(
k
,
self
.
k_linear
.
weight
.
unsqueeze
(
-
1
).
unsqueeze
(
-
1
),
self
.
k_linear
.
bias
)
k
=
nn
.
functional
.
conv2d
(
k
,
self
.
k_linear
.
weight
.
unsqueeze
(
-
1
).
unsqueeze
(
-
1
),
self
.
k_linear
.
bias
)
queries_per_head
=
q
.
view
(
q
.
shape
[
0
],
q
.
shape
[
1
],
self
.
num_heads
,
self
.
hidden_dim
//
self
.
num_heads
)
keys_per_head
=
k
.
view
(
k
.
shape
[
0
],
self
.
num_heads
,
self
.
hidden_dim
//
self
.
num_heads
,
k
.
shape
[
-
2
],
k
.
shape
[
-
1
])
weights
=
torch
.
einsum
(
"bqnc,bnchw->bqnhw"
,
queries_per_head
*
self
.
normalize_fact
,
keys_per_head
)
if
mask
is
not
None
:
weights
.
masked_fill_
(
mask
.
unsqueeze
(
1
).
unsqueeze
(
1
),
float
(
"-inf"
))
weights
=
F
.
softmax
(
weights
.
flatten
(
2
),
dim
=-
1
).
view
(
weights
.
size
())
weights
=
nn
.
functional
.
softmax
(
weights
.
flatten
(
2
),
dim
=-
1
).
view
(
weights
.
size
())
weights
=
self
.
dropout
(
weights
)
return
weights
...
...
@@ -1847,7 +1846,7 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f
Loss tensor
"""
prob
=
inputs
.
sigmoid
()
ce_loss
=
F
.
binary_cross_entropy_with_logits
(
inputs
,
targets
,
reduction
=
"none"
)
ce_loss
=
nn
.
functional
.
binary_cross_entropy_with_logits
(
inputs
,
targets
,
reduction
=
"none"
)
p_t
=
prob
*
targets
+
(
1
-
prob
)
*
(
1
-
targets
)
loss
=
ce_loss
*
((
1
-
p_t
)
**
gamma
)
...
...
@@ -1909,7 +1908,7 @@ class DetrLoss(nn.Module):
)
target_classes
[
idx
]
=
target_classes_o
loss_ce
=
F
.
cross_entropy
(
src_logits
.
transpose
(
1
,
2
),
target_classes
,
self
.
empty_weight
)
loss_ce
=
nn
.
functional
.
cross_entropy
(
src_logits
.
transpose
(
1
,
2
),
target_classes
,
self
.
empty_weight
)
losses
=
{
"loss_ce"
:
loss_ce
}
return
losses
...
...
@@ -1926,7 +1925,7 @@ class DetrLoss(nn.Module):
tgt_lengths
=
torch
.
as_tensor
([
len
(
v
[
"class_labels"
])
for
v
in
targets
],
device
=
device
)
# Count the number of predictions that are NOT "no-object" (which is the last class)
card_pred
=
(
logits
.
argmax
(
-
1
)
!=
logits
.
shape
[
-
1
]
-
1
).
sum
(
1
)
card_err
=
F
.
l1_loss
(
card_pred
.
float
(),
tgt_lengths
.
float
())
card_err
=
nn
.
functional
.
l1_loss
(
card_pred
.
float
(),
tgt_lengths
.
float
())
losses
=
{
"cardinality_error"
:
card_err
}
return
losses
...
...
@@ -1942,7 +1941,7 @@ class DetrLoss(nn.Module):
src_boxes
=
outputs
[
"pred_boxes"
][
idx
]
target_boxes
=
torch
.
cat
([
t
[
"boxes"
][
i
]
for
t
,
(
_
,
i
)
in
zip
(
targets
,
indices
)],
dim
=
0
)
loss_bbox
=
F
.
l1_loss
(
src_boxes
,
target_boxes
,
reduction
=
"none"
)
loss_bbox
=
nn
.
functional
.
l1_loss
(
src_boxes
,
target_boxes
,
reduction
=
"none"
)
losses
=
{}
losses
[
"loss_bbox"
]
=
loss_bbox
.
sum
()
/
num_boxes
...
...
@@ -1972,7 +1971,7 @@ class DetrLoss(nn.Module):
target_masks
=
target_masks
[
tgt_idx
]
# upsample predictions to the target size
src_masks
=
F
.
interpolate
(
src_masks
=
nn
.
functional
.
interpolate
(
src_masks
[:,
None
],
size
=
target_masks
.
shape
[
-
2
:],
mode
=
"bilinear"
,
align_corners
=
False
)
src_masks
=
src_masks
[:,
0
].
flatten
(
1
)
...
...
@@ -2068,7 +2067,7 @@ class DetrMLPPredictionHead(nn.Module):
def
forward
(
self
,
x
):
for
i
,
layer
in
enumerate
(
self
.
layers
):
x
=
F
.
relu
(
layer
(
x
))
if
i
<
self
.
num_layers
-
1
else
layer
(
x
)
x
=
nn
.
functional
.
relu
(
layer
(
x
))
if
i
<
self
.
num_layers
-
1
else
layer
(
x
)
return
x
...
...
src/transformers/models/distilbert/modeling_distilbert.py
View file @
1ed2ebf6
...
...
@@ -23,7 +23,7 @@ import math
import
numpy
as
np
import
torch
import
torch.nn
as
nn
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
...activations
import
gelu
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment