Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
1ed2ebf6
Unverified
Commit
1ed2ebf6
authored
Jun 14, 2021
by
Stas Bekman
Committed by
GitHub
Jun 14, 2021
Browse files
[style] consistent nn. and nn.functional (#12124)
* consistent nn. and nn.functional * fix glitch * fix glitch #2
parent
ff7c8168
Changes
63
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
118 additions
and
123 deletions
+118
-123
src/transformers/models/openai/modeling_openai.py
src/transformers/models/openai/modeling_openai.py
+1
-1
src/transformers/models/pegasus/modeling_pegasus.py
src/transformers/models/pegasus/modeling_pegasus.py
+13
-14
src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
...vert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
+8
-8
src/transformers/models/prophetnet/modeling_prophetnet.py
src/transformers/models/prophetnet/modeling_prophetnet.py
+18
-17
src/transformers/models/rag/modeling_rag.py
src/transformers/models/rag/modeling_rag.py
+4
-3
src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
...s/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
+4
-3
src/transformers/models/reformer/modeling_reformer.py
src/transformers/models/reformer/modeling_reformer.py
+1
-1
src/transformers/models/retribert/modeling_retribert.py
src/transformers/models/retribert/modeling_retribert.py
+1
-1
src/transformers/models/roberta/modeling_roberta.py
src/transformers/models/roberta/modeling_roberta.py
+1
-1
src/transformers/models/speech_to_text/modeling_speech_to_text.py
...sformers/models/speech_to_text/modeling_speech_to_text.py
+13
-14
src/transformers/models/squeezebert/modeling_squeezebert.py
src/transformers/models/squeezebert/modeling_squeezebert.py
+1
-1
src/transformers/models/t5/modeling_t5.py
src/transformers/models/t5/modeling_t5.py
+4
-5
src/transformers/models/tapas/modeling_tapas.py
src/transformers/models/tapas/modeling_tapas.py
+4
-6
src/transformers/models/transfo_xl/modeling_transfo_xl.py
src/transformers/models/transfo_xl/modeling_transfo_xl.py
+4
-5
src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
...ormers/models/transfo_xl/modeling_transfo_xl_utilities.py
+11
-12
src/transformers/models/visual_bert/modeling_visual_bert.py
src/transformers/models/visual_bert/modeling_visual_bert.py
+4
-4
src/transformers/models/wav2vec2/modeling_wav2vec2.py
src/transformers/models/wav2vec2/modeling_wav2vec2.py
+11
-12
src/transformers/models/xlm/modeling_tf_xlm.py
src/transformers/models/xlm/modeling_tf_xlm.py
+1
-1
src/transformers/models/xlm/modeling_xlm.py
src/transformers/models/xlm/modeling_xlm.py
+10
-9
src/transformers/models/xlnet/modeling_xlnet.py
src/transformers/models/xlnet/modeling_xlnet.py
+4
-5
No files found.
src/transformers/models/openai/modeling_openai.py
View file @
1ed2ebf6
...
...
@@ -23,7 +23,7 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
gelu_new
,
silu
...
...
src/transformers/models/pegasus/modeling_pegasus.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ from typing import Optional, Tuple
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -239,7 +238,7 @@ class PegasusAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -259,7 +258,7 @@ class PegasusAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -321,15 +320,15 @@ class PegasusEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -417,7 +416,7 @@ class PegasusDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -437,7 +436,7 @@ class PegasusDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -447,9 +446,9 @@ class PegasusDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -629,7 +628,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
Args:
config: PegasusConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
PegasusConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -729,7 +728,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -797,7 +796,7 @@ class PegasusDecoder(PegasusPreTrainedModel):
Args:
config: PegasusConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
PegasusConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -969,7 +968,7 @@ class PegasusDecoder(PegasusPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -17,7 +17,7 @@
import
argparse
import
torch
from
torch
import
nn
from
transformers
import
ProphetNetForConditionalGeneration
,
XLMProphetNetForConditionalGeneration
,
logging
...
...
@@ -107,15 +107,15 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
param
.
weight
.
shape
==
old_model
.
in_proj_weight
[:
embed_dim
,
:].
shape
,
"Shapes have to match"
param
.
bias
.
shape
==
old_model
.
in_proj_bias
[:
embed_dim
].
shape
,
"Shapes have to match"
if
attribute
==
"query_proj"
:
model
.
query_proj
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_weight
[:
embed_dim
,
:])
model
.
query_proj
.
bias
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_bias
[:
embed_dim
])
model
.
query_proj
.
weight
=
nn
.
Parameter
(
old_model
.
in_proj_weight
[:
embed_dim
,
:])
model
.
query_proj
.
bias
=
nn
.
Parameter
(
old_model
.
in_proj_bias
[:
embed_dim
])
elif
attribute
==
"key_proj"
:
model
.
key_proj
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_weight
[
embed_dim
:
2
*
embed_dim
,
:])
model
.
key_proj
.
bias
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_bias
[
embed_dim
:
2
*
embed_dim
])
model
.
key_proj
.
weight
=
nn
.
Parameter
(
old_model
.
in_proj_weight
[
embed_dim
:
2
*
embed_dim
,
:])
model
.
key_proj
.
bias
=
nn
.
Parameter
(
old_model
.
in_proj_bias
[
embed_dim
:
2
*
embed_dim
])
elif
attribute
==
"value_proj"
:
model
.
value_proj
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_weight
[
2
*
embed_dim
:,
:])
model
.
value_proj
.
bias
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_bias
[
2
*
embed_dim
:])
model
.
value_proj
.
weight
=
nn
.
Parameter
(
old_model
.
in_proj_weight
[
2
*
embed_dim
:,
:])
model
.
value_proj
.
bias
=
nn
.
Parameter
(
old_model
.
in_proj_bias
[
2
*
embed_dim
:])
is_key_init
=
True
break
elif
attribute
==
"position_embeddings"
:
...
...
@@ -123,7 +123,7 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
model
.
position_embeddings
.
weight
.
shape
[
-
1
]
==
old_model
.
embed_positions
.
weight
.
shape
[
-
1
]
),
"Hidden size has to match"
assert
model
.
position_embeddings
.
weight
.
shape
[
0
]
==
512
,
"We want 512 position_embeddings."
model
.
position_embeddings
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
embed_positions
.
weight
[:
512
,
:])
model
.
position_embeddings
.
weight
=
nn
.
Parameter
(
old_model
.
embed_positions
.
weight
[:
512
,
:])
is_key_init
=
True
break
...
...
src/transformers/models/prophetnet/modeling_prophetnet.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
Tensor
,
nn
from
torch.nn
import
LayerNorm
...
...
@@ -183,9 +182,9 @@ PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
def
softmax
(
hidden_state
,
dim
,
onnx_trace
=
False
):
if
onnx_trace
:
return
F
.
softmax
(
hidden_state
.
float
(),
dim
=
dim
)
return
nn
.
functional
.
softmax
(
hidden_state
.
float
(),
dim
=
dim
)
else
:
return
F
.
softmax
(
hidden_state
,
dim
=
dim
,
dtype
=
torch
.
float32
)
return
nn
.
functional
.
softmax
(
hidden_state
,
dim
=
dim
,
dtype
=
torch
.
float32
)
def
ngram_attention_bias
(
sequence_length
,
ngram
,
device
,
dtype
):
...
...
@@ -732,7 +731,7 @@ class ProphetNetAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -746,7 +745,7 @@ class ProphetNetAttention(nn.Module):
# apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model
attn_weights_reshaped
=
layer_head_mask
.
view
(
1
,
-
1
,
1
,
1
)
*
attn_weights_reshaped
attn_probs
=
F
.
dropout
(
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
,
...
...
@@ -767,7 +766,7 @@ class ProphetNetAttention(nn.Module):
attn_output
=
self
.
out_proj
(
attn_output
)
attn_output
=
F
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
nn
.
functional
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
attn_output
,
attn_weights_reshaped
,
past_key_value
...
...
@@ -788,9 +787,9 @@ class ProphetNetFeedForward(nn.Module):
hidden_states
=
self
.
intermediate
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
output
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
hidden_states
...
...
@@ -924,7 +923,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
)
main_attn_probs
=
main_attn_probs
.
view
(
batch_size
*
self
.
num_attn_heads
,
-
1
,
sequence_length
)
main_attn_probs
=
F
.
dropout
(
main_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
main_attn_probs
=
nn
.
functional
.
dropout
(
main_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
# project to attn_output
main_attn_output
=
torch
.
bmm
(
main_attn_probs
,
main_value_states
)
...
...
@@ -989,7 +988,9 @@ class ProphetNetNgramSelfAttention(nn.Module):
self
.
ngram
,
batch_size
*
self
.
num_attn_heads
,
sequence_length
,
2
*
sequence_length
)
predict_attn_probs
=
F
.
dropout
(
predict_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
predict_attn_probs
=
nn
.
functional
.
dropout
(
predict_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
# project to attention output
# [ngram, B*head, T, c]
predict_attn_output
=
torch
.
einsum
(
"nbts,nbsc->nbtc"
,
(
predict_attn_probs
,
predict_value_states
))
...
...
@@ -1012,7 +1013,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
self
.
ngram
,
batch_size
,
self
.
num_attn_heads
,
sequence_length
,
-
1
).
transpose
(
0
,
1
)
attn_output
=
F
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
nn
.
functional
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
attn_output
,
main_attn_probs
,
predict_attn_probs
,
past_key_value
...
...
@@ -1321,7 +1322,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
hidden_states
=
inputs_embeds
+
position_embeddings
hidden_states
=
self
.
embeddings_layer_norm
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
config
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
config
.
dropout
,
training
=
self
.
training
)
encoder_hidden_states
=
()
if
output_hidden_states
else
None
all_attentions
=
()
if
output_attentions
else
None
...
...
@@ -1538,7 +1539,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
if
self
.
embeddings_layer_norm
:
hidden_states
=
self
.
embeddings_layer_norm
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# init attentions, hidden_states and cache with empty tuples
all_main_stream_hidden_states
=
()
if
output_hidden_states
else
None
...
...
@@ -1995,13 +1996,13 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
break
expend_targets
[
i
,
:,
:]
=
labels
lprobs
=
F
.
log_softmax
(
lprobs
=
nn
.
functional
.
log_softmax
(
logits
.
view
(
-
1
,
logits
.
size
(
-
1
)),
dim
=-
1
,
dtype
=
torch
.
float32
,
)
loss
=
F
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
loss
=
nn
.
functional
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
if
self
.
config
.
eps
>
0.0
:
smooth_loss
=
-
lprobs
.
sum
(
dim
=-
1
,
keepdim
=
True
)
...
...
@@ -2239,13 +2240,13 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
break
expend_targets
[
i
,
:,
:]
=
labels
lprobs
=
F
.
log_softmax
(
lprobs
=
nn
.
functional
.
log_softmax
(
logits
.
view
(
-
1
,
logits
.
size
(
-
1
)),
dim
=-
1
,
dtype
=
torch
.
float32
,
)
loss
=
F
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
loss
=
nn
.
functional
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
if
self
.
config
.
eps
>
0.0
:
smooth_loss
=
-
lprobs
.
sum
(
dim
=-
1
,
keepdim
=
True
)
...
...
src/transformers/models/rag/modeling_rag.py
View file @
1ed2ebf6
...
...
@@ -18,6 +18,7 @@ from dataclasses import dataclass
from
typing
import
Callable
,
List
,
Optional
,
Tuple
import
torch
from
torch
import
nn
from
...configuration_utils
import
PretrainedConfig
from
...file_utils
import
add_start_docstrings_to_model_forward
,
replace_return_docstrings
...
...
@@ -1065,10 +1066,10 @@ class RagSequenceForGeneration(RagPreTrainedModel):
return
ll
.
squeeze
(
-
1
),
smooth_obj
.
squeeze
(
-
1
)
# seq_logits dim = (batch*n_docs, tgt_len , #vocabs)
seq_logprobs
=
torch
.
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logprobs
=
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logits
.
shape
[
0
]
//
n_docs
,
n_docs
,
-
1
,
seq_logits
.
size
(
-
1
)
)
# batch_size x n_docs x tgt_len x #vocab_size
doc_logprobs
=
torch
.
nn
.
functional
.
log_softmax
(
doc_scores
,
dim
=
1
).
unsqueeze
(
-
1
).
unsqueeze
(
-
1
)
doc_logprobs
=
nn
.
functional
.
log_softmax
(
doc_scores
,
dim
=
1
).
unsqueeze
(
-
1
).
unsqueeze
(
-
1
)
# RAG-sequence marginalization
first_token_scores
=
seq_logprobs
[:,
:,
:
1
,
:]
...
...
@@ -1212,7 +1213,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
n_docs
=
n_docs
if
n_docs
is
not
None
else
self
.
config
.
n_docs
# RAG-token marginalization
seq_logprobs
=
torch
.
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logprobs
=
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logits
.
shape
[
0
]
//
n_docs
,
n_docs
,
-
1
,
seq_logits
.
size
(
-
1
)
)
doc_logprobs
=
torch
.
log_softmax
(
doc_scores
,
dim
=
1
)
...
...
src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -20,6 +20,7 @@ import pickle
import
numpy
as
np
import
torch
from
torch
import
nn
from
transformers
import
ReformerConfig
,
ReformerModelWithLMHead
from
transformers.utils
import
logging
...
...
@@ -31,10 +32,10 @@ logging.set_verbosity_info()
def
set_param
(
torch_layer
,
weight
,
bias
=
None
):
# set parameter of one layer
assert
torch_layer
.
weight
.
shape
==
weight
.
shape
,
f
"
{
torch_layer
}
layer.weight does not match"
torch_layer
.
weight
=
torch
.
nn
.
Parameter
(
weight
)
torch_layer
.
weight
=
nn
.
Parameter
(
weight
)
if
bias
is
not
None
:
assert
torch_layer
.
bias
.
shape
==
bias
.
shape
,
f
"
{
torch_layer
}
layer.bias does not match"
torch_layer
.
bias
=
torch
.
nn
.
Parameter
(
bias
)
torch_layer
.
bias
=
nn
.
Parameter
(
bias
)
def
set_layer_weights_in_torch_lsh
(
weights
,
torch_layer
,
hidden_size
):
...
...
@@ -153,7 +154,7 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
assert
(
position_embeddings
.
weights
[
emb_idx
].
shape
==
emb_weights
.
shape
),
f
"
{
position_embeddings
[
emb_idx
]
}
emb does not match"
position_embeddings
.
weights
[
emb_idx
]
=
torch
.
nn
.
Parameter
(
torch
.
tensor
(
emb_weights
))
position_embeddings
.
weights
[
emb_idx
]
=
nn
.
Parameter
(
torch
.
tensor
(
emb_weights
))
trax_layer_weights
=
weights
[
5
]
assert
len
(
torch_model_reformer
.
encoder
.
layers
)
*
4
==
len
(
...
...
src/transformers/models/reformer/modeling_reformer.py
View file @
1ed2ebf6
...
...
@@ -1782,7 +1782,7 @@ class ReformerPreTrainedModel(PreTrainedModel):
"""Initialize the weights"""
if
isinstance
(
module
,
AxialPositionEmbeddings
):
for
weight
in
module
.
weights
:
torch
.
nn
.
init
.
normal_
(
weight
,
std
=
self
.
config
.
axial_norm_std
)
nn
.
init
.
normal_
(
weight
,
std
=
self
.
config
.
axial_norm_std
)
elif
isinstance
(
module
,
nn
.
Embedding
):
module
.
weight
.
data
.
normal_
(
mean
=
0.0
,
std
=
self
.
config
.
initializer_range
)
if
module
.
padding_idx
is
not
None
:
...
...
src/transformers/models/retribert/modeling_retribert.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@ RetriBERT model
import
math
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
as
checkpoint
from
torch
import
nn
from
...file_utils
import
add_start_docstrings
from
...modeling_utils
import
PreTrainedModel
...
...
src/transformers/models/roberta/modeling_roberta.py
View file @
1ed2ebf6
...
...
@@ -18,8 +18,8 @@
import
math
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
,
gelu
...
...
src/transformers/models/speech_to_text/modeling_speech_to_text.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,6 @@ import random
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -306,7 +305,7 @@ class Speech2TextAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -326,7 +325,7 @@ class Speech2TextAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -387,15 +386,15 @@ class Speech2TextEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -482,7 +481,7 @@ class Speech2TextDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -502,7 +501,7 @@ class Speech2TextDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -512,9 +511,9 @@ class Speech2TextDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -686,7 +685,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
Args:
config: Speech2TextConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
Speech2TextConfig
):
...
...
@@ -772,7 +771,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
embed_pos
=
self
.
embed_positions
(
padding_mask
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -840,7 +839,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
Args:
config: Speech2TextConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
Speech2TextConfig
):
...
...
@@ -1008,7 +1007,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
positions
=
self
.
embed_positions
(
input_ids
,
past_key_values_length
=
past_key_values_length
)
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/squeezebert/modeling_squeezebert.py
View file @
1ed2ebf6
...
...
@@ -92,7 +92,7 @@ class SqueezeBertEmbeddings(nn.Module):
return
embeddings
class
MatMulWrapper
(
torch
.
nn
.
Module
):
class
MatMulWrapper
(
nn
.
Module
):
"""
Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
...
...
src/transformers/models/t5/modeling_t5.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ import os
import
warnings
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
from
torch.utils.checkpoint
import
checkpoint
...
...
@@ -179,7 +178,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
####################################################
# PyTorch Models are constructed by sub-classing
# - torch.nn.Module for the layers and
# - PreTrainedModel for the models (it-self a sub-class of
torch.
nn.Module)
# - PreTrainedModel for the models (it-self a sub-class of nn.Module)
####################################################
PARALLELIZE_DOCSTRING
=
r
"""
This is an experimental feature and is a subject to change at a moment's notice.
...
...
@@ -257,7 +256,7 @@ class T5DenseReluDense(nn.Module):
def
forward
(
self
,
hidden_states
):
hidden_states
=
self
.
wi
(
hidden_states
)
hidden_states
=
F
.
relu
(
hidden_states
)
hidden_states
=
nn
.
functional
.
relu
(
hidden_states
)
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
wo
(
hidden_states
)
return
hidden_states
...
...
@@ -502,10 +501,10 @@ class T5Attention(nn.Module):
position_bias
=
position_bias
+
mask
# (batch_size, n_heads, seq_length, key_length)
scores
+=
position_bias
attn_weights
=
F
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
attn_weights
=
nn
.
functional
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
scores
)
# (batch_size, n_heads, seq_length, key_length)
attn_weights
=
F
.
dropout
(
attn_weights
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# (batch_size, n_heads, seq_length, key_length)
...
...
src/transformers/models/tapas/modeling_tapas.py
View file @
1ed2ebf6
...
...
@@ -22,8 +22,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
...
...
@@ -2096,10 +2096,8 @@ def _calculate_aggregation_loss_known(
# Use aggregation supervision as the target.
target_aggregation
=
aggregation_labels
one_hot_labels
=
torch
.
nn
.
functional
.
one_hot
(
target_aggregation
,
num_classes
=
num_aggregation_labels
).
type
(
torch
.
float32
)
log_probs
=
torch
.
nn
.
functional
.
log_softmax
(
logits_aggregation
,
dim
=-
1
)
one_hot_labels
=
nn
.
functional
.
one_hot
(
target_aggregation
,
num_classes
=
num_aggregation_labels
).
type
(
torch
.
float32
)
log_probs
=
nn
.
functional
.
log_softmax
(
logits_aggregation
,
dim
=-
1
)
# torch.FloatTensor[batch_size]
per_example_aggregation_intermediate
=
-
torch
.
sum
(
one_hot_labels
*
log_probs
,
dim
=-
1
)
...
...
@@ -2243,7 +2241,7 @@ def _calculate_expected_result(
aggregation_op_only_probs
=
gumbel_dist
.
sample
()
else
:
# <float32>[batch_size, num_aggregation_labels - 1]
aggregation_op_only_probs
=
torch
.
nn
.
functional
.
softmax
(
aggregation_op_only_probs
=
nn
.
functional
.
softmax
(
logits_aggregation
[:,
1
:]
/
config
.
aggregation_temperature
,
dim
=-
1
)
...
...
src/transformers/models/transfo_xl/modeling_transfo_xl.py
View file @
1ed2ebf6
...
...
@@ -21,8 +21,7 @@ from dataclasses import dataclass
from
typing
import
List
,
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...file_utils
import
(
...
...
@@ -344,7 +343,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[:,
:,
:,
None
],
-
1e30
).
type_as
(
attn_score
)
# [qlen x klen x bsz x n_head]
attn_prob
=
F
.
softmax
(
attn_score
,
dim
=
1
)
attn_prob
=
nn
.
functional
.
softmax
(
attn_score
,
dim
=
1
)
attn_prob
=
self
.
dropatt
(
attn_prob
)
# Mask heads if we want to
...
...
@@ -434,7 +433,7 @@ class AdaptiveEmbedding(nn.Module):
if
self
.
div_val
==
1
:
embed
=
self
.
emb_layers
[
0
](
inp
)
if
self
.
d_proj
!=
self
.
d_embed
:
embed
=
F
.
linear
(
embed
,
self
.
emb_projs
[
0
])
embed
=
nn
.
functional
.
linear
(
embed
,
self
.
emb_projs
[
0
])
else
:
param
=
next
(
self
.
parameters
())
inp_flat
=
inp
.
view
(
-
1
)
...
...
@@ -450,7 +449,7 @@ class AdaptiveEmbedding(nn.Module):
inp_i
=
inp_flat
.
index_select
(
0
,
indices_i
)
-
l_idx
emb_i
=
self
.
emb_layers
[
i
](
inp_i
)
emb_i
=
F
.
linear
(
emb_i
,
self
.
emb_projs
[
i
])
emb_i
=
nn
.
functional
.
linear
(
emb_i
,
self
.
emb_projs
[
i
])
emb_flat
.
index_copy_
(
0
,
indices_i
,
emb_i
)
...
...
src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
View file @
1ed2ebf6
...
...
@@ -19,8 +19,7 @@
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch
import
nn
# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
...
...
@@ -71,11 +70,11 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
def
_compute_logit
(
self
,
hidden
,
weight
,
bias
,
proj
):
if
proj
is
None
:
logit
=
F
.
linear
(
hidden
,
weight
,
bias
=
bias
)
logit
=
nn
.
functional
.
linear
(
hidden
,
weight
,
bias
=
bias
)
else
:
# if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
proj_hid
=
F
.
linear
(
hidden
,
proj
.
t
().
contiguous
())
logit
=
F
.
linear
(
proj_hid
,
weight
,
bias
=
bias
)
proj_hid
=
nn
.
functional
.
linear
(
hidden
,
proj
.
t
().
contiguous
())
logit
=
nn
.
functional
.
linear
(
proj_hid
,
weight
,
bias
=
bias
)
# else:
# logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
# if bias is not None:
...
...
@@ -110,9 +109,9 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
if
self
.
n_clusters
==
0
:
logit
=
self
.
_compute_logit
(
hidden
,
self
.
out_layers
[
0
].
weight
,
self
.
out_layers
[
0
].
bias
,
self
.
out_projs
[
0
])
if
labels
is
not
None
:
out
=
-
F
.
log_softmax
(
logit
,
dim
=-
1
).
gather
(
1
,
labels
.
unsqueeze
(
1
)).
squeeze
(
1
)
out
=
-
nn
.
functional
.
log_softmax
(
logit
,
dim
=-
1
).
gather
(
1
,
labels
.
unsqueeze
(
1
)).
squeeze
(
1
)
else
:
out
=
F
.
log_softmax
(
logit
,
dim
=-
1
)
out
=
nn
.
functional
.
log_softmax
(
logit
,
dim
=-
1
)
else
:
# construct weights and biases
weights
,
biases
=
[],
[]
...
...
@@ -135,7 +134,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
head_weight
,
head_bias
,
head_proj
=
weights
[
0
],
biases
[
0
],
self
.
out_projs
[
0
]
head_logit
=
self
.
_compute_logit
(
hidden
,
head_weight
,
head_bias
,
head_proj
)
head_logprob
=
F
.
log_softmax
(
head_logit
,
dim
=
1
)
head_logprob
=
nn
.
functional
.
log_softmax
(
head_logit
,
dim
=
1
)
if
labels
is
None
:
out
=
hidden
.
new_empty
((
head_logit
.
size
(
0
),
self
.
n_token
))
...
...
@@ -169,7 +168,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
weight_i
,
bias_i
,
proj_i
=
weights
[
i
],
biases
[
i
],
self
.
out_projs
[
i
]
tail_logit_i
=
self
.
_compute_logit
(
hidden_i
,
weight_i
,
bias_i
,
proj_i
)
tail_logprob_i
=
F
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
tail_logprob_i
=
nn
.
functional
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
cluster_prob_idx
=
self
.
cutoffs
[
0
]
+
i
-
1
# No probability for the head cluster
if
labels
is
not
None
:
logprob_i
=
head_logprob_i
[:,
cluster_prob_idx
]
+
tail_logprob_i
.
gather
(
...
...
@@ -205,7 +204,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
"""
if
self
.
n_clusters
==
0
:
logit
=
self
.
_compute_logit
(
hidden
,
self
.
out_layers
[
0
].
weight
,
self
.
out_layers
[
0
].
bias
,
self
.
out_projs
[
0
])
return
F
.
log_softmax
(
logit
,
dim
=-
1
)
return
nn
.
functional
.
log_softmax
(
logit
,
dim
=-
1
)
else
:
# construct weights and biases
weights
,
biases
=
[],
[]
...
...
@@ -229,7 +228,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
head_logit
=
self
.
_compute_logit
(
hidden
,
head_weight
,
head_bias
,
head_proj
)
out
=
hidden
.
new_empty
((
head_logit
.
size
(
0
),
self
.
n_token
))
head_logprob
=
F
.
log_softmax
(
head_logit
,
dim
=
1
)
head_logprob
=
nn
.
functional
.
log_softmax
(
head_logit
,
dim
=
1
)
cutoff_values
=
[
0
]
+
self
.
cutoffs
for
i
in
range
(
len
(
cutoff_values
)
-
1
):
...
...
@@ -241,7 +240,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
weight_i
,
bias_i
,
proj_i
=
weights
[
i
],
biases
[
i
],
self
.
out_projs
[
i
]
tail_logit_i
=
self
.
_compute_logit
(
hidden
,
weight_i
,
bias_i
,
proj_i
)
tail_logprob_i
=
F
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
tail_logprob_i
=
nn
.
functional
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
logprob_i
=
head_logprob
[:,
-
i
]
+
tail_logprob_i
out
[:,
start_idx
,
stop_idx
]
=
logprob_i
...
...
src/transformers/models/visual_bert/modeling_visual_bert.py
View file @
1ed2ebf6
...
...
@@ -89,10 +89,10 @@ class VisualBertEmbeddings(nn.Module):
self
.
visual_position_embeddings
=
nn
.
Embedding
(
config
.
max_position_embeddings
,
config
.
hidden_size
)
if
config
.
special_visual_initialize
:
self
.
visual_token_type_embeddings
.
weight
.
data
=
torch
.
nn
.
Parameter
(
self
.
visual_token_type_embeddings
.
weight
.
data
=
nn
.
Parameter
(
self
.
token_type_embeddings
.
weight
.
data
.
clone
(),
requires_grad
=
True
)
self
.
visual_position_embeddings
.
weight
.
data
=
torch
.
nn
.
Parameter
(
self
.
visual_position_embeddings
.
weight
.
data
=
nn
.
Parameter
(
self
.
position_embeddings
.
weight
.
data
.
clone
(),
requires_grad
=
True
)
...
...
@@ -1253,8 +1253,8 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
loss
=
None
if
labels
is
not
None
:
loss_fct
=
torch
.
nn
.
KLDivLoss
(
reduction
=
"batchmean"
)
log_softmax
=
torch
.
nn
.
LogSoftmax
(
dim
=-
1
)
loss_fct
=
nn
.
KLDivLoss
(
reduction
=
"batchmean"
)
log_softmax
=
nn
.
LogSoftmax
(
dim
=-
1
)
reshaped_logits
=
log_softmax
(
reshaped_logits
)
loss
=
loss_fct
(
reshaped_logits
,
labels
.
contiguous
())
if
not
return_dict
:
...
...
src/transformers/models/wav2vec2/modeling_wav2vec2.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,6 @@ from typing import Optional, Tuple, Union
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
...
...
@@ -449,7 +448,7 @@ class Wav2Vec2Attention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -469,7 +468,7 @@ class Wav2Vec2Attention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -805,9 +804,9 @@ class Wav2Vec2GumbelVectorQuantizer(nn.Module):
if
self
.
training
:
# sample code vector probs via gumbel in differentiateable way
codevector_probs
=
F
.
gumbel_softmax
(
hidden_states
.
float
(),
tau
=
self
.
temperature
,
hard
=
True
).
type_as
(
hidden_states
)
codevector_probs
=
nn
.
functional
.
gumbel_softmax
(
hidden_states
.
float
(),
tau
=
self
.
temperature
,
hard
=
True
)
.
type_as
(
hidden_states
)
# compute perplexity
codevector_soft_dist
=
torch
.
softmax
(
...
...
@@ -867,12 +866,12 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
if
hasattr
(
module
,
"weight_v"
)
and
hasattr
(
module
,
"weight_g"
):
with
deepspeed
.
zero
.
GatheredParameters
([
module
.
weight_v
,
module
.
weight_g
],
modifier_rank
=
0
):
torch
.
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
else
:
with
deepspeed
.
zero
.
GatheredParameters
(
module
.
weight
,
modifier_rank
=
0
):
torch
.
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
else
:
torch
.
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
if
isinstance
(
module
,
(
nn
.
Linear
,
nn
.
Conv1d
))
and
module
.
bias
is
not
None
:
module
.
bias
.
data
.
zero_
()
...
...
@@ -1296,7 +1295,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
# -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa))
preds
=
logits
.
transpose
(
0
,
2
).
reshape
(
-
1
,
logits
.
size
(
0
))
target
=
((
1
-
mask_time_indices
.
long
())
*
-
100
).
transpose
(
0
,
1
).
flatten
()
contrastive_loss
=
F
.
cross_entropy
(
preds
.
float
(),
target
,
reduction
=
"sum"
)
contrastive_loss
=
nn
.
functional
.
cross_entropy
(
preds
.
float
(),
target
,
reduction
=
"sum"
)
# 7. compute diversity loss: \mathbf{L}_d
num_codevectors
=
self
.
config
.
num_codevectors_per_group
*
self
.
config
.
num_codevector_groups
...
...
@@ -1502,10 +1501,10 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
flattened_targets
=
labels
.
masked_select
(
labels_mask
)
# ctc_loss doesn't support fp16
log_probs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
).
transpose
(
0
,
1
)
log_probs
=
nn
.
functional
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
).
transpose
(
0
,
1
)
with
torch
.
backends
.
cudnn
.
flags
(
enabled
=
False
):
loss
=
F
.
ctc_loss
(
loss
=
nn
.
functional
.
ctc_loss
(
log_probs
,
flattened_targets
,
input_lengths
,
...
...
src/transformers/models/xlm/modeling_tf_xlm.py
View file @
1ed2ebf6
...
...
@@ -503,7 +503,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
src/transformers/models/xlm/modeling_xlm.py
View file @
1ed2ebf6
...
...
@@ -25,7 +25,6 @@ import numpy as np
import
torch
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
gelu
from
...file_utils
import
(
...
...
@@ -190,8 +189,8 @@ class MultiHeadAttention(nn.Module):
mask
=
(
mask
==
0
).
view
(
mask_reshape
).
expand_as
(
scores
)
# (bs, n_heads, qlen, klen)
scores
.
masked_fill_
(
mask
,
-
float
(
"inf"
))
# (bs, n_heads, qlen, klen)
weights
=
F
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
scores
)
# (bs, n_heads, qlen, klen)
weights
=
F
.
dropout
(
weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# (bs, n_heads, qlen, klen)
weights
=
nn
.
functional
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
scores
)
# (bs, n_heads, qlen, klen)
weights
=
nn
.
functional
.
dropout
(
weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# (bs, n_heads, qlen, klen)
# Mask heads if we want to
if
head_mask
is
not
None
:
...
...
@@ -212,7 +211,7 @@ class TransformerFFN(nn.Module):
self
.
dropout
=
config
.
dropout
self
.
lin1
=
nn
.
Linear
(
in_dim
,
dim_hidden
)
self
.
lin2
=
nn
.
Linear
(
dim_hidden
,
out_dim
)
self
.
act
=
gelu
if
config
.
gelu_activation
else
F
.
relu
self
.
act
=
gelu
if
config
.
gelu_activation
else
nn
.
functional
.
relu
self
.
chunk_size_feed_forward
=
config
.
chunk_size_feed_forward
self
.
seq_len_dim
=
1
...
...
@@ -223,7 +222,7 @@ class TransformerFFN(nn.Module):
x
=
self
.
lin1
(
input
)
x
=
self
.
act
(
x
)
x
=
self
.
lin2
(
x
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
x
...
...
@@ -578,7 +577,7 @@ class XLMModel(XLMPreTrainedModel):
if
token_type_ids
is
not
None
:
tensor
=
tensor
+
self
.
embeddings
(
token_type_ids
)
tensor
=
self
.
layer_norm_emb
(
tensor
)
tensor
=
F
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
nn
.
functional
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
*=
mask
.
unsqueeze
(
-
1
).
to
(
tensor
.
dtype
)
# transformer layers
...
...
@@ -599,14 +598,14 @@ class XLMModel(XLMPreTrainedModel):
attn
=
attn_outputs
[
0
]
if
output_attentions
:
attentions
=
attentions
+
(
attn_outputs
[
1
],)
attn
=
F
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn
=
nn
.
functional
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
tensor
+
attn
tensor
=
self
.
layer_norm1
[
i
](
tensor
)
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
@@ -661,7 +660,9 @@ class XLMPredLayer(nn.Module):
scores
=
self
.
proj
(
x
)
outputs
=
(
scores
,)
+
outputs
if
y
is
not
None
:
loss
=
F
.
cross_entropy
(
scores
.
view
(
-
1
,
self
.
n_words
),
y
.
view
(
-
1
),
reduction
=
"elementwise_mean"
)
loss
=
nn
.
functional
.
cross_entropy
(
scores
.
view
(
-
1
,
self
.
n_words
),
y
.
view
(
-
1
),
reduction
=
"elementwise_mean"
)
outputs
=
(
loss
,)
+
outputs
else
:
scores
=
self
.
proj
.
log_prob
(
x
)
...
...
src/transformers/models/xlnet/modeling_xlnet.py
View file @
1ed2ebf6
...
...
@@ -23,7 +23,6 @@ from typing import List, Optional, Tuple
import
torch
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
ACT2FN
from
...file_utils
import
(
...
...
@@ -305,7 +304,7 @@ class XLNetRelativeAttention(nn.Module):
attn_score
=
attn_score
-
1e30
*
torch
.
einsum
(
"ijbn->bnij"
,
attn_mask
)
# attention probability
attn_prob
=
F
.
softmax
(
attn_score
,
dim
=
3
)
attn_prob
=
nn
.
functional
.
softmax
(
attn_score
,
dim
=
3
)
attn_prob
=
self
.
dropout
(
attn_prob
)
# Mask heads if we want to
...
...
@@ -1208,7 +1207,7 @@ class XLNetModel(XLNetPreTrainedModel):
# `1` indicates not in the same segment [qlen x klen x bsz]
seg_mat
=
(
token_type_ids
[:,
None
]
!=
cat_ids
[
None
,
:]).
long
()
seg_mat
=
F
.
one_hot
(
seg_mat
,
num_classes
=
2
).
to
(
dtype_float
)
seg_mat
=
nn
.
functional
.
one_hot
(
seg_mat
,
num_classes
=
2
).
to
(
dtype_float
)
else
:
seg_mat
=
None
...
...
@@ -2034,7 +2033,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
else
:
# during inference, compute the end logits based on beam search
bsz
,
slen
,
hsz
=
hidden_states
.
size
()
start_log_probs
=
F
.
softmax
(
start_logits
,
dim
=-
1
)
# shape (bsz, slen)
start_log_probs
=
nn
.
functional
.
softmax
(
start_logits
,
dim
=-
1
)
# shape (bsz, slen)
start_top_log_probs
,
start_top_index
=
torch
.
topk
(
start_log_probs
,
self
.
start_n_top
,
dim
=-
1
...
...
@@ -2048,7 +2047,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
)
# shape (bsz, slen, start_n_top, hsz)
p_mask
=
p_mask
.
unsqueeze
(
-
1
)
if
p_mask
is
not
None
else
None
end_logits
=
self
.
end_logits
(
hidden_states_expanded
,
start_states
=
start_states
,
p_mask
=
p_mask
)
end_log_probs
=
F
.
softmax
(
end_logits
,
dim
=
1
)
# shape (bsz, slen, start_n_top)
end_log_probs
=
nn
.
functional
.
softmax
(
end_logits
,
dim
=
1
)
# shape (bsz, slen, start_n_top)
end_top_log_probs
,
end_top_index
=
torch
.
topk
(
end_log_probs
,
self
.
end_n_top
,
dim
=
1
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment