Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
1ed2ebf6
Unverified
Commit
1ed2ebf6
authored
Jun 14, 2021
by
Stas Bekman
Committed by
GitHub
Jun 14, 2021
Browse files
[style] consistent nn. and nn.functional (#12124)
* consistent nn. and nn.functional * fix glitch * fix glitch #2
parent
ff7c8168
Changes
63
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
118 additions
and
123 deletions
+118
-123
src/transformers/models/openai/modeling_openai.py
src/transformers/models/openai/modeling_openai.py
+1
-1
src/transformers/models/pegasus/modeling_pegasus.py
src/transformers/models/pegasus/modeling_pegasus.py
+13
-14
src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
...vert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
+8
-8
src/transformers/models/prophetnet/modeling_prophetnet.py
src/transformers/models/prophetnet/modeling_prophetnet.py
+18
-17
src/transformers/models/rag/modeling_rag.py
src/transformers/models/rag/modeling_rag.py
+4
-3
src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
...s/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
+4
-3
src/transformers/models/reformer/modeling_reformer.py
src/transformers/models/reformer/modeling_reformer.py
+1
-1
src/transformers/models/retribert/modeling_retribert.py
src/transformers/models/retribert/modeling_retribert.py
+1
-1
src/transformers/models/roberta/modeling_roberta.py
src/transformers/models/roberta/modeling_roberta.py
+1
-1
src/transformers/models/speech_to_text/modeling_speech_to_text.py
...sformers/models/speech_to_text/modeling_speech_to_text.py
+13
-14
src/transformers/models/squeezebert/modeling_squeezebert.py
src/transformers/models/squeezebert/modeling_squeezebert.py
+1
-1
src/transformers/models/t5/modeling_t5.py
src/transformers/models/t5/modeling_t5.py
+4
-5
src/transformers/models/tapas/modeling_tapas.py
src/transformers/models/tapas/modeling_tapas.py
+4
-6
src/transformers/models/transfo_xl/modeling_transfo_xl.py
src/transformers/models/transfo_xl/modeling_transfo_xl.py
+4
-5
src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
...ormers/models/transfo_xl/modeling_transfo_xl_utilities.py
+11
-12
src/transformers/models/visual_bert/modeling_visual_bert.py
src/transformers/models/visual_bert/modeling_visual_bert.py
+4
-4
src/transformers/models/wav2vec2/modeling_wav2vec2.py
src/transformers/models/wav2vec2/modeling_wav2vec2.py
+11
-12
src/transformers/models/xlm/modeling_tf_xlm.py
src/transformers/models/xlm/modeling_tf_xlm.py
+1
-1
src/transformers/models/xlm/modeling_xlm.py
src/transformers/models/xlm/modeling_xlm.py
+10
-9
src/transformers/models/xlnet/modeling_xlnet.py
src/transformers/models/xlnet/modeling_xlnet.py
+4
-5
No files found.
src/transformers/models/openai/modeling_openai.py
View file @
1ed2ebf6
...
...
@@ -23,7 +23,7 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
gelu_new
,
silu
...
...
src/transformers/models/pegasus/modeling_pegasus.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ from typing import Optional, Tuple
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -239,7 +238,7 @@ class PegasusAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -259,7 +258,7 @@ class PegasusAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -321,15 +320,15 @@ class PegasusEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -417,7 +416,7 @@ class PegasusDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -437,7 +436,7 @@ class PegasusDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -447,9 +446,9 @@ class PegasusDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -629,7 +628,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
Args:
config: PegasusConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
PegasusConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -729,7 +728,7 @@ class PegasusEncoder(PegasusPreTrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -797,7 +796,7 @@ class PegasusDecoder(PegasusPreTrainedModel):
Args:
config: PegasusConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
PegasusConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -969,7 +968,7 @@ class PegasusDecoder(PegasusPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -17,7 +17,7 @@
import
argparse
import
torch
from
torch
import
nn
from
transformers
import
ProphetNetForConditionalGeneration
,
XLMProphetNetForConditionalGeneration
,
logging
...
...
@@ -107,15 +107,15 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
param
.
weight
.
shape
==
old_model
.
in_proj_weight
[:
embed_dim
,
:].
shape
,
"Shapes have to match"
param
.
bias
.
shape
==
old_model
.
in_proj_bias
[:
embed_dim
].
shape
,
"Shapes have to match"
if
attribute
==
"query_proj"
:
model
.
query_proj
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_weight
[:
embed_dim
,
:])
model
.
query_proj
.
bias
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_bias
[:
embed_dim
])
model
.
query_proj
.
weight
=
nn
.
Parameter
(
old_model
.
in_proj_weight
[:
embed_dim
,
:])
model
.
query_proj
.
bias
=
nn
.
Parameter
(
old_model
.
in_proj_bias
[:
embed_dim
])
elif
attribute
==
"key_proj"
:
model
.
key_proj
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_weight
[
embed_dim
:
2
*
embed_dim
,
:])
model
.
key_proj
.
bias
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_bias
[
embed_dim
:
2
*
embed_dim
])
model
.
key_proj
.
weight
=
nn
.
Parameter
(
old_model
.
in_proj_weight
[
embed_dim
:
2
*
embed_dim
,
:])
model
.
key_proj
.
bias
=
nn
.
Parameter
(
old_model
.
in_proj_bias
[
embed_dim
:
2
*
embed_dim
])
elif
attribute
==
"value_proj"
:
model
.
value_proj
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_weight
[
2
*
embed_dim
:,
:])
model
.
value_proj
.
bias
=
torch
.
nn
.
Parameter
(
old_model
.
in_proj_bias
[
2
*
embed_dim
:])
model
.
value_proj
.
weight
=
nn
.
Parameter
(
old_model
.
in_proj_weight
[
2
*
embed_dim
:,
:])
model
.
value_proj
.
bias
=
nn
.
Parameter
(
old_model
.
in_proj_bias
[
2
*
embed_dim
:])
is_key_init
=
True
break
elif
attribute
==
"position_embeddings"
:
...
...
@@ -123,7 +123,7 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py
model
.
position_embeddings
.
weight
.
shape
[
-
1
]
==
old_model
.
embed_positions
.
weight
.
shape
[
-
1
]
),
"Hidden size has to match"
assert
model
.
position_embeddings
.
weight
.
shape
[
0
]
==
512
,
"We want 512 position_embeddings."
model
.
position_embeddings
.
weight
=
torch
.
nn
.
Parameter
(
old_model
.
embed_positions
.
weight
[:
512
,
:])
model
.
position_embeddings
.
weight
=
nn
.
Parameter
(
old_model
.
embed_positions
.
weight
[:
512
,
:])
is_key_init
=
True
break
...
...
src/transformers/models/prophetnet/modeling_prophetnet.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
Tensor
,
nn
from
torch.nn
import
LayerNorm
...
...
@@ -183,9 +182,9 @@ PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
def
softmax
(
hidden_state
,
dim
,
onnx_trace
=
False
):
if
onnx_trace
:
return
F
.
softmax
(
hidden_state
.
float
(),
dim
=
dim
)
return
nn
.
functional
.
softmax
(
hidden_state
.
float
(),
dim
=
dim
)
else
:
return
F
.
softmax
(
hidden_state
,
dim
=
dim
,
dtype
=
torch
.
float32
)
return
nn
.
functional
.
softmax
(
hidden_state
,
dim
=
dim
,
dtype
=
torch
.
float32
)
def
ngram_attention_bias
(
sequence_length
,
ngram
,
device
,
dtype
):
...
...
@@ -732,7 +731,7 @@ class ProphetNetAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -746,7 +745,7 @@ class ProphetNetAttention(nn.Module):
# apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model
attn_weights_reshaped
=
layer_head_mask
.
view
(
1
,
-
1
,
1
,
1
)
*
attn_weights_reshaped
attn_probs
=
F
.
dropout
(
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
,
...
...
@@ -767,7 +766,7 @@ class ProphetNetAttention(nn.Module):
attn_output
=
self
.
out_proj
(
attn_output
)
attn_output
=
F
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
nn
.
functional
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
attn_output
,
attn_weights_reshaped
,
past_key_value
...
...
@@ -788,9 +787,9 @@ class ProphetNetFeedForward(nn.Module):
hidden_states
=
self
.
intermediate
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
output
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
hidden_states
...
...
@@ -924,7 +923,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
)
main_attn_probs
=
main_attn_probs
.
view
(
batch_size
*
self
.
num_attn_heads
,
-
1
,
sequence_length
)
main_attn_probs
=
F
.
dropout
(
main_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
main_attn_probs
=
nn
.
functional
.
dropout
(
main_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
# project to attn_output
main_attn_output
=
torch
.
bmm
(
main_attn_probs
,
main_value_states
)
...
...
@@ -989,7 +988,9 @@ class ProphetNetNgramSelfAttention(nn.Module):
self
.
ngram
,
batch_size
*
self
.
num_attn_heads
,
sequence_length
,
2
*
sequence_length
)
predict_attn_probs
=
F
.
dropout
(
predict_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
predict_attn_probs
=
nn
.
functional
.
dropout
(
predict_attn_probs
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
# project to attention output
# [ngram, B*head, T, c]
predict_attn_output
=
torch
.
einsum
(
"nbts,nbsc->nbtc"
,
(
predict_attn_probs
,
predict_value_states
))
...
...
@@ -1012,7 +1013,7 @@ class ProphetNetNgramSelfAttention(nn.Module):
self
.
ngram
,
batch_size
,
self
.
num_attn_heads
,
sequence_length
,
-
1
).
transpose
(
0
,
1
)
attn_output
=
F
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
nn
.
functional
.
dropout
(
attn_output
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
attn_output
,
main_attn_probs
,
predict_attn_probs
,
past_key_value
...
...
@@ -1321,7 +1322,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
hidden_states
=
inputs_embeds
+
position_embeddings
hidden_states
=
self
.
embeddings_layer_norm
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
config
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
config
.
dropout
,
training
=
self
.
training
)
encoder_hidden_states
=
()
if
output_hidden_states
else
None
all_attentions
=
()
if
output_attentions
else
None
...
...
@@ -1538,7 +1539,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
if
self
.
embeddings_layer_norm
:
hidden_states
=
self
.
embeddings_layer_norm
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# init attentions, hidden_states and cache with empty tuples
all_main_stream_hidden_states
=
()
if
output_hidden_states
else
None
...
...
@@ -1995,13 +1996,13 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
break
expend_targets
[
i
,
:,
:]
=
labels
lprobs
=
F
.
log_softmax
(
lprobs
=
nn
.
functional
.
log_softmax
(
logits
.
view
(
-
1
,
logits
.
size
(
-
1
)),
dim
=-
1
,
dtype
=
torch
.
float32
,
)
loss
=
F
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
loss
=
nn
.
functional
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
if
self
.
config
.
eps
>
0.0
:
smooth_loss
=
-
lprobs
.
sum
(
dim
=-
1
,
keepdim
=
True
)
...
...
@@ -2239,13 +2240,13 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
break
expend_targets
[
i
,
:,
:]
=
labels
lprobs
=
F
.
log_softmax
(
lprobs
=
nn
.
functional
.
log_softmax
(
logits
.
view
(
-
1
,
logits
.
size
(
-
1
)),
dim
=-
1
,
dtype
=
torch
.
float32
,
)
loss
=
F
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
loss
=
nn
.
functional
.
nll_loss
(
lprobs
,
expend_targets
.
view
(
-
1
),
reduction
=
"mean"
)
if
self
.
config
.
eps
>
0.0
:
smooth_loss
=
-
lprobs
.
sum
(
dim
=-
1
,
keepdim
=
True
)
...
...
src/transformers/models/rag/modeling_rag.py
View file @
1ed2ebf6
...
...
@@ -18,6 +18,7 @@ from dataclasses import dataclass
from
typing
import
Callable
,
List
,
Optional
,
Tuple
import
torch
from
torch
import
nn
from
...configuration_utils
import
PretrainedConfig
from
...file_utils
import
add_start_docstrings_to_model_forward
,
replace_return_docstrings
...
...
@@ -1065,10 +1066,10 @@ class RagSequenceForGeneration(RagPreTrainedModel):
return
ll
.
squeeze
(
-
1
),
smooth_obj
.
squeeze
(
-
1
)
# seq_logits dim = (batch*n_docs, tgt_len , #vocabs)
seq_logprobs
=
torch
.
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logprobs
=
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logits
.
shape
[
0
]
//
n_docs
,
n_docs
,
-
1
,
seq_logits
.
size
(
-
1
)
)
# batch_size x n_docs x tgt_len x #vocab_size
doc_logprobs
=
torch
.
nn
.
functional
.
log_softmax
(
doc_scores
,
dim
=
1
).
unsqueeze
(
-
1
).
unsqueeze
(
-
1
)
doc_logprobs
=
nn
.
functional
.
log_softmax
(
doc_scores
,
dim
=
1
).
unsqueeze
(
-
1
).
unsqueeze
(
-
1
)
# RAG-sequence marginalization
first_token_scores
=
seq_logprobs
[:,
:,
:
1
,
:]
...
...
@@ -1212,7 +1213,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
n_docs
=
n_docs
if
n_docs
is
not
None
else
self
.
config
.
n_docs
# RAG-token marginalization
seq_logprobs
=
torch
.
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logprobs
=
nn
.
functional
.
log_softmax
(
seq_logits
,
dim
=-
1
).
view
(
seq_logits
.
shape
[
0
]
//
n_docs
,
n_docs
,
-
1
,
seq_logits
.
size
(
-
1
)
)
doc_logprobs
=
torch
.
log_softmax
(
doc_scores
,
dim
=
1
)
...
...
src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -20,6 +20,7 @@ import pickle
import
numpy
as
np
import
torch
from
torch
import
nn
from
transformers
import
ReformerConfig
,
ReformerModelWithLMHead
from
transformers.utils
import
logging
...
...
@@ -31,10 +32,10 @@ logging.set_verbosity_info()
def
set_param
(
torch_layer
,
weight
,
bias
=
None
):
# set parameter of one layer
assert
torch_layer
.
weight
.
shape
==
weight
.
shape
,
f
"
{
torch_layer
}
layer.weight does not match"
torch_layer
.
weight
=
torch
.
nn
.
Parameter
(
weight
)
torch_layer
.
weight
=
nn
.
Parameter
(
weight
)
if
bias
is
not
None
:
assert
torch_layer
.
bias
.
shape
==
bias
.
shape
,
f
"
{
torch_layer
}
layer.bias does not match"
torch_layer
.
bias
=
torch
.
nn
.
Parameter
(
bias
)
torch_layer
.
bias
=
nn
.
Parameter
(
bias
)
def
set_layer_weights_in_torch_lsh
(
weights
,
torch_layer
,
hidden_size
):
...
...
@@ -153,7 +154,7 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size):
assert
(
position_embeddings
.
weights
[
emb_idx
].
shape
==
emb_weights
.
shape
),
f
"
{
position_embeddings
[
emb_idx
]
}
emb does not match"
position_embeddings
.
weights
[
emb_idx
]
=
torch
.
nn
.
Parameter
(
torch
.
tensor
(
emb_weights
))
position_embeddings
.
weights
[
emb_idx
]
=
nn
.
Parameter
(
torch
.
tensor
(
emb_weights
))
trax_layer_weights
=
weights
[
5
]
assert
len
(
torch_model_reformer
.
encoder
.
layers
)
*
4
==
len
(
...
...
src/transformers/models/reformer/modeling_reformer.py
View file @
1ed2ebf6
...
...
@@ -1782,7 +1782,7 @@ class ReformerPreTrainedModel(PreTrainedModel):
"""Initialize the weights"""
if
isinstance
(
module
,
AxialPositionEmbeddings
):
for
weight
in
module
.
weights
:
torch
.
nn
.
init
.
normal_
(
weight
,
std
=
self
.
config
.
axial_norm_std
)
nn
.
init
.
normal_
(
weight
,
std
=
self
.
config
.
axial_norm_std
)
elif
isinstance
(
module
,
nn
.
Embedding
):
module
.
weight
.
data
.
normal_
(
mean
=
0.0
,
std
=
self
.
config
.
initializer_range
)
if
module
.
padding_idx
is
not
None
:
...
...
src/transformers/models/retribert/modeling_retribert.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@ RetriBERT model
import
math
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
as
checkpoint
from
torch
import
nn
from
...file_utils
import
add_start_docstrings
from
...modeling_utils
import
PreTrainedModel
...
...
src/transformers/models/roberta/modeling_roberta.py
View file @
1ed2ebf6
...
...
@@ -18,8 +18,8 @@
import
math
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
,
gelu
...
...
src/transformers/models/speech_to_text/modeling_speech_to_text.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,6 @@ import random
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -306,7 +305,7 @@ class Speech2TextAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -326,7 +325,7 @@ class Speech2TextAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -387,15 +386,15 @@ class Speech2TextEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -482,7 +481,7 @@ class Speech2TextDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -502,7 +501,7 @@ class Speech2TextDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -512,9 +511,9 @@ class Speech2TextDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -686,7 +685,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
Args:
config: Speech2TextConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
Speech2TextConfig
):
...
...
@@ -772,7 +771,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel):
embed_pos
=
self
.
embed_positions
(
padding_mask
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -840,7 +839,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
Args:
config: Speech2TextConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
Speech2TextConfig
):
...
...
@@ -1008,7 +1007,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel):
positions
=
self
.
embed_positions
(
input_ids
,
past_key_values_length
=
past_key_values_length
)
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/squeezebert/modeling_squeezebert.py
View file @
1ed2ebf6
...
...
@@ -92,7 +92,7 @@ class SqueezeBertEmbeddings(nn.Module):
return
embeddings
class
MatMulWrapper
(
torch
.
nn
.
Module
):
class
MatMulWrapper
(
nn
.
Module
):
"""
Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
...
...
src/transformers/models/t5/modeling_t5.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ import os
import
warnings
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
from
torch.utils.checkpoint
import
checkpoint
...
...
@@ -179,7 +178,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
####################################################
# PyTorch Models are constructed by sub-classing
# - torch.nn.Module for the layers and
# - PreTrainedModel for the models (it-self a sub-class of
torch.
nn.Module)
# - PreTrainedModel for the models (it-self a sub-class of nn.Module)
####################################################
PARALLELIZE_DOCSTRING
=
r
"""
This is an experimental feature and is a subject to change at a moment's notice.
...
...
@@ -257,7 +256,7 @@ class T5DenseReluDense(nn.Module):
def
forward
(
self
,
hidden_states
):
hidden_states
=
self
.
wi
(
hidden_states
)
hidden_states
=
F
.
relu
(
hidden_states
)
hidden_states
=
nn
.
functional
.
relu
(
hidden_states
)
hidden_states
=
self
.
dropout
(
hidden_states
)
hidden_states
=
self
.
wo
(
hidden_states
)
return
hidden_states
...
...
@@ -502,10 +501,10 @@ class T5Attention(nn.Module):
position_bias
=
position_bias
+
mask
# (batch_size, n_heads, seq_length, key_length)
scores
+=
position_bias
attn_weights
=
F
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
attn_weights
=
nn
.
functional
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
scores
)
# (batch_size, n_heads, seq_length, key_length)
attn_weights
=
F
.
dropout
(
attn_weights
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# (batch_size, n_heads, seq_length, key_length)
...
...
src/transformers/models/tapas/modeling_tapas.py
View file @
1ed2ebf6
...
...
@@ -22,8 +22,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
...
...
@@ -2096,10 +2096,8 @@ def _calculate_aggregation_loss_known(
# Use aggregation supervision as the target.
target_aggregation
=
aggregation_labels
one_hot_labels
=
torch
.
nn
.
functional
.
one_hot
(
target_aggregation
,
num_classes
=
num_aggregation_labels
).
type
(
torch
.
float32
)
log_probs
=
torch
.
nn
.
functional
.
log_softmax
(
logits_aggregation
,
dim
=-
1
)
one_hot_labels
=
nn
.
functional
.
one_hot
(
target_aggregation
,
num_classes
=
num_aggregation_labels
).
type
(
torch
.
float32
)
log_probs
=
nn
.
functional
.
log_softmax
(
logits_aggregation
,
dim
=-
1
)
# torch.FloatTensor[batch_size]
per_example_aggregation_intermediate
=
-
torch
.
sum
(
one_hot_labels
*
log_probs
,
dim
=-
1
)
...
...
@@ -2243,7 +2241,7 @@ def _calculate_expected_result(
aggregation_op_only_probs
=
gumbel_dist
.
sample
()
else
:
# <float32>[batch_size, num_aggregation_labels - 1]
aggregation_op_only_probs
=
torch
.
nn
.
functional
.
softmax
(
aggregation_op_only_probs
=
nn
.
functional
.
softmax
(
logits_aggregation
[:,
1
:]
/
config
.
aggregation_temperature
,
dim
=-
1
)
...
...
src/transformers/models/transfo_xl/modeling_transfo_xl.py
View file @
1ed2ebf6
...
...
@@ -21,8 +21,7 @@ from dataclasses import dataclass
from
typing
import
List
,
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...file_utils
import
(
...
...
@@ -344,7 +343,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[:,
:,
:,
None
],
-
1e30
).
type_as
(
attn_score
)
# [qlen x klen x bsz x n_head]
attn_prob
=
F
.
softmax
(
attn_score
,
dim
=
1
)
attn_prob
=
nn
.
functional
.
softmax
(
attn_score
,
dim
=
1
)
attn_prob
=
self
.
dropatt
(
attn_prob
)
# Mask heads if we want to
...
...
@@ -434,7 +433,7 @@ class AdaptiveEmbedding(nn.Module):
if
self
.
div_val
==
1
:
embed
=
self
.
emb_layers
[
0
](
inp
)
if
self
.
d_proj
!=
self
.
d_embed
:
embed
=
F
.
linear
(
embed
,
self
.
emb_projs
[
0
])
embed
=
nn
.
functional
.
linear
(
embed
,
self
.
emb_projs
[
0
])
else
:
param
=
next
(
self
.
parameters
())
inp_flat
=
inp
.
view
(
-
1
)
...
...
@@ -450,7 +449,7 @@ class AdaptiveEmbedding(nn.Module):
inp_i
=
inp_flat
.
index_select
(
0
,
indices_i
)
-
l_idx
emb_i
=
self
.
emb_layers
[
i
](
inp_i
)
emb_i
=
F
.
linear
(
emb_i
,
self
.
emb_projs
[
i
])
emb_i
=
nn
.
functional
.
linear
(
emb_i
,
self
.
emb_projs
[
i
])
emb_flat
.
index_copy_
(
0
,
indices_i
,
emb_i
)
...
...
src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
View file @
1ed2ebf6
...
...
@@ -19,8 +19,7 @@
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch
import
nn
# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
...
...
@@ -71,11 +70,11 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
def
_compute_logit
(
self
,
hidden
,
weight
,
bias
,
proj
):
if
proj
is
None
:
logit
=
F
.
linear
(
hidden
,
weight
,
bias
=
bias
)
logit
=
nn
.
functional
.
linear
(
hidden
,
weight
,
bias
=
bias
)
else
:
# if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
proj_hid
=
F
.
linear
(
hidden
,
proj
.
t
().
contiguous
())
logit
=
F
.
linear
(
proj_hid
,
weight
,
bias
=
bias
)
proj_hid
=
nn
.
functional
.
linear
(
hidden
,
proj
.
t
().
contiguous
())
logit
=
nn
.
functional
.
linear
(
proj_hid
,
weight
,
bias
=
bias
)
# else:
# logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
# if bias is not None:
...
...
@@ -110,9 +109,9 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
if
self
.
n_clusters
==
0
:
logit
=
self
.
_compute_logit
(
hidden
,
self
.
out_layers
[
0
].
weight
,
self
.
out_layers
[
0
].
bias
,
self
.
out_projs
[
0
])
if
labels
is
not
None
:
out
=
-
F
.
log_softmax
(
logit
,
dim
=-
1
).
gather
(
1
,
labels
.
unsqueeze
(
1
)).
squeeze
(
1
)
out
=
-
nn
.
functional
.
log_softmax
(
logit
,
dim
=-
1
).
gather
(
1
,
labels
.
unsqueeze
(
1
)).
squeeze
(
1
)
else
:
out
=
F
.
log_softmax
(
logit
,
dim
=-
1
)
out
=
nn
.
functional
.
log_softmax
(
logit
,
dim
=-
1
)
else
:
# construct weights and biases
weights
,
biases
=
[],
[]
...
...
@@ -135,7 +134,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
head_weight
,
head_bias
,
head_proj
=
weights
[
0
],
biases
[
0
],
self
.
out_projs
[
0
]
head_logit
=
self
.
_compute_logit
(
hidden
,
head_weight
,
head_bias
,
head_proj
)
head_logprob
=
F
.
log_softmax
(
head_logit
,
dim
=
1
)
head_logprob
=
nn
.
functional
.
log_softmax
(
head_logit
,
dim
=
1
)
if
labels
is
None
:
out
=
hidden
.
new_empty
((
head_logit
.
size
(
0
),
self
.
n_token
))
...
...
@@ -169,7 +168,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
weight_i
,
bias_i
,
proj_i
=
weights
[
i
],
biases
[
i
],
self
.
out_projs
[
i
]
tail_logit_i
=
self
.
_compute_logit
(
hidden_i
,
weight_i
,
bias_i
,
proj_i
)
tail_logprob_i
=
F
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
tail_logprob_i
=
nn
.
functional
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
cluster_prob_idx
=
self
.
cutoffs
[
0
]
+
i
-
1
# No probability for the head cluster
if
labels
is
not
None
:
logprob_i
=
head_logprob_i
[:,
cluster_prob_idx
]
+
tail_logprob_i
.
gather
(
...
...
@@ -205,7 +204,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
"""
if
self
.
n_clusters
==
0
:
logit
=
self
.
_compute_logit
(
hidden
,
self
.
out_layers
[
0
].
weight
,
self
.
out_layers
[
0
].
bias
,
self
.
out_projs
[
0
])
return
F
.
log_softmax
(
logit
,
dim
=-
1
)
return
nn
.
functional
.
log_softmax
(
logit
,
dim
=-
1
)
else
:
# construct weights and biases
weights
,
biases
=
[],
[]
...
...
@@ -229,7 +228,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
head_logit
=
self
.
_compute_logit
(
hidden
,
head_weight
,
head_bias
,
head_proj
)
out
=
hidden
.
new_empty
((
head_logit
.
size
(
0
),
self
.
n_token
))
head_logprob
=
F
.
log_softmax
(
head_logit
,
dim
=
1
)
head_logprob
=
nn
.
functional
.
log_softmax
(
head_logit
,
dim
=
1
)
cutoff_values
=
[
0
]
+
self
.
cutoffs
for
i
in
range
(
len
(
cutoff_values
)
-
1
):
...
...
@@ -241,7 +240,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
weight_i
,
bias_i
,
proj_i
=
weights
[
i
],
biases
[
i
],
self
.
out_projs
[
i
]
tail_logit_i
=
self
.
_compute_logit
(
hidden
,
weight_i
,
bias_i
,
proj_i
)
tail_logprob_i
=
F
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
tail_logprob_i
=
nn
.
functional
.
log_softmax
(
tail_logit_i
,
dim
=
1
)
logprob_i
=
head_logprob
[:,
-
i
]
+
tail_logprob_i
out
[:,
start_idx
,
stop_idx
]
=
logprob_i
...
...
src/transformers/models/visual_bert/modeling_visual_bert.py
View file @
1ed2ebf6
...
...
@@ -89,10 +89,10 @@ class VisualBertEmbeddings(nn.Module):
self
.
visual_position_embeddings
=
nn
.
Embedding
(
config
.
max_position_embeddings
,
config
.
hidden_size
)
if
config
.
special_visual_initialize
:
self
.
visual_token_type_embeddings
.
weight
.
data
=
torch
.
nn
.
Parameter
(
self
.
visual_token_type_embeddings
.
weight
.
data
=
nn
.
Parameter
(
self
.
token_type_embeddings
.
weight
.
data
.
clone
(),
requires_grad
=
True
)
self
.
visual_position_embeddings
.
weight
.
data
=
torch
.
nn
.
Parameter
(
self
.
visual_position_embeddings
.
weight
.
data
=
nn
.
Parameter
(
self
.
position_embeddings
.
weight
.
data
.
clone
(),
requires_grad
=
True
)
...
...
@@ -1253,8 +1253,8 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
loss
=
None
if
labels
is
not
None
:
loss_fct
=
torch
.
nn
.
KLDivLoss
(
reduction
=
"batchmean"
)
log_softmax
=
torch
.
nn
.
LogSoftmax
(
dim
=-
1
)
loss_fct
=
nn
.
KLDivLoss
(
reduction
=
"batchmean"
)
log_softmax
=
nn
.
LogSoftmax
(
dim
=-
1
)
reshaped_logits
=
log_softmax
(
reshaped_logits
)
loss
=
loss_fct
(
reshaped_logits
,
labels
.
contiguous
())
if
not
return_dict
:
...
...
src/transformers/models/wav2vec2/modeling_wav2vec2.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,6 @@ from typing import Optional, Tuple, Union
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
...
...
@@ -449,7 +448,7 @@ class Wav2Vec2Attention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -469,7 +468,7 @@ class Wav2Vec2Attention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -805,9 +804,9 @@ class Wav2Vec2GumbelVectorQuantizer(nn.Module):
if
self
.
training
:
# sample code vector probs via gumbel in differentiateable way
codevector_probs
=
F
.
gumbel_softmax
(
hidden_states
.
float
(),
tau
=
self
.
temperature
,
hard
=
True
).
type_as
(
hidden_states
)
codevector_probs
=
nn
.
functional
.
gumbel_softmax
(
hidden_states
.
float
(),
tau
=
self
.
temperature
,
hard
=
True
)
.
type_as
(
hidden_states
)
# compute perplexity
codevector_soft_dist
=
torch
.
softmax
(
...
...
@@ -867,12 +866,12 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
if
hasattr
(
module
,
"weight_v"
)
and
hasattr
(
module
,
"weight_g"
):
with
deepspeed
.
zero
.
GatheredParameters
([
module
.
weight_v
,
module
.
weight_g
],
modifier_rank
=
0
):
torch
.
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
else
:
with
deepspeed
.
zero
.
GatheredParameters
(
module
.
weight
,
modifier_rank
=
0
):
torch
.
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
else
:
torch
.
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
nn
.
init
.
kaiming_normal_
(
module
.
weight
.
data
)
if
isinstance
(
module
,
(
nn
.
Linear
,
nn
.
Conv1d
))
and
module
.
bias
is
not
None
:
module
.
bias
.
data
.
zero_
()
...
...
@@ -1296,7 +1295,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
# -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa))
preds
=
logits
.
transpose
(
0
,
2
).
reshape
(
-
1
,
logits
.
size
(
0
))
target
=
((
1
-
mask_time_indices
.
long
())
*
-
100
).
transpose
(
0
,
1
).
flatten
()
contrastive_loss
=
F
.
cross_entropy
(
preds
.
float
(),
target
,
reduction
=
"sum"
)
contrastive_loss
=
nn
.
functional
.
cross_entropy
(
preds
.
float
(),
target
,
reduction
=
"sum"
)
# 7. compute diversity loss: \mathbf{L}_d
num_codevectors
=
self
.
config
.
num_codevectors_per_group
*
self
.
config
.
num_codevector_groups
...
...
@@ -1502,10 +1501,10 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
flattened_targets
=
labels
.
masked_select
(
labels_mask
)
# ctc_loss doesn't support fp16
log_probs
=
F
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
).
transpose
(
0
,
1
)
log_probs
=
nn
.
functional
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float32
).
transpose
(
0
,
1
)
with
torch
.
backends
.
cudnn
.
flags
(
enabled
=
False
):
loss
=
F
.
ctc_loss
(
loss
=
nn
.
functional
.
ctc_loss
(
log_probs
,
flattened_targets
,
input_lengths
,
...
...
src/transformers/models/xlm/modeling_tf_xlm.py
View file @
1ed2ebf6
...
...
@@ -503,7 +503,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
src/transformers/models/xlm/modeling_xlm.py
View file @
1ed2ebf6
...
...
@@ -25,7 +25,6 @@ import numpy as np
import
torch
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
gelu
from
...file_utils
import
(
...
...
@@ -190,8 +189,8 @@ class MultiHeadAttention(nn.Module):
mask
=
(
mask
==
0
).
view
(
mask_reshape
).
expand_as
(
scores
)
# (bs, n_heads, qlen, klen)
scores
.
masked_fill_
(
mask
,
-
float
(
"inf"
))
# (bs, n_heads, qlen, klen)
weights
=
F
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
scores
)
# (bs, n_heads, qlen, klen)
weights
=
F
.
dropout
(
weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# (bs, n_heads, qlen, klen)
weights
=
nn
.
functional
.
softmax
(
scores
.
float
(),
dim
=-
1
).
type_as
(
scores
)
# (bs, n_heads, qlen, klen)
weights
=
nn
.
functional
.
dropout
(
weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# (bs, n_heads, qlen, klen)
# Mask heads if we want to
if
head_mask
is
not
None
:
...
...
@@ -212,7 +211,7 @@ class TransformerFFN(nn.Module):
self
.
dropout
=
config
.
dropout
self
.
lin1
=
nn
.
Linear
(
in_dim
,
dim_hidden
)
self
.
lin2
=
nn
.
Linear
(
dim_hidden
,
out_dim
)
self
.
act
=
gelu
if
config
.
gelu_activation
else
F
.
relu
self
.
act
=
gelu
if
config
.
gelu_activation
else
nn
.
functional
.
relu
self
.
chunk_size_feed_forward
=
config
.
chunk_size_feed_forward
self
.
seq_len_dim
=
1
...
...
@@ -223,7 +222,7 @@ class TransformerFFN(nn.Module):
x
=
self
.
lin1
(
input
)
x
=
self
.
act
(
x
)
x
=
self
.
lin2
(
x
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
return
x
...
...
@@ -578,7 +577,7 @@ class XLMModel(XLMPreTrainedModel):
if
token_type_ids
is
not
None
:
tensor
=
tensor
+
self
.
embeddings
(
token_type_ids
)
tensor
=
self
.
layer_norm_emb
(
tensor
)
tensor
=
F
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
nn
.
functional
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
*=
mask
.
unsqueeze
(
-
1
).
to
(
tensor
.
dtype
)
# transformer layers
...
...
@@ -599,14 +598,14 @@ class XLMModel(XLMPreTrainedModel):
attn
=
attn_outputs
[
0
]
if
output_attentions
:
attentions
=
attentions
+
(
attn_outputs
[
1
],)
attn
=
F
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn
=
nn
.
functional
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
tensor
+
attn
tensor
=
self
.
layer_norm1
[
i
](
tensor
)
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
@@ -661,7 +660,9 @@ class XLMPredLayer(nn.Module):
scores
=
self
.
proj
(
x
)
outputs
=
(
scores
,)
+
outputs
if
y
is
not
None
:
loss
=
F
.
cross_entropy
(
scores
.
view
(
-
1
,
self
.
n_words
),
y
.
view
(
-
1
),
reduction
=
"elementwise_mean"
)
loss
=
nn
.
functional
.
cross_entropy
(
scores
.
view
(
-
1
,
self
.
n_words
),
y
.
view
(
-
1
),
reduction
=
"elementwise_mean"
)
outputs
=
(
loss
,)
+
outputs
else
:
scores
=
self
.
proj
.
log_prob
(
x
)
...
...
src/transformers/models/xlnet/modeling_xlnet.py
View file @
1ed2ebf6
...
...
@@ -23,7 +23,6 @@ from typing import List, Optional, Tuple
import
torch
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
ACT2FN
from
...file_utils
import
(
...
...
@@ -305,7 +304,7 @@ class XLNetRelativeAttention(nn.Module):
attn_score
=
attn_score
-
1e30
*
torch
.
einsum
(
"ijbn->bnij"
,
attn_mask
)
# attention probability
attn_prob
=
F
.
softmax
(
attn_score
,
dim
=
3
)
attn_prob
=
nn
.
functional
.
softmax
(
attn_score
,
dim
=
3
)
attn_prob
=
self
.
dropout
(
attn_prob
)
# Mask heads if we want to
...
...
@@ -1208,7 +1207,7 @@ class XLNetModel(XLNetPreTrainedModel):
# `1` indicates not in the same segment [qlen x klen x bsz]
seg_mat
=
(
token_type_ids
[:,
None
]
!=
cat_ids
[
None
,
:]).
long
()
seg_mat
=
F
.
one_hot
(
seg_mat
,
num_classes
=
2
).
to
(
dtype_float
)
seg_mat
=
nn
.
functional
.
one_hot
(
seg_mat
,
num_classes
=
2
).
to
(
dtype_float
)
else
:
seg_mat
=
None
...
...
@@ -2034,7 +2033,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
else
:
# during inference, compute the end logits based on beam search
bsz
,
slen
,
hsz
=
hidden_states
.
size
()
start_log_probs
=
F
.
softmax
(
start_logits
,
dim
=-
1
)
# shape (bsz, slen)
start_log_probs
=
nn
.
functional
.
softmax
(
start_logits
,
dim
=-
1
)
# shape (bsz, slen)
start_top_log_probs
,
start_top_index
=
torch
.
topk
(
start_log_probs
,
self
.
start_n_top
,
dim
=-
1
...
...
@@ -2048,7 +2047,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
)
# shape (bsz, slen, start_n_top, hsz)
p_mask
=
p_mask
.
unsqueeze
(
-
1
)
if
p_mask
is
not
None
else
None
end_logits
=
self
.
end_logits
(
hidden_states_expanded
,
start_states
=
start_states
,
p_mask
=
p_mask
)
end_log_probs
=
F
.
softmax
(
end_logits
,
dim
=
1
)
# shape (bsz, slen, start_n_top)
end_log_probs
=
nn
.
functional
.
softmax
(
end_logits
,
dim
=
1
)
# shape (bsz, slen, start_n_top)
end_top_log_probs
,
end_top_index
=
torch
.
topk
(
end_log_probs
,
self
.
end_n_top
,
dim
=
1
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment