Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
1ed2ebf6
Unverified
Commit
1ed2ebf6
authored
Jun 14, 2021
by
Stas Bekman
Committed by
GitHub
Jun 14, 2021
Browse files
[style] consistent nn. and nn.functional (#12124)
* consistent nn. and nn.functional * fix glitch * fix glitch #2
parent
ff7c8168
Changes
63
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
132 additions
and
133 deletions
+132
-133
src/transformers/models/electra/modeling_electra.py
src/transformers/models/electra/modeling_electra.py
+1
-1
src/transformers/models/flaubert/modeling_flaubert.py
src/transformers/models/flaubert/modeling_flaubert.py
+5
-5
src/transformers/models/flaubert/modeling_tf_flaubert.py
src/transformers/models/flaubert/modeling_tf_flaubert.py
+1
-1
src/transformers/models/fsmt/modeling_fsmt.py
src/transformers/models/fsmt/modeling_fsmt.py
+12
-13
src/transformers/models/funnel/modeling_funnel.py
src/transformers/models/funnel/modeling_funnel.py
+4
-5
src/transformers/models/gpt2/modeling_gpt2.py
src/transformers/models/gpt2/modeling_gpt2.py
+1
-1
src/transformers/models/gpt_neo/modeling_gpt_neo.py
src/transformers/models/gpt_neo/modeling_gpt_neo.py
+1
-2
src/transformers/models/ibert/modeling_ibert.py
src/transformers/models/ibert/modeling_ibert.py
+1
-1
src/transformers/models/ibert/quant_modules.py
src/transformers/models/ibert/quant_modules.py
+5
-6
src/transformers/models/layoutlm/modeling_layoutlm.py
src/transformers/models/layoutlm/modeling_layoutlm.py
+1
-1
src/transformers/models/led/modeling_led.py
src/transformers/models/led/modeling_led.py
+26
-23
src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
...nvert_longformer_original_pytorch_lightning_to_pytorch.py
+2
-1
src/transformers/models/longformer/modeling_longformer.py
src/transformers/models/longformer/modeling_longformer.py
+16
-13
src/transformers/models/luke/modeling_luke.py
src/transformers/models/luke/modeling_luke.py
+7
-8
src/transformers/models/m2m_100/modeling_m2m_100.py
src/transformers/models/m2m_100/modeling_m2m_100.py
+13
-14
src/transformers/models/marian/convert_marian_to_pytorch.py
src/transformers/models/marian/convert_marian_to_pytorch.py
+4
-3
src/transformers/models/marian/modeling_marian.py
src/transformers/models/marian/modeling_marian.py
+13
-14
src/transformers/models/mbart/modeling_mbart.py
src/transformers/models/mbart/modeling_mbart.py
+13
-14
src/transformers/models/mmbt/modeling_mmbt.py
src/transformers/models/mmbt/modeling_mmbt.py
+1
-1
src/transformers/models/mobilebert/modeling_mobilebert.py
src/transformers/models/mobilebert/modeling_mobilebert.py
+5
-6
No files found.
src/transformers/models/electra/modeling_electra.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
,
get_activation
...
...
src/transformers/models/flaubert/modeling_flaubert.py
View file @
1ed2ebf6
...
...
@@ -18,7 +18,7 @@
import
random
import
torch
from
torch
.nn
import
functional
as
F
from
torch
import
nn
from
...file_utils
import
add_code_sample_docstrings
,
add_start_docstrings
,
add_start_docstrings_to_model_forward
from
...modeling_outputs
import
BaseModelOutput
...
...
@@ -234,7 +234,7 @@ class FlaubertModel(XLMModel):
if
token_type_ids
is
not
None
:
tensor
=
tensor
+
self
.
embeddings
(
token_type_ids
)
tensor
=
self
.
layer_norm_emb
(
tensor
)
tensor
=
F
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
nn
.
functional
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
*=
mask
.
unsqueeze
(
-
1
).
to
(
tensor
.
dtype
)
# transformer layers
...
...
@@ -261,7 +261,7 @@ class FlaubertModel(XLMModel):
attn
=
attn_outputs
[
0
]
if
output_attentions
:
attentions
=
attentions
+
(
attn_outputs
[
1
],)
attn
=
F
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn
=
nn
.
functional
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
tensor
+
attn
tensor
=
self
.
layer_norm1
[
i
](
tensor
)
else
:
...
...
@@ -270,13 +270,13 @@ class FlaubertModel(XLMModel):
attn
=
attn_outputs
[
0
]
if
output_attentions
:
attentions
=
attentions
+
(
attn_outputs
[
1
],)
attn
=
F
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn
=
nn
.
functional
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
tensor
+
attn
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
src/transformers/models/flaubert/modeling_tf_flaubert.py
View file @
1ed2ebf6
...
...
@@ -675,7 +675,7 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer):
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
src/transformers/models/fsmt/modeling_fsmt.py
View file @
1ed2ebf6
...
...
@@ -32,7 +32,6 @@ import random
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
Tensor
,
nn
from
torch.nn
import
CrossEntropyLoss
,
LayerNorm
...
...
@@ -430,15 +429,15 @@ class EncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
self_attn_layer_norm
(
x
)
residual
=
x
x
=
self
.
activation_fn
(
self
.
fc1
(
x
))
x
=
F
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
self
.
fc2
(
x
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
final_layer_norm
(
x
)
return
x
,
attn_weights
...
...
@@ -504,7 +503,7 @@ class FSMTEncoder(nn.Module):
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
*
self
.
embed_scale
embed_pos
=
self
.
embed_positions
(
input_ids
)
x
=
inputs_embeds
+
embed_pos
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# B x T x C -> T x B x C
x
=
x
.
transpose
(
0
,
1
)
...
...
@@ -600,7 +599,7 @@ class DecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
self_attn_layer_norm
(
x
)
...
...
@@ -615,16 +614,16 @@ class DecoderLayer(nn.Module):
layer_head_mask
=
cross_attn_layer_head_mask
,
output_attentions
=
output_attentions
,
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
encoder_attn_layer_norm
(
x
)
# Fully Connected
residual
=
x
x
=
self
.
activation_fn
(
self
.
fc1
(
x
))
x
=
F
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
self
.
fc2
(
x
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
final_layer_norm
(
x
)
return
(
...
...
@@ -641,7 +640,7 @@ class FSMTDecoder(nn.Module):
Args:
config: FSMTConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
FSMTConfig
,
embed_tokens
:
nn
.
Embedding
):
...
...
@@ -726,7 +725,7 @@ class FSMTDecoder(nn.Module):
x
=
self
.
embed_tokens
(
input_ids
)
*
self
.
embed_scale
x
+=
positions
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# Convert to FSMT output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
x
=
x
.
transpose
(
0
,
1
)
...
...
@@ -913,7 +912,7 @@ class Attention(nn.Module):
attn_weights
=
attn_weights
.
masked_fill
(
reshaped
,
float
(
"-inf"
))
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -929,7 +928,7 @@ class Attention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
,
...
...
src/transformers/models/funnel/modeling_funnel.py
View file @
1ed2ebf6
...
...
@@ -22,7 +22,6 @@ import numpy as np
import
torch
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
ACT2FN
from
...file_utils
import
(
...
...
@@ -196,7 +195,7 @@ class FunnelAttentionStructure(nn.Module):
position_embeds
=
self
.
get_position_embeds
(
seq_len
,
inputs_embeds
.
dtype
,
inputs_embeds
.
device
)
token_type_mat
=
self
.
token_type_ids_to_mat
(
token_type_ids
)
if
token_type_ids
is
not
None
else
None
cls_mask
=
(
F
.
pad
(
inputs_embeds
.
new_ones
([
seq_len
-
1
,
seq_len
-
1
]),
(
1
,
0
,
1
,
0
))
nn
.
functional
.
pad
(
inputs_embeds
.
new_ones
([
seq_len
-
1
,
seq_len
-
1
]),
(
1
,
0
,
1
,
0
))
if
self
.
config
.
separate_cls
else
None
)
...
...
@@ -368,11 +367,11 @@ class FunnelAttentionStructure(nn.Module):
stride
=
(
stride
,
1
)
if
mode
==
"mean"
:
tensor
=
F
.
avg_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
tensor
=
nn
.
functional
.
avg_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
elif
mode
==
"max"
:
tensor
=
F
.
max_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
tensor
=
nn
.
functional
.
max_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
elif
mode
==
"min"
:
tensor
=
-
F
.
max_pool2d
(
-
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
tensor
=
-
nn
.
functional
.
max_pool2d
(
-
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
else
:
raise
NotImplementedError
(
"The supported modes are 'mean', 'max' and 'min'."
)
...
...
src/transformers/models/gpt2/modeling_gpt2.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
...
...
src/transformers/models/gpt_neo/modeling_gpt_neo.py
View file @
1ed2ebf6
...
...
@@ -19,7 +19,6 @@ import os
from
typing
import
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
...
...
@@ -186,7 +185,7 @@ class GPTNeoAttentionMixin:
else
:
raise
ValueError
(
f
"Input tensor rank should be one of [2, 3], but is:
{
len
(
tensor
.
shape
)
}
"
)
padded_tensor
=
F
.
pad
(
tensor
,
padding_side
,
value
=
pad_value
)
padded_tensor
=
nn
.
functional
.
pad
(
tensor
,
padding_side
,
value
=
pad_value
)
padded_tensor
=
padded_tensor
.
unfold
(
dimension
=
1
,
size
=
window_size
+
block_length
,
step
=
block_length
)
if
is_key_value
:
...
...
src/transformers/models/ibert/modeling_ibert.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@
import
math
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
gelu
...
...
src/transformers/models/ibert/quant_modules.py
View file @
1ed2ebf6
...
...
@@ -19,8 +19,7 @@ import decimal
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.autograd
import
Function
from
...utils
import
logging
...
...
@@ -79,7 +78,7 @@ class QuantEmbedding(nn.Module):
def
forward
(
self
,
x
,
positions
=
None
,
incremental_state
=
None
):
if
not
self
.
quant_mode
:
return
(
F
.
embedding
(
nn
.
functional
.
embedding
(
x
,
self
.
weight
,
self
.
padding_idx
,
...
...
@@ -101,7 +100,7 @@ class QuantEmbedding(nn.Module):
self
.
weight
,
self
.
weight_bit
,
self
.
percentile_mode
,
self
.
weight_scaling_factor
)
emb_int
=
F
.
embedding
(
emb_int
=
nn
.
functional
.
embedding
(
x
,
self
.
weight_integer
,
self
.
padding_idx
,
...
...
@@ -264,7 +263,7 @@ class QuantLinear(nn.Module):
def
forward
(
self
,
x
,
prev_act_scaling_factor
=
None
):
if
not
self
.
quant_mode
:
return
F
.
linear
(
x
,
weight
=
self
.
weight
,
bias
=
self
.
bias
),
None
return
nn
.
functional
.
linear
(
x
,
weight
=
self
.
weight
,
bias
=
self
.
bias
),
None
# assert that prev_act_scaling_factor is a scalar tensor
assert
prev_act_scaling_factor
is
not
None
and
prev_act_scaling_factor
.
shape
==
(
1
,),
(
...
...
@@ -295,7 +294,7 @@ class QuantLinear(nn.Module):
x_int
=
x
/
prev_act_scaling_factor
return
(
F
.
linear
(
x_int
,
weight
=
self
.
weight_integer
,
bias
=
self
.
bias_integer
)
*
bias_scaling_factor
,
nn
.
functional
.
linear
(
x_int
,
weight
=
self
.
weight_integer
,
bias
=
self
.
bias_integer
)
*
bias_scaling_factor
,
bias_scaling_factor
,
)
...
...
src/transformers/models/layoutlm/modeling_layoutlm.py
View file @
1ed2ebf6
...
...
@@ -52,7 +52,7 @@ LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
LayoutLMLayerNorm
=
torch
.
nn
.
LayerNorm
LayoutLMLayerNorm
=
nn
.
LayerNorm
class
LayoutLMEmbeddings
(
nn
.
Module
):
...
...
src/transformers/models/led/modeling_led.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ from dataclasses import dataclass
from
typing
import
List
,
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -250,7 +249,9 @@ class LEDEncoderSelfAttention(nn.Module):
# free memory
del
global_key_attn_scores
attn_probs
=
F
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
attn_probs
=
nn
.
functional
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -266,7 +267,7 @@ class LEDEncoderSelfAttention(nn.Module):
del
attn_scores
# apply dropout
attn_probs
=
F
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
value_vectors
=
value_vectors
.
view
(
seq_len
,
batch_size
,
self
.
num_heads
,
self
.
head_dim
).
transpose
(
0
,
1
)
...
...
@@ -326,7 +327,7 @@ class LEDEncoderSelfAttention(nn.Module):
@
staticmethod
def
_pad_and_transpose_last_two_dims
(
hidden_states_padded
,
padding
):
"""pads rows and then flips rows and columns"""
hidden_states_padded
=
F
.
pad
(
hidden_states_padded
=
nn
.
functional
.
pad
(
hidden_states_padded
,
padding
)
# padding value is not important because it will be overwritten
hidden_states_padded
=
hidden_states_padded
.
view
(
...
...
@@ -353,7 +354,7 @@ class LEDEncoderSelfAttention(nn.Module):
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
"""
total_num_heads
,
num_chunks
,
window_overlap
,
hidden_dim
=
chunked_hidden_states
.
size
()
chunked_hidden_states
=
F
.
pad
(
chunked_hidden_states
=
nn
.
functional
.
pad
(
chunked_hidden_states
,
(
0
,
window_overlap
+
1
)
)
# total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
chunked_hidden_states
=
chunked_hidden_states
.
view
(
...
...
@@ -489,7 +490,7 @@ class LEDEncoderSelfAttention(nn.Module):
value
=
value
.
transpose
(
1
,
2
).
reshape
(
batch_size
*
num_heads
,
seq_len
,
head_dim
)
# pad seq_len with w at the beginning of the sequence and another window overlap at the end
padded_value
=
F
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
padded_value
=
nn
.
functional
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
# chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap
chunked_value_size
=
(
batch_size
*
num_heads
,
chunks_count
+
1
,
3
*
window_overlap
,
head_dim
)
...
...
@@ -661,7 +662,7 @@ class LEDEncoderSelfAttention(nn.Module):
global_attn_scores
=
global_attn_scores
.
view
(
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
# compute global attn probs
global_attn_probs_float
=
F
.
softmax
(
global_attn_probs_float
=
nn
.
functional
.
softmax
(
global_attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
...
...
@@ -677,7 +678,7 @@ class LEDEncoderSelfAttention(nn.Module):
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
global_attn_probs
=
F
.
dropout
(
global_attn_probs
=
nn
.
functional
.
dropout
(
global_attn_probs_float
.
type_as
(
global_attn_scores
),
p
=
self
.
dropout
,
training
=
self
.
training
)
...
...
@@ -833,7 +834,7 @@ class LEDDecoderAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
self
.
num_heads
,
...
...
@@ -851,7 +852,7 @@ class LEDDecoderAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -914,15 +915,15 @@ class LEDEncoderLayer(nn.Module):
output_attentions
=
output_attentions
,
)
hidden_states
=
attn_outputs
[
0
]
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -1002,7 +1003,7 @@ class LEDDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
...
...
@@ -1022,7 +1023,7 @@ class LEDDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
encoder_attn_layer_norm
(
hidden_states
)
...
...
@@ -1032,9 +1033,9 @@ class LEDDecoderLayer(nn.Module):
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -1562,7 +1563,7 @@ class LEDEncoder(LEDPreTrainedModel):
Args:
config: LEDConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
LEDConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -1637,7 +1638,7 @@ class LEDEncoder(LEDPreTrainedModel):
f
"`config.attention_window`:
{
attention_window
}
"
)
if
input_ids
is
not
None
:
input_ids
=
F
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
input_ids
=
nn
.
functional
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
inputs_embeds
is
not
None
:
input_ids_padding
=
inputs_embeds
.
new_full
(
(
batch_size
,
padding_len
),
...
...
@@ -1647,7 +1648,9 @@ class LEDEncoder(LEDPreTrainedModel):
inputs_embeds_padding
=
self
.
embed_tokens
(
input_ids_padding
)
inputs_embeds
=
torch
.
cat
([
inputs_embeds
,
inputs_embeds_padding
],
dim
=-
2
)
attention_mask
=
F
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
attention_mask
=
nn
.
functional
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
return
padding_len
,
input_ids
,
attention_mask
,
inputs_embeds
...
...
@@ -1760,7 +1763,7 @@ class LEDEncoder(LEDPreTrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
encoder_states
=
()
if
output_hidden_states
else
None
all_attentions
=
()
if
output_attentions
else
None
...
...
@@ -1842,7 +1845,7 @@ class LEDDecoder(LEDPreTrainedModel):
Args:
config: LEDConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
LEDConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -2008,7 +2011,7 @@ class LEDDecoder(LEDPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -19,6 +19,7 @@ import argparse
import
pytorch_lightning
as
pl
import
torch
from
torch
import
nn
from
transformers
import
LongformerForQuestionAnswering
,
LongformerModel
...
...
@@ -28,7 +29,7 @@ class LightningModel(pl.LightningModule):
super
().
__init__
()
self
.
model
=
model
self
.
num_labels
=
2
self
.
qa_outputs
=
torch
.
nn
.
Linear
(
self
.
model
.
config
.
hidden_size
,
self
.
num_labels
)
self
.
qa_outputs
=
nn
.
Linear
(
self
.
model
.
config
.
hidden_size
,
self
.
num_labels
)
# implement only because lightning requires to do so
def
forward
(
self
):
...
...
src/transformers/models/longformer/modeling_longformer.py
View file @
1ed2ebf6
...
...
@@ -19,10 +19,9 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
ACT2FN
,
gelu
from
...file_utils
import
(
...
...
@@ -640,7 +639,9 @@ class LongformerSelfAttention(nn.Module):
# free memory
del
global_key_attn_scores
attn_probs
=
F
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
attn_probs
=
nn
.
functional
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -656,7 +657,7 @@ class LongformerSelfAttention(nn.Module):
del
attn_scores
# apply dropout
attn_probs
=
F
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
value_vectors
=
value_vectors
.
view
(
seq_len
,
batch_size
,
self
.
num_heads
,
self
.
head_dim
).
transpose
(
0
,
1
)
...
...
@@ -716,7 +717,7 @@ class LongformerSelfAttention(nn.Module):
@
staticmethod
def
_pad_and_transpose_last_two_dims
(
hidden_states_padded
,
padding
):
"""pads rows and then flips rows and columns"""
hidden_states_padded
=
F
.
pad
(
hidden_states_padded
=
nn
.
functional
.
pad
(
hidden_states_padded
,
padding
)
# padding value is not important because it will be overwritten
hidden_states_padded
=
hidden_states_padded
.
view
(
...
...
@@ -743,7 +744,7 @@ class LongformerSelfAttention(nn.Module):
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
"""
total_num_heads
,
num_chunks
,
window_overlap
,
hidden_dim
=
chunked_hidden_states
.
size
()
chunked_hidden_states
=
F
.
pad
(
chunked_hidden_states
=
nn
.
functional
.
pad
(
chunked_hidden_states
,
(
0
,
window_overlap
+
1
)
)
# total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
chunked_hidden_states
=
chunked_hidden_states
.
view
(
...
...
@@ -879,7 +880,7 @@ class LongformerSelfAttention(nn.Module):
value
=
value
.
transpose
(
1
,
2
).
reshape
(
batch_size
*
num_heads
,
seq_len
,
head_dim
)
# pad seq_len with w at the beginning of the sequence and another window overlap at the end
padded_value
=
F
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
padded_value
=
nn
.
functional
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
# chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap
chunked_value_size
=
(
batch_size
*
num_heads
,
chunks_count
+
1
,
3
*
window_overlap
,
head_dim
)
...
...
@@ -1051,7 +1052,7 @@ class LongformerSelfAttention(nn.Module):
global_attn_scores
=
global_attn_scores
.
view
(
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
# compute global attn probs
global_attn_probs_float
=
F
.
softmax
(
global_attn_probs_float
=
nn
.
functional
.
softmax
(
global_attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
...
...
@@ -1067,7 +1068,7 @@ class LongformerSelfAttention(nn.Module):
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
global_attn_probs
=
F
.
dropout
(
global_attn_probs
=
nn
.
functional
.
dropout
(
global_attn_probs_float
.
type_as
(
global_attn_scores
),
p
=
self
.
dropout
,
training
=
self
.
training
)
...
...
@@ -1546,10 +1547,10 @@ class LongformerModel(LongformerPreTrainedModel):
f
"`config.attention_window`:
{
attention_window
}
"
)
if
input_ids
is
not
None
:
input_ids
=
F
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
input_ids
=
nn
.
functional
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
position_ids
is
not
None
:
# pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings
position_ids
=
F
.
pad
(
position_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
position_ids
=
nn
.
functional
.
pad
(
position_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
inputs_embeds
is
not
None
:
input_ids_padding
=
inputs_embeds
.
new_full
(
(
batch_size
,
padding_len
),
...
...
@@ -1559,8 +1560,10 @@ class LongformerModel(LongformerPreTrainedModel):
inputs_embeds_padding
=
self
.
embeddings
(
input_ids_padding
)
inputs_embeds
=
torch
.
cat
([
inputs_embeds
,
inputs_embeds_padding
],
dim
=-
2
)
attention_mask
=
F
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
token_type_ids
=
F
.
pad
(
token_type_ids
,
(
0
,
padding_len
),
value
=
0
)
# pad with token_type_id = 0
attention_mask
=
nn
.
functional
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
token_type_ids
=
nn
.
functional
.
pad
(
token_type_ids
,
(
0
,
padding_len
),
value
=
0
)
# pad with token_type_id = 0
return
padding_len
,
input_ids
,
attention_mask
,
token_type_ids
,
position_ids
,
inputs_embeds
...
...
src/transformers/models/luke/modeling_luke.py
View file @
1ed2ebf6
...
...
@@ -19,9 +19,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
...activations
import
ACT2FN
from
...file_utils
import
(
...
...
@@ -1098,9 +1097,9 @@ class LukeForEntityClassification(LukePreTrainedModel):
# When the number of dimension of `labels` is 1, cross entropy is used as the loss function. The binary
# cross entropy is used otherwise.
if
labels
.
ndim
==
1
:
loss
=
F
.
cross_entropy
(
logits
,
labels
)
loss
=
nn
.
functional
.
cross_entropy
(
logits
,
labels
)
else
:
loss
=
F
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
loss
=
nn
.
functional
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
if
not
return_dict
:
output
=
(
...
...
@@ -1213,9 +1212,9 @@ class LukeForEntityPairClassification(LukePreTrainedModel):
# When the number of dimension of `labels` is 1, cross entropy is used as the loss function. The binary
# cross entropy is used otherwise.
if
labels
.
ndim
==
1
:
loss
=
F
.
cross_entropy
(
logits
,
labels
)
loss
=
nn
.
functional
.
cross_entropy
(
logits
,
labels
)
else
:
loss
=
F
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
loss
=
nn
.
functional
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
if
not
return_dict
:
output
=
(
...
...
@@ -1351,9 +1350,9 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
# When the number of dimension of `labels` is 2, cross entropy is used as the loss function. The binary
# cross entropy is used otherwise.
if
labels
.
ndim
==
2
:
loss
=
F
.
cross_entropy
(
logits
.
view
(
-
1
,
self
.
num_labels
),
labels
.
view
(
-
1
))
loss
=
nn
.
functional
.
cross_entropy
(
logits
.
view
(
-
1
,
self
.
num_labels
),
labels
.
view
(
-
1
))
else
:
loss
=
F
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
loss
=
nn
.
functional
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
if
not
return_dict
:
output
=
(
...
...
src/transformers/models/m2m_100/modeling_m2m_100.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,6 @@ import random
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -293,7 +292,7 @@ class M2M100Attention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -313,7 +312,7 @@ class M2M100Attention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -375,15 +374,15 @@ class M2M100EncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -471,7 +470,7 @@ class M2M100DecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -491,7 +490,7 @@ class M2M100DecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -501,9 +500,9 @@ class M2M100DecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -665,7 +664,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
Args:
config: M2M100Config
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
M2M100Config
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -764,7 +763,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
embed_pos
=
self
.
embed_positions
(
input_ids
,
inputs_embeds
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -832,7 +831,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
Args:
config: M2M100Config
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
M2M100Config
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -989,7 +988,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/marian/convert_marian_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -24,6 +24,7 @@ from zipfile import ZipFile
import
numpy
as
np
import
torch
from
torch
import
nn
from
tqdm
import
tqdm
from
transformers
import
MarianConfig
,
MarianMTModel
,
MarianTokenizer
...
...
@@ -53,7 +54,7 @@ def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict):
return
sd
def
load_layers_
(
layer_lst
:
torch
.
nn
.
ModuleList
,
opus_state
:
dict
,
converter
,
is_decoder
=
False
):
def
load_layers_
(
layer_lst
:
nn
.
ModuleList
,
opus_state
:
dict
,
converter
,
is_decoder
=
False
):
for
i
,
layer
in
enumerate
(
layer_lst
):
layer_tag
=
f
"decoder_l
{
i
+
1
}
_"
if
is_decoder
else
f
"encoder_l
{
i
+
1
}
_"
sd
=
convert_encoder_layer
(
opus_state
,
layer_tag
,
converter
)
...
...
@@ -543,8 +544,8 @@ class OpusState:
load_layers_
(
model
.
model
.
decoder
.
layers
,
state_dict
,
BART_CONVERTER
,
is_decoder
=
True
)
# handle tensors not associated with layers
wemb_tensor
=
torch
.
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
wemb
))
bias_tensor
=
torch
.
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
final_bias
))
wemb_tensor
=
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
wemb
))
bias_tensor
=
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
final_bias
))
model
.
model
.
shared
.
weight
=
wemb_tensor
model
.
model
.
encoder
.
embed_tokens
=
model
.
model
.
decoder
.
embed_tokens
=
model
.
model
.
shared
...
...
src/transformers/models/marian/modeling_marian.py
View file @
1ed2ebf6
...
...
@@ -22,7 +22,6 @@ from typing import Optional, Tuple
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -239,7 +238,7 @@ class MarianAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -259,7 +258,7 @@ class MarianAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -320,15 +319,15 @@ class MarianEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -416,7 +415,7 @@ class MarianDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
...
...
@@ -436,7 +435,7 @@ class MarianDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
encoder_attn_layer_norm
(
hidden_states
)
...
...
@@ -446,9 +445,9 @@ class MarianDecoderLayer(nn.Module):
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -630,7 +629,7 @@ class MarianEncoder(MarianPreTrainedModel):
Args:
config: MarianConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MarianConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -727,7 +726,7 @@ class MarianEncoder(MarianPreTrainedModel):
embed_pos
=
self
.
embed_positions
(
input_shape
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -793,7 +792,7 @@ class MarianDecoder(MarianPreTrainedModel):
Args:
config: MarianConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MarianConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -963,7 +962,7 @@ class MarianDecoder(MarianPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/mbart/modeling_mbart.py
View file @
1ed2ebf6
...
...
@@ -19,7 +19,6 @@ import random
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
...
...
@@ -230,7 +229,7 @@ class MBartAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -250,7 +249,7 @@ class MBartAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -311,15 +310,15 @@ class MBartEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -406,7 +405,7 @@ class MBartDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -426,7 +425,7 @@ class MBartDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -436,9 +435,9 @@ class MBartDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -658,7 +657,7 @@ class MBartEncoder(MBartPreTrainedModel):
Args:
config: MBartConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MBartConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -758,7 +757,7 @@ class MBartEncoder(MBartPreTrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -826,7 +825,7 @@ class MBartDecoder(MBartPreTrainedModel):
Args:
config: MBartConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MBartConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -999,7 +998,7 @@ class MBartDecoder(MBartPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/mmbt/modeling_mmbt.py
View file @
1ed2ebf6
...
...
@@ -17,7 +17,7 @@
import
torch
import
torch.nn
as
nn
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...file_utils
import
add_start_docstrings
,
add_start_docstrings_to_model_forward
,
replace_return_docstrings
...
...
src/transformers/models/mobilebert/modeling_mobilebert.py
View file @
1ed2ebf6
...
...
@@ -27,7 +27,6 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
...
...
@@ -155,7 +154,7 @@ class NoNorm(nn.Module):
return
input_tensor
*
self
.
weight
+
self
.
bias
NORM2FN
=
{
"layer_norm"
:
torch
.
nn
.
LayerNorm
,
"no_norm"
:
NoNorm
}
NORM2FN
=
{
"layer_norm"
:
nn
.
LayerNorm
,
"no_norm"
:
NoNorm
}
class
MobileBertEmbeddings
(
nn
.
Module
):
...
...
@@ -207,9 +206,9 @@ class MobileBertEmbeddings(nn.Module):
# dimensional output.
inputs_embeds
=
torch
.
cat
(
[
F
.
pad
(
inputs_embeds
[:,
1
:],
[
0
,
0
,
0
,
1
,
0
,
0
],
value
=
0
),
nn
.
functional
.
pad
(
inputs_embeds
[:,
1
:],
[
0
,
0
,
0
,
1
,
0
,
0
],
value
=
0
),
inputs_embeds
,
F
.
pad
(
inputs_embeds
[:,
:
-
1
],
[
0
,
0
,
1
,
0
,
0
,
0
],
value
=
0
),
nn
.
functional
.
pad
(
inputs_embeds
[:,
:
-
1
],
[
0
,
0
,
1
,
0
,
0
,
0
],
value
=
0
),
],
dim
=
2
,
)
...
...
@@ -920,7 +919,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
def
set_output_embeddings
(
self
,
new_embeddigs
):
self
.
cls
.
predictions
.
decoder
=
new_embeddigs
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
torch
.
nn
.
Embedding
:
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
nn
.
Embedding
:
# resize dense output embedings at first
self
.
cls
.
predictions
.
dense
=
self
.
_get_resized_lm_head
(
self
.
cls
.
predictions
.
dense
,
new_num_tokens
=
new_num_tokens
,
transposed
=
True
...
...
@@ -1028,7 +1027,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
def
set_output_embeddings
(
self
,
new_embeddigs
):
self
.
cls
.
predictions
.
decoder
=
new_embeddigs
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
torch
.
nn
.
Embedding
:
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
nn
.
Embedding
:
# resize dense output embedings at first
self
.
cls
.
predictions
.
dense
=
self
.
_get_resized_lm_head
(
self
.
cls
.
predictions
.
dense
,
new_num_tokens
=
new_num_tokens
,
transposed
=
True
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment