Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
1ed2ebf6
Unverified
Commit
1ed2ebf6
authored
Jun 14, 2021
by
Stas Bekman
Committed by
GitHub
Jun 14, 2021
Browse files
[style] consistent nn. and nn.functional (#12124)
* consistent nn. and nn.functional * fix glitch * fix glitch #2
parent
ff7c8168
Changes
63
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
132 additions
and
133 deletions
+132
-133
src/transformers/models/electra/modeling_electra.py
src/transformers/models/electra/modeling_electra.py
+1
-1
src/transformers/models/flaubert/modeling_flaubert.py
src/transformers/models/flaubert/modeling_flaubert.py
+5
-5
src/transformers/models/flaubert/modeling_tf_flaubert.py
src/transformers/models/flaubert/modeling_tf_flaubert.py
+1
-1
src/transformers/models/fsmt/modeling_fsmt.py
src/transformers/models/fsmt/modeling_fsmt.py
+12
-13
src/transformers/models/funnel/modeling_funnel.py
src/transformers/models/funnel/modeling_funnel.py
+4
-5
src/transformers/models/gpt2/modeling_gpt2.py
src/transformers/models/gpt2/modeling_gpt2.py
+1
-1
src/transformers/models/gpt_neo/modeling_gpt_neo.py
src/transformers/models/gpt_neo/modeling_gpt_neo.py
+1
-2
src/transformers/models/ibert/modeling_ibert.py
src/transformers/models/ibert/modeling_ibert.py
+1
-1
src/transformers/models/ibert/quant_modules.py
src/transformers/models/ibert/quant_modules.py
+5
-6
src/transformers/models/layoutlm/modeling_layoutlm.py
src/transformers/models/layoutlm/modeling_layoutlm.py
+1
-1
src/transformers/models/led/modeling_led.py
src/transformers/models/led/modeling_led.py
+26
-23
src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
...nvert_longformer_original_pytorch_lightning_to_pytorch.py
+2
-1
src/transformers/models/longformer/modeling_longformer.py
src/transformers/models/longformer/modeling_longformer.py
+16
-13
src/transformers/models/luke/modeling_luke.py
src/transformers/models/luke/modeling_luke.py
+7
-8
src/transformers/models/m2m_100/modeling_m2m_100.py
src/transformers/models/m2m_100/modeling_m2m_100.py
+13
-14
src/transformers/models/marian/convert_marian_to_pytorch.py
src/transformers/models/marian/convert_marian_to_pytorch.py
+4
-3
src/transformers/models/marian/modeling_marian.py
src/transformers/models/marian/modeling_marian.py
+13
-14
src/transformers/models/mbart/modeling_mbart.py
src/transformers/models/mbart/modeling_mbart.py
+13
-14
src/transformers/models/mmbt/modeling_mmbt.py
src/transformers/models/mmbt/modeling_mmbt.py
+1
-1
src/transformers/models/mobilebert/modeling_mobilebert.py
src/transformers/models/mobilebert/modeling_mobilebert.py
+5
-6
No files found.
src/transformers/models/electra/modeling_electra.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
,
get_activation
...
...
src/transformers/models/flaubert/modeling_flaubert.py
View file @
1ed2ebf6
...
...
@@ -18,7 +18,7 @@
import
random
import
torch
from
torch
.nn
import
functional
as
F
from
torch
import
nn
from
...file_utils
import
add_code_sample_docstrings
,
add_start_docstrings
,
add_start_docstrings_to_model_forward
from
...modeling_outputs
import
BaseModelOutput
...
...
@@ -234,7 +234,7 @@ class FlaubertModel(XLMModel):
if
token_type_ids
is
not
None
:
tensor
=
tensor
+
self
.
embeddings
(
token_type_ids
)
tensor
=
self
.
layer_norm_emb
(
tensor
)
tensor
=
F
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
nn
.
functional
.
dropout
(
tensor
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
*=
mask
.
unsqueeze
(
-
1
).
to
(
tensor
.
dtype
)
# transformer layers
...
...
@@ -261,7 +261,7 @@ class FlaubertModel(XLMModel):
attn
=
attn_outputs
[
0
]
if
output_attentions
:
attentions
=
attentions
+
(
attn_outputs
[
1
],)
attn
=
F
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn
=
nn
.
functional
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
tensor
+
attn
tensor
=
self
.
layer_norm1
[
i
](
tensor
)
else
:
...
...
@@ -270,13 +270,13 @@ class FlaubertModel(XLMModel):
attn
=
attn_outputs
[
0
]
if
output_attentions
:
attentions
=
attentions
+
(
attn_outputs
[
1
],)
attn
=
F
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn
=
nn
.
functional
.
dropout
(
attn
,
p
=
self
.
dropout
,
training
=
self
.
training
)
tensor
=
tensor
+
attn
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
src/transformers/models/flaubert/modeling_tf_flaubert.py
View file @
1ed2ebf6
...
...
@@ -675,7 +675,7 @@ class TFFlaubertMainLayer(tf.keras.layers.Layer):
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn =
F
.dropout(attn, p=self.dropout, training=self.training)
# attn =
nn.functional
.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
...
...
src/transformers/models/fsmt/modeling_fsmt.py
View file @
1ed2ebf6
...
...
@@ -32,7 +32,6 @@ import random
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
Tensor
,
nn
from
torch.nn
import
CrossEntropyLoss
,
LayerNorm
...
...
@@ -430,15 +429,15 @@ class EncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
self_attn_layer_norm
(
x
)
residual
=
x
x
=
self
.
activation_fn
(
self
.
fc1
(
x
))
x
=
F
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
self
.
fc2
(
x
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
final_layer_norm
(
x
)
return
x
,
attn_weights
...
...
@@ -504,7 +503,7 @@ class FSMTEncoder(nn.Module):
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
*
self
.
embed_scale
embed_pos
=
self
.
embed_positions
(
input_ids
)
x
=
inputs_embeds
+
embed_pos
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# B x T x C -> T x B x C
x
=
x
.
transpose
(
0
,
1
)
...
...
@@ -600,7 +599,7 @@ class DecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
self_attn_layer_norm
(
x
)
...
...
@@ -615,16 +614,16 @@ class DecoderLayer(nn.Module):
layer_head_mask
=
cross_attn_layer_head_mask
,
output_attentions
=
output_attentions
,
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
encoder_attn_layer_norm
(
x
)
# Fully Connected
residual
=
x
x
=
self
.
activation_fn
(
self
.
fc1
(
x
))
x
=
F
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
x
=
self
.
fc2
(
x
)
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
residual
+
x
x
=
self
.
final_layer_norm
(
x
)
return
(
...
...
@@ -641,7 +640,7 @@ class FSMTDecoder(nn.Module):
Args:
config: FSMTConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
FSMTConfig
,
embed_tokens
:
nn
.
Embedding
):
...
...
@@ -726,7 +725,7 @@ class FSMTDecoder(nn.Module):
x
=
self
.
embed_tokens
(
input_ids
)
*
self
.
embed_scale
x
+=
positions
x
=
F
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
x
=
nn
.
functional
.
dropout
(
x
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# Convert to FSMT output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
x
=
x
.
transpose
(
0
,
1
)
...
...
@@ -913,7 +912,7 @@ class Attention(nn.Module):
attn_weights
=
attn_weights
.
masked_fill
(
reshaped
,
float
(
"-inf"
))
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -929,7 +928,7 @@ class Attention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
,
...
...
src/transformers/models/funnel/modeling_funnel.py
View file @
1ed2ebf6
...
...
@@ -22,7 +22,6 @@ import numpy as np
import
torch
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
ACT2FN
from
...file_utils
import
(
...
...
@@ -196,7 +195,7 @@ class FunnelAttentionStructure(nn.Module):
position_embeds
=
self
.
get_position_embeds
(
seq_len
,
inputs_embeds
.
dtype
,
inputs_embeds
.
device
)
token_type_mat
=
self
.
token_type_ids_to_mat
(
token_type_ids
)
if
token_type_ids
is
not
None
else
None
cls_mask
=
(
F
.
pad
(
inputs_embeds
.
new_ones
([
seq_len
-
1
,
seq_len
-
1
]),
(
1
,
0
,
1
,
0
))
nn
.
functional
.
pad
(
inputs_embeds
.
new_ones
([
seq_len
-
1
,
seq_len
-
1
]),
(
1
,
0
,
1
,
0
))
if
self
.
config
.
separate_cls
else
None
)
...
...
@@ -368,11 +367,11 @@ class FunnelAttentionStructure(nn.Module):
stride
=
(
stride
,
1
)
if
mode
==
"mean"
:
tensor
=
F
.
avg_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
tensor
=
nn
.
functional
.
avg_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
elif
mode
==
"max"
:
tensor
=
F
.
max_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
tensor
=
nn
.
functional
.
max_pool2d
(
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
elif
mode
==
"min"
:
tensor
=
-
F
.
max_pool2d
(
-
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
tensor
=
-
nn
.
functional
.
max_pool2d
(
-
tensor
,
stride
,
stride
=
stride
,
ceil_mode
=
True
)
else
:
raise
NotImplementedError
(
"The supported modes are 'mean', 'max' and 'min'."
)
...
...
src/transformers/models/gpt2/modeling_gpt2.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
ACT2FN
...
...
src/transformers/models/gpt_neo/modeling_gpt_neo.py
View file @
1ed2ebf6
...
...
@@ -19,7 +19,6 @@ import os
from
typing
import
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
...
...
@@ -186,7 +185,7 @@ class GPTNeoAttentionMixin:
else
:
raise
ValueError
(
f
"Input tensor rank should be one of [2, 3], but is:
{
len
(
tensor
.
shape
)
}
"
)
padded_tensor
=
F
.
pad
(
tensor
,
padding_side
,
value
=
pad_value
)
padded_tensor
=
nn
.
functional
.
pad
(
tensor
,
padding_side
,
value
=
pad_value
)
padded_tensor
=
padded_tensor
.
unfold
(
dimension
=
1
,
size
=
window_size
+
block_length
,
step
=
block_length
)
if
is_key_value
:
...
...
src/transformers/models/ibert/modeling_ibert.py
View file @
1ed2ebf6
...
...
@@ -20,8 +20,8 @@
import
math
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...activations
import
gelu
...
...
src/transformers/models/ibert/quant_modules.py
View file @
1ed2ebf6
...
...
@@ -19,8 +19,7 @@ import decimal
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.autograd
import
Function
from
...utils
import
logging
...
...
@@ -79,7 +78,7 @@ class QuantEmbedding(nn.Module):
def
forward
(
self
,
x
,
positions
=
None
,
incremental_state
=
None
):
if
not
self
.
quant_mode
:
return
(
F
.
embedding
(
nn
.
functional
.
embedding
(
x
,
self
.
weight
,
self
.
padding_idx
,
...
...
@@ -101,7 +100,7 @@ class QuantEmbedding(nn.Module):
self
.
weight
,
self
.
weight_bit
,
self
.
percentile_mode
,
self
.
weight_scaling_factor
)
emb_int
=
F
.
embedding
(
emb_int
=
nn
.
functional
.
embedding
(
x
,
self
.
weight_integer
,
self
.
padding_idx
,
...
...
@@ -264,7 +263,7 @@ class QuantLinear(nn.Module):
def
forward
(
self
,
x
,
prev_act_scaling_factor
=
None
):
if
not
self
.
quant_mode
:
return
F
.
linear
(
x
,
weight
=
self
.
weight
,
bias
=
self
.
bias
),
None
return
nn
.
functional
.
linear
(
x
,
weight
=
self
.
weight
,
bias
=
self
.
bias
),
None
# assert that prev_act_scaling_factor is a scalar tensor
assert
prev_act_scaling_factor
is
not
None
and
prev_act_scaling_factor
.
shape
==
(
1
,),
(
...
...
@@ -295,7 +294,7 @@ class QuantLinear(nn.Module):
x_int
=
x
/
prev_act_scaling_factor
return
(
F
.
linear
(
x_int
,
weight
=
self
.
weight_integer
,
bias
=
self
.
bias_integer
)
*
bias_scaling_factor
,
nn
.
functional
.
linear
(
x_int
,
weight
=
self
.
weight_integer
,
bias
=
self
.
bias_integer
)
*
bias_scaling_factor
,
bias_scaling_factor
,
)
...
...
src/transformers/models/layoutlm/modeling_layoutlm.py
View file @
1ed2ebf6
...
...
@@ -52,7 +52,7 @@ LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
]
LayoutLMLayerNorm
=
torch
.
nn
.
LayerNorm
LayoutLMLayerNorm
=
nn
.
LayerNorm
class
LayoutLMEmbeddings
(
nn
.
Module
):
...
...
src/transformers/models/led/modeling_led.py
View file @
1ed2ebf6
...
...
@@ -21,7 +21,6 @@ from dataclasses import dataclass
from
typing
import
List
,
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -250,7 +249,9 @@ class LEDEncoderSelfAttention(nn.Module):
# free memory
del
global_key_attn_scores
attn_probs
=
F
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
attn_probs
=
nn
.
functional
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -266,7 +267,7 @@ class LEDEncoderSelfAttention(nn.Module):
del
attn_scores
# apply dropout
attn_probs
=
F
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
value_vectors
=
value_vectors
.
view
(
seq_len
,
batch_size
,
self
.
num_heads
,
self
.
head_dim
).
transpose
(
0
,
1
)
...
...
@@ -326,7 +327,7 @@ class LEDEncoderSelfAttention(nn.Module):
@
staticmethod
def
_pad_and_transpose_last_two_dims
(
hidden_states_padded
,
padding
):
"""pads rows and then flips rows and columns"""
hidden_states_padded
=
F
.
pad
(
hidden_states_padded
=
nn
.
functional
.
pad
(
hidden_states_padded
,
padding
)
# padding value is not important because it will be overwritten
hidden_states_padded
=
hidden_states_padded
.
view
(
...
...
@@ -353,7 +354,7 @@ class LEDEncoderSelfAttention(nn.Module):
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
"""
total_num_heads
,
num_chunks
,
window_overlap
,
hidden_dim
=
chunked_hidden_states
.
size
()
chunked_hidden_states
=
F
.
pad
(
chunked_hidden_states
=
nn
.
functional
.
pad
(
chunked_hidden_states
,
(
0
,
window_overlap
+
1
)
)
# total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
chunked_hidden_states
=
chunked_hidden_states
.
view
(
...
...
@@ -489,7 +490,7 @@ class LEDEncoderSelfAttention(nn.Module):
value
=
value
.
transpose
(
1
,
2
).
reshape
(
batch_size
*
num_heads
,
seq_len
,
head_dim
)
# pad seq_len with w at the beginning of the sequence and another window overlap at the end
padded_value
=
F
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
padded_value
=
nn
.
functional
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
# chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap
chunked_value_size
=
(
batch_size
*
num_heads
,
chunks_count
+
1
,
3
*
window_overlap
,
head_dim
)
...
...
@@ -661,7 +662,7 @@ class LEDEncoderSelfAttention(nn.Module):
global_attn_scores
=
global_attn_scores
.
view
(
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
# compute global attn probs
global_attn_probs_float
=
F
.
softmax
(
global_attn_probs_float
=
nn
.
functional
.
softmax
(
global_attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
...
...
@@ -677,7 +678,7 @@ class LEDEncoderSelfAttention(nn.Module):
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
global_attn_probs
=
F
.
dropout
(
global_attn_probs
=
nn
.
functional
.
dropout
(
global_attn_probs_float
.
type_as
(
global_attn_scores
),
p
=
self
.
dropout
,
training
=
self
.
training
)
...
...
@@ -833,7 +834,7 @@ class LEDDecoderAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
self
.
num_heads
,
...
...
@@ -851,7 +852,7 @@ class LEDDecoderAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -914,15 +915,15 @@ class LEDEncoderLayer(nn.Module):
output_attentions
=
output_attentions
,
)
hidden_states
=
attn_outputs
[
0
]
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -1002,7 +1003,7 @@ class LEDDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
...
...
@@ -1022,7 +1023,7 @@ class LEDDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
encoder_attn_layer_norm
(
hidden_states
)
...
...
@@ -1032,9 +1033,9 @@ class LEDDecoderLayer(nn.Module):
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -1562,7 +1563,7 @@ class LEDEncoder(LEDPreTrainedModel):
Args:
config: LEDConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
LEDConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -1637,7 +1638,7 @@ class LEDEncoder(LEDPreTrainedModel):
f
"`config.attention_window`:
{
attention_window
}
"
)
if
input_ids
is
not
None
:
input_ids
=
F
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
input_ids
=
nn
.
functional
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
inputs_embeds
is
not
None
:
input_ids_padding
=
inputs_embeds
.
new_full
(
(
batch_size
,
padding_len
),
...
...
@@ -1647,7 +1648,9 @@ class LEDEncoder(LEDPreTrainedModel):
inputs_embeds_padding
=
self
.
embed_tokens
(
input_ids_padding
)
inputs_embeds
=
torch
.
cat
([
inputs_embeds
,
inputs_embeds_padding
],
dim
=-
2
)
attention_mask
=
F
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
attention_mask
=
nn
.
functional
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
return
padding_len
,
input_ids
,
attention_mask
,
inputs_embeds
...
...
@@ -1760,7 +1763,7 @@ class LEDEncoder(LEDPreTrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
encoder_states
=
()
if
output_hidden_states
else
None
all_attentions
=
()
if
output_attentions
else
None
...
...
@@ -1842,7 +1845,7 @@ class LEDDecoder(LEDPreTrainedModel):
Args:
config: LEDConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
LEDConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -2008,7 +2011,7 @@ class LEDDecoder(LEDPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -19,6 +19,7 @@ import argparse
import
pytorch_lightning
as
pl
import
torch
from
torch
import
nn
from
transformers
import
LongformerForQuestionAnswering
,
LongformerModel
...
...
@@ -28,7 +29,7 @@ class LightningModel(pl.LightningModule):
super
().
__init__
()
self
.
model
=
model
self
.
num_labels
=
2
self
.
qa_outputs
=
torch
.
nn
.
Linear
(
self
.
model
.
config
.
hidden_size
,
self
.
num_labels
)
self
.
qa_outputs
=
nn
.
Linear
(
self
.
model
.
config
.
hidden_size
,
self
.
num_labels
)
# implement only because lightning requires to do so
def
forward
(
self
):
...
...
src/transformers/models/longformer/modeling_longformer.py
View file @
1ed2ebf6
...
...
@@ -19,10 +19,9 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
from
torch.nn
import
functional
as
F
from
...activations
import
ACT2FN
,
gelu
from
...file_utils
import
(
...
...
@@ -640,7 +639,9 @@ class LongformerSelfAttention(nn.Module):
# free memory
del
global_key_attn_scores
attn_probs
=
F
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
attn_probs
=
nn
.
functional
.
softmax
(
attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
if
layer_head_mask
is
not
None
:
assert
layer_head_mask
.
size
()
==
(
...
...
@@ -656,7 +657,7 @@ class LongformerSelfAttention(nn.Module):
del
attn_scores
# apply dropout
attn_probs
=
F
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_probs
,
p
=
self
.
dropout
,
training
=
self
.
training
)
value_vectors
=
value_vectors
.
view
(
seq_len
,
batch_size
,
self
.
num_heads
,
self
.
head_dim
).
transpose
(
0
,
1
)
...
...
@@ -716,7 +717,7 @@ class LongformerSelfAttention(nn.Module):
@
staticmethod
def
_pad_and_transpose_last_two_dims
(
hidden_states_padded
,
padding
):
"""pads rows and then flips rows and columns"""
hidden_states_padded
=
F
.
pad
(
hidden_states_padded
=
nn
.
functional
.
pad
(
hidden_states_padded
,
padding
)
# padding value is not important because it will be overwritten
hidden_states_padded
=
hidden_states_padded
.
view
(
...
...
@@ -743,7 +744,7 @@ class LongformerSelfAttention(nn.Module):
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
"""
total_num_heads
,
num_chunks
,
window_overlap
,
hidden_dim
=
chunked_hidden_states
.
size
()
chunked_hidden_states
=
F
.
pad
(
chunked_hidden_states
=
nn
.
functional
.
pad
(
chunked_hidden_states
,
(
0
,
window_overlap
+
1
)
)
# total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
chunked_hidden_states
=
chunked_hidden_states
.
view
(
...
...
@@ -879,7 +880,7 @@ class LongformerSelfAttention(nn.Module):
value
=
value
.
transpose
(
1
,
2
).
reshape
(
batch_size
*
num_heads
,
seq_len
,
head_dim
)
# pad seq_len with w at the beginning of the sequence and another window overlap at the end
padded_value
=
F
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
padded_value
=
nn
.
functional
.
pad
(
value
,
(
0
,
0
,
window_overlap
,
window_overlap
),
value
=-
1
)
# chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap
chunked_value_size
=
(
batch_size
*
num_heads
,
chunks_count
+
1
,
3
*
window_overlap
,
head_dim
)
...
...
@@ -1051,7 +1052,7 @@ class LongformerSelfAttention(nn.Module):
global_attn_scores
=
global_attn_scores
.
view
(
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
# compute global attn probs
global_attn_probs_float
=
F
.
softmax
(
global_attn_probs_float
=
nn
.
functional
.
softmax
(
global_attn_scores
,
dim
=-
1
,
dtype
=
torch
.
float32
)
# use fp32 for numerical stability
...
...
@@ -1067,7 +1068,7 @@ class LongformerSelfAttention(nn.Module):
batch_size
*
self
.
num_heads
,
max_num_global_attn_indices
,
seq_len
)
global_attn_probs
=
F
.
dropout
(
global_attn_probs
=
nn
.
functional
.
dropout
(
global_attn_probs_float
.
type_as
(
global_attn_scores
),
p
=
self
.
dropout
,
training
=
self
.
training
)
...
...
@@ -1546,10 +1547,10 @@ class LongformerModel(LongformerPreTrainedModel):
f
"`config.attention_window`:
{
attention_window
}
"
)
if
input_ids
is
not
None
:
input_ids
=
F
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
input_ids
=
nn
.
functional
.
pad
(
input_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
position_ids
is
not
None
:
# pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings
position_ids
=
F
.
pad
(
position_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
position_ids
=
nn
.
functional
.
pad
(
position_ids
,
(
0
,
padding_len
),
value
=
pad_token_id
)
if
inputs_embeds
is
not
None
:
input_ids_padding
=
inputs_embeds
.
new_full
(
(
batch_size
,
padding_len
),
...
...
@@ -1559,8 +1560,10 @@ class LongformerModel(LongformerPreTrainedModel):
inputs_embeds_padding
=
self
.
embeddings
(
input_ids_padding
)
inputs_embeds
=
torch
.
cat
([
inputs_embeds
,
inputs_embeds_padding
],
dim
=-
2
)
attention_mask
=
F
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
token_type_ids
=
F
.
pad
(
token_type_ids
,
(
0
,
padding_len
),
value
=
0
)
# pad with token_type_id = 0
attention_mask
=
nn
.
functional
.
pad
(
attention_mask
,
(
0
,
padding_len
),
value
=
False
)
# no attention on the padding tokens
token_type_ids
=
nn
.
functional
.
pad
(
token_type_ids
,
(
0
,
padding_len
),
value
=
0
)
# pad with token_type_id = 0
return
padding_len
,
input_ids
,
attention_mask
,
token_type_ids
,
position_ids
,
inputs_embeds
...
...
src/transformers/models/luke/modeling_luke.py
View file @
1ed2ebf6
...
...
@@ -19,9 +19,8 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
...activations
import
ACT2FN
from
...file_utils
import
(
...
...
@@ -1098,9 +1097,9 @@ class LukeForEntityClassification(LukePreTrainedModel):
# When the number of dimension of `labels` is 1, cross entropy is used as the loss function. The binary
# cross entropy is used otherwise.
if
labels
.
ndim
==
1
:
loss
=
F
.
cross_entropy
(
logits
,
labels
)
loss
=
nn
.
functional
.
cross_entropy
(
logits
,
labels
)
else
:
loss
=
F
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
loss
=
nn
.
functional
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
if
not
return_dict
:
output
=
(
...
...
@@ -1213,9 +1212,9 @@ class LukeForEntityPairClassification(LukePreTrainedModel):
# When the number of dimension of `labels` is 1, cross entropy is used as the loss function. The binary
# cross entropy is used otherwise.
if
labels
.
ndim
==
1
:
loss
=
F
.
cross_entropy
(
logits
,
labels
)
loss
=
nn
.
functional
.
cross_entropy
(
logits
,
labels
)
else
:
loss
=
F
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
loss
=
nn
.
functional
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
if
not
return_dict
:
output
=
(
...
...
@@ -1351,9 +1350,9 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
# When the number of dimension of `labels` is 2, cross entropy is used as the loss function. The binary
# cross entropy is used otherwise.
if
labels
.
ndim
==
2
:
loss
=
F
.
cross_entropy
(
logits
.
view
(
-
1
,
self
.
num_labels
),
labels
.
view
(
-
1
))
loss
=
nn
.
functional
.
cross_entropy
(
logits
.
view
(
-
1
,
self
.
num_labels
),
labels
.
view
(
-
1
))
else
:
loss
=
F
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
loss
=
nn
.
functional
.
binary_cross_entropy_with_logits
(
logits
.
view
(
-
1
),
labels
.
view
(
-
1
).
type_as
(
logits
))
if
not
return_dict
:
output
=
(
...
...
src/transformers/models/m2m_100/modeling_m2m_100.py
View file @
1ed2ebf6
...
...
@@ -20,7 +20,6 @@ import random
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -293,7 +292,7 @@ class M2M100Attention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -313,7 +312,7 @@ class M2M100Attention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -375,15 +374,15 @@ class M2M100EncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -471,7 +470,7 @@ class M2M100DecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -491,7 +490,7 @@ class M2M100DecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -501,9 +500,9 @@ class M2M100DecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -665,7 +664,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
Args:
config: M2M100Config
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
M2M100Config
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -764,7 +763,7 @@ class M2M100Encoder(M2M100PreTrainedModel):
embed_pos
=
self
.
embed_positions
(
input_ids
,
inputs_embeds
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -832,7 +831,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
Args:
config: M2M100Config
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
M2M100Config
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -989,7 +988,7 @@ class M2M100Decoder(M2M100PreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/marian/convert_marian_to_pytorch.py
View file @
1ed2ebf6
...
...
@@ -24,6 +24,7 @@ from zipfile import ZipFile
import
numpy
as
np
import
torch
from
torch
import
nn
from
tqdm
import
tqdm
from
transformers
import
MarianConfig
,
MarianMTModel
,
MarianTokenizer
...
...
@@ -53,7 +54,7 @@ def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict):
return
sd
def
load_layers_
(
layer_lst
:
torch
.
nn
.
ModuleList
,
opus_state
:
dict
,
converter
,
is_decoder
=
False
):
def
load_layers_
(
layer_lst
:
nn
.
ModuleList
,
opus_state
:
dict
,
converter
,
is_decoder
=
False
):
for
i
,
layer
in
enumerate
(
layer_lst
):
layer_tag
=
f
"decoder_l
{
i
+
1
}
_"
if
is_decoder
else
f
"encoder_l
{
i
+
1
}
_"
sd
=
convert_encoder_layer
(
opus_state
,
layer_tag
,
converter
)
...
...
@@ -543,8 +544,8 @@ class OpusState:
load_layers_
(
model
.
model
.
decoder
.
layers
,
state_dict
,
BART_CONVERTER
,
is_decoder
=
True
)
# handle tensors not associated with layers
wemb_tensor
=
torch
.
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
wemb
))
bias_tensor
=
torch
.
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
final_bias
))
wemb_tensor
=
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
wemb
))
bias_tensor
=
nn
.
Parameter
(
torch
.
FloatTensor
(
self
.
final_bias
))
model
.
model
.
shared
.
weight
=
wemb_tensor
model
.
model
.
encoder
.
embed_tokens
=
model
.
model
.
decoder
.
embed_tokens
=
model
.
model
.
shared
...
...
src/transformers/models/marian/modeling_marian.py
View file @
1ed2ebf6
...
...
@@ -22,7 +22,6 @@ from typing import Optional, Tuple
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
...
...
@@ -239,7 +238,7 @@ class MarianAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -259,7 +258,7 @@ class MarianAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -320,15 +319,15 @@ class MarianEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -416,7 +415,7 @@ class MarianDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
self_attn_layer_norm
(
hidden_states
)
...
...
@@ -436,7 +435,7 @@ class MarianDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
encoder_attn_layer_norm
(
hidden_states
)
...
...
@@ -446,9 +445,9 @@ class MarianDecoderLayer(nn.Module):
# Fully Connected
residual
=
hidden_states
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
...
...
@@ -630,7 +629,7 @@ class MarianEncoder(MarianPreTrainedModel):
Args:
config: MarianConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MarianConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -727,7 +726,7 @@ class MarianEncoder(MarianPreTrainedModel):
embed_pos
=
self
.
embed_positions
(
input_shape
)
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -793,7 +792,7 @@ class MarianDecoder(MarianPreTrainedModel):
Args:
config: MarianConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MarianConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -963,7 +962,7 @@ class MarianDecoder(MarianPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/mbart/modeling_mbart.py
View file @
1ed2ebf6
...
...
@@ -19,7 +19,6 @@ import random
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
import
torch.utils.checkpoint
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
...
...
@@ -230,7 +229,7 @@ class MBartAttention(nn.Module):
attn_weights
=
attn_weights
.
view
(
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
)
+
attention_mask
attn_weights
=
attn_weights
.
view
(
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
)
attn_weights
=
F
.
softmax
(
attn_weights
,
dim
=-
1
)
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
)
if
layer_head_mask
is
not
None
:
if
layer_head_mask
.
size
()
!=
(
self
.
num_heads
,):
...
...
@@ -250,7 +249,7 @@ class MBartAttention(nn.Module):
else
:
attn_weights_reshaped
=
None
attn_probs
=
F
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_probs
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
bmm
(
attn_probs
,
value_states
)
...
...
@@ -311,15 +310,15 @@ class MBartEncoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
if
hidden_states
.
dtype
==
torch
.
float16
and
(
...
...
@@ -406,7 +405,7 @@ class MBartDecoderLayer(nn.Module):
layer_head_mask
=
layer_head_mask
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# Cross-Attention Block
...
...
@@ -426,7 +425,7 @@ class MBartDecoderLayer(nn.Module):
past_key_value
=
cross_attn_past_key_value
,
output_attentions
=
output_attentions
,
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
...
...
@@ -436,9 +435,9 @@ class MBartDecoderLayer(nn.Module):
residual
=
hidden_states
hidden_states
=
self
.
final_layer_norm
(
hidden_states
)
hidden_states
=
self
.
activation_fn
(
self
.
fc1
(
hidden_states
))
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
activation_dropout
,
training
=
self
.
training
)
hidden_states
=
self
.
fc2
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
residual
+
hidden_states
outputs
=
(
hidden_states
,)
...
...
@@ -658,7 +657,7 @@ class MBartEncoder(MBartPreTrainedModel):
Args:
config: MBartConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MBartConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -758,7 +757,7 @@ class MBartEncoder(MBartPreTrainedModel):
hidden_states
=
inputs_embeds
+
embed_pos
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# expand attention_mask
if
attention_mask
is
not
None
:
...
...
@@ -826,7 +825,7 @@ class MBartDecoder(MBartPreTrainedModel):
Args:
config: MBartConfig
embed_tokens (
torch.
nn.Embedding): output embedding
embed_tokens (nn.Embedding): output embedding
"""
def
__init__
(
self
,
config
:
MBartConfig
,
embed_tokens
:
Optional
[
nn
.
Embedding
]
=
None
):
...
...
@@ -999,7 +998,7 @@ class MBartDecoder(MBartPreTrainedModel):
hidden_states
=
inputs_embeds
+
positions
hidden_states
=
self
.
layernorm_embedding
(
hidden_states
)
hidden_states
=
F
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
hidden_states
=
nn
.
functional
.
dropout
(
hidden_states
,
p
=
self
.
dropout
,
training
=
self
.
training
)
# decoder layers
all_hidden_states
=
()
if
output_hidden_states
else
None
...
...
src/transformers/models/mmbt/modeling_mmbt.py
View file @
1ed2ebf6
...
...
@@ -17,7 +17,7 @@
import
torch
import
torch.nn
as
nn
from
torch
import
nn
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
...file_utils
import
add_start_docstrings
,
add_start_docstrings_to_model_forward
,
replace_return_docstrings
...
...
src/transformers/models/mobilebert/modeling_mobilebert.py
View file @
1ed2ebf6
...
...
@@ -27,7 +27,6 @@ from dataclasses import dataclass
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn
import
BCEWithLogitsLoss
,
CrossEntropyLoss
,
MSELoss
...
...
@@ -155,7 +154,7 @@ class NoNorm(nn.Module):
return
input_tensor
*
self
.
weight
+
self
.
bias
NORM2FN
=
{
"layer_norm"
:
torch
.
nn
.
LayerNorm
,
"no_norm"
:
NoNorm
}
NORM2FN
=
{
"layer_norm"
:
nn
.
LayerNorm
,
"no_norm"
:
NoNorm
}
class
MobileBertEmbeddings
(
nn
.
Module
):
...
...
@@ -207,9 +206,9 @@ class MobileBertEmbeddings(nn.Module):
# dimensional output.
inputs_embeds
=
torch
.
cat
(
[
F
.
pad
(
inputs_embeds
[:,
1
:],
[
0
,
0
,
0
,
1
,
0
,
0
],
value
=
0
),
nn
.
functional
.
pad
(
inputs_embeds
[:,
1
:],
[
0
,
0
,
0
,
1
,
0
,
0
],
value
=
0
),
inputs_embeds
,
F
.
pad
(
inputs_embeds
[:,
:
-
1
],
[
0
,
0
,
1
,
0
,
0
,
0
],
value
=
0
),
nn
.
functional
.
pad
(
inputs_embeds
[:,
:
-
1
],
[
0
,
0
,
1
,
0
,
0
,
0
],
value
=
0
),
],
dim
=
2
,
)
...
...
@@ -920,7 +919,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
def
set_output_embeddings
(
self
,
new_embeddigs
):
self
.
cls
.
predictions
.
decoder
=
new_embeddigs
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
torch
.
nn
.
Embedding
:
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
nn
.
Embedding
:
# resize dense output embedings at first
self
.
cls
.
predictions
.
dense
=
self
.
_get_resized_lm_head
(
self
.
cls
.
predictions
.
dense
,
new_num_tokens
=
new_num_tokens
,
transposed
=
True
...
...
@@ -1028,7 +1027,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
def
set_output_embeddings
(
self
,
new_embeddigs
):
self
.
cls
.
predictions
.
decoder
=
new_embeddigs
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
torch
.
nn
.
Embedding
:
def
resize_token_embeddings
(
self
,
new_num_tokens
:
Optional
[
int
]
=
None
)
->
nn
.
Embedding
:
# resize dense output embedings at first
self
.
cls
.
predictions
.
dense
=
self
.
_get_resized_lm_head
(
self
.
cls
.
predictions
.
dense
,
new_num_tokens
=
new_num_tokens
,
transposed
=
True
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment