Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
c0f05c10
Commit
c0f05c10
authored
Nov 29, 2022
by
hepj
Browse files
更新transformer代码
parent
c056df78
Changes
321
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
1020 additions
and
0 deletions
+1020
-0
PyTorch/NLP/new-Transformer/fairseq/models/lightconv.py
PyTorch/NLP/new-Transformer/fairseq/models/lightconv.py
+1020
-0
No files found.
Too many changes to show.
To preserve performance only
321 of 321+
files are displayed.
Plain diff
Email patch
PyTorch/NLP/new-Transformer/fairseq/models/lightconv.py
0 → 100644
View file @
c0f05c10
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import
math
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
fairseq
import
utils
from
fairseq.models
import
(
FairseqEncoder
,
FairseqEncoderDecoderModel
,
FairseqIncrementalDecoder
,
register_model
,
register_model_architecture
,
)
from
fairseq.modules
import
(
AdaptiveSoftmax
,
DynamicConv
,
FairseqDropout
,
LayerNorm
,
LightweightConv
,
MultiheadAttention
,
PositionalEmbedding
,
)
from
fairseq.utils
import
safe_hasattr
@
register_model
(
"lightconv"
)
class
LightConvModel
(
FairseqEncoderDecoderModel
):
"""
LightConv and DynamicConv model from `"Pay Less Attention with Lightweight and Dynamic Convolutions" (Wu, et al, 2019)
<https://openreview.net/pdf?id=SkVhlh09tX>`_.
To use LightConv please set ``--encoder-conv-type lightweight --decoder-conv-type lightweight``
To use DynamicConv please set ``--encoder-conv-type dynamic --decoder-conv-type dynamic``
Args:
encoder (LightConvEncoder): the encoder
decoder (LightConvDecoder): the decoder
The LightConv model provides the following named architectures and
command-line arguments:
.. argparse::
:ref: fairseq.models.lightconv_parser
:prog:
"""
@
classmethod
def
hub_models
(
cls
):
# fmt: off
def
moses_subword
(
path
):
return
{
'path'
:
path
,
'tokenizer'
:
'moses'
,
'bpe'
:
'subword_nmt'
,
}
return
{
'lightconv.no_glu.iwslt14.de-en'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz'
),
'dynamicconv.no_glu.iwslt14.de-en'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz'
),
'lightconv.no_glu.wmt16.en-de'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz'
),
'dynamicconv.no_glu.wmt16.en-de'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz'
),
'lightconv.glu.wmt16.en-de'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'
),
'dynamicconv.glu.wmt16.en-de'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'
),
'lightconv.glu.wmt17.en-de'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'
),
'dynamicconv.glu.wmt17.en-de'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'
),
'lightconv.glu.wmt14.en-fr'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz'
),
'dynamicconv.glu.wmt14.en-fr'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz'
),
'lightconv.glu.wmt17.zh-en'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz'
),
'dynamicconv.glu.wmt17.zh-en'
:
moses_subword
(
'https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz'
),
}
# fmt: on
def
__init__
(
self
,
encoder
,
decoder
):
super
().
__init__
(
encoder
,
decoder
)
@
staticmethod
def
add_args
(
parser
):
"""Add model-specific arguments to the parser."""
parser
.
add_argument
(
"--dropout"
,
type
=
float
,
metavar
=
"D"
,
help
=
"dropout probability"
)
parser
.
add_argument
(
"--attention-dropout"
,
type
=
float
,
metavar
=
"D"
,
help
=
"dropout probability for attention weights"
,
)
parser
.
add_argument
(
"--relu-dropout"
,
type
=
float
,
metavar
=
"D"
,
help
=
"dropout probability after ReLU in FFN"
,
)
parser
.
add_argument
(
"--input-dropout"
,
type
=
float
,
metavar
=
"D"
,
help
=
"dropout probability of the inputs"
,
)
parser
.
add_argument
(
"--encoder-embed-path"
,
type
=
str
,
metavar
=
"STR"
,
help
=
"path to pre-trained encoder embedding"
,
)
parser
.
add_argument
(
"--encoder-embed-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"encoder embedding dimension"
,
)
parser
.
add_argument
(
"--encoder-conv-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"encoder embedding dimension"
,
)
parser
.
add_argument
(
"--encoder-ffn-embed-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"encoder embedding dimension for FFN"
,
)
parser
.
add_argument
(
"--encoder-layers"
,
type
=
int
,
metavar
=
"N"
,
help
=
"num encoder layers"
)
parser
.
add_argument
(
"--encoder-attention-heads"
,
type
=
int
,
metavar
=
"N"
,
help
=
"num encoder attention heads or LightConv/DynamicConv heads"
,
)
parser
.
add_argument
(
"--encoder-normalize-before"
,
action
=
"store_true"
,
help
=
"apply layernorm before each encoder block"
,
)
parser
.
add_argument
(
"--encoder-learned-pos"
,
action
=
"store_true"
,
help
=
"use learned positional embeddings in the encoder"
,
)
parser
.
add_argument
(
"--decoder-embed-path"
,
type
=
str
,
metavar
=
"STR"
,
help
=
"path to pre-trained decoder embedding"
,
)
parser
.
add_argument
(
"--decoder-embed-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"decoder embedding dimension"
,
)
parser
.
add_argument
(
"--decoder-conv-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"decoder embedding dimension"
,
)
parser
.
add_argument
(
"--decoder-ffn-embed-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"decoder embedding dimension for FFN"
,
)
parser
.
add_argument
(
"--decoder-layers"
,
type
=
int
,
metavar
=
"N"
,
help
=
"num decoder layers"
)
parser
.
add_argument
(
"--decoder-attention-heads"
,
type
=
int
,
metavar
=
"N"
,
help
=
"num decoder attention heads or LightConv/DynamicConv heads"
,
)
parser
.
add_argument
(
"--decoder-learned-pos"
,
action
=
"store_true"
,
help
=
"use learned positional embeddings in the decoder"
,
)
parser
.
add_argument
(
"--decoder-normalize-before"
,
action
=
"store_true"
,
help
=
"apply layernorm before each decoder block"
,
)
parser
.
add_argument
(
"--share-decoder-input-output-embed"
,
action
=
"store_true"
,
help
=
"share decoder input and output embeddings"
,
)
parser
.
add_argument
(
"--share-all-embeddings"
,
action
=
"store_true"
,
help
=
"share encoder, decoder and output embeddings"
" (requires shared dictionary and embed dim)"
,
)
parser
.
add_argument
(
"--adaptive-softmax-cutoff"
,
metavar
=
"EXPR"
,
help
=
"comma separated list of adaptive softmax cutoff points. "
"Must be used with adaptive_loss criterion"
,
),
parser
.
add_argument
(
"--adaptive-softmax-dropout"
,
type
=
float
,
metavar
=
"D"
,
help
=
"sets adaptive softmax dropout for the tail projections"
,
)
"""LightConv and DynamicConv arguments"""
parser
.
add_argument
(
"--encoder-kernel-size-list"
,
type
=
lambda
x
:
utils
.
eval_str_list
(
x
,
int
),
help
=
'list of kernel size (default: "[3,7,15,31,31,31,31]")'
,
)
parser
.
add_argument
(
"--decoder-kernel-size-list"
,
type
=
lambda
x
:
utils
.
eval_str_list
(
x
,
int
),
help
=
'list of kernel size (default: "[3,7,15,31,31,31]")'
,
)
parser
.
add_argument
(
"--encoder-glu"
,
type
=
utils
.
eval_bool
,
help
=
"glu after in proj"
)
parser
.
add_argument
(
"--decoder-glu"
,
type
=
utils
.
eval_bool
,
help
=
"glu after in proj"
)
parser
.
add_argument
(
"--encoder-conv-type"
,
default
=
"dynamic"
,
type
=
str
,
choices
=
[
"dynamic"
,
"lightweight"
],
help
=
"type of convolution"
,
)
parser
.
add_argument
(
"--decoder-conv-type"
,
default
=
"dynamic"
,
type
=
str
,
choices
=
[
"dynamic"
,
"lightweight"
],
help
=
"type of convolution"
,
)
parser
.
add_argument
(
"--weight-softmax"
,
default
=
True
,
type
=
utils
.
eval_bool
)
parser
.
add_argument
(
"--weight-dropout"
,
type
=
float
,
metavar
=
"D"
,
help
=
"dropout probability for conv weights"
,
)
@
classmethod
def
build_model
(
cls
,
args
,
task
):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_architecture
(
args
)
if
not
safe_hasattr
(
args
,
"max_source_positions"
):
args
.
max_source_positions
=
1024
if
not
safe_hasattr
(
args
,
"max_target_positions"
):
args
.
max_target_positions
=
1024
src_dict
,
tgt_dict
=
task
.
source_dictionary
,
task
.
target_dictionary
def
build_embedding
(
dictionary
,
embed_dim
,
path
=
None
):
num_embeddings
=
len
(
dictionary
)
padding_idx
=
dictionary
.
pad
()
emb
=
Embedding
(
num_embeddings
,
embed_dim
,
padding_idx
)
# if provided, load from preloaded dictionaries
if
path
:
embed_dict
=
utils
.
parse_embedding
(
path
)
utils
.
load_embedding
(
embed_dict
,
dictionary
,
emb
)
return
emb
if
args
.
share_all_embeddings
:
if
src_dict
!=
tgt_dict
:
raise
RuntimeError
(
"--share-all-embeddings requires a joined dictionary"
)
if
args
.
encoder_embed_dim
!=
args
.
decoder_embed_dim
:
raise
RuntimeError
(
"--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
)
if
args
.
decoder_embed_path
and
(
args
.
decoder_embed_path
!=
args
.
encoder_embed_path
):
raise
RuntimeError
(
"--share-all-embeddings not compatible with --decoder-embed-path"
)
encoder_embed_tokens
=
build_embedding
(
src_dict
,
args
.
encoder_embed_dim
,
args
.
encoder_embed_path
)
decoder_embed_tokens
=
encoder_embed_tokens
args
.
share_decoder_input_output_embed
=
True
else
:
encoder_embed_tokens
=
build_embedding
(
src_dict
,
args
.
encoder_embed_dim
,
args
.
encoder_embed_path
)
decoder_embed_tokens
=
build_embedding
(
tgt_dict
,
args
.
decoder_embed_dim
,
args
.
decoder_embed_path
)
encoder
=
LightConvEncoder
(
args
,
src_dict
,
encoder_embed_tokens
)
decoder
=
LightConvDecoder
(
args
,
tgt_dict
,
decoder_embed_tokens
)
return
LightConvModel
(
encoder
,
decoder
)
class
LightConvEncoder
(
FairseqEncoder
):
"""
LightConv encoder consisting of *args.encoder_layers* layers. Each layer
is a :class:`LightConvEncoderLayer`.
Args:
args (argparse.Namespace): parsed command-line arguments
dictionary (~fairseq.data.Dictionary): encoding dictionary
embed_tokens (torch.nn.Embedding): input embedding
"""
def
__init__
(
self
,
args
,
dictionary
,
embed_tokens
):
super
().
__init__
(
dictionary
)
self
.
dropout_module
=
FairseqDropout
(
args
.
dropout
,
module_name
=
self
.
__class__
.
__name__
)
embed_dim
=
embed_tokens
.
embedding_dim
self
.
padding_idx
=
embed_tokens
.
padding_idx
self
.
max_source_positions
=
args
.
max_source_positions
self
.
embed_tokens
=
embed_tokens
self
.
embed_scale
=
math
.
sqrt
(
embed_dim
)
self
.
embed_positions
=
(
PositionalEmbedding
(
args
.
max_source_positions
,
embed_dim
,
self
.
padding_idx
,
learned
=
args
.
encoder_learned_pos
,
)
if
not
args
.
no_token_positional_embeddings
else
None
)
self
.
layers
=
nn
.
ModuleList
([])
self
.
layers
.
extend
(
[
LightConvEncoderLayer
(
args
,
kernel_size
=
args
.
encoder_kernel_size_list
[
i
]
)
for
i
in
range
(
args
.
encoder_layers
)
]
)
self
.
register_buffer
(
"version"
,
torch
.
Tensor
([
2
]))
self
.
normalize
=
args
.
encoder_normalize_before
if
self
.
normalize
:
self
.
layer_norm
=
LayerNorm
(
embed_dim
)
def
forward
(
self
,
src_tokens
,
**
unused
):
"""
Args:
src_tokens (LongTensor): tokens in the source language of shape
`(batch, src_len)`
Returns:
dict:
- **encoder_out** (Tensor): the last encoder layer's output of
shape `(src_len, batch, embed_dim)`
- **encoder_padding_mask** (ByteTensor): the positions of
padding elements of shape `(batch, src_len)`
"""
# embed tokens and positions
x
=
self
.
embed_scale
*
self
.
embed_tokens
(
src_tokens
)
if
self
.
embed_positions
is
not
None
:
x
+=
self
.
embed_positions
(
src_tokens
)
x
=
self
.
dropout_module
(
x
)
# B x T x C -> T x B x C
x
=
x
.
transpose
(
0
,
1
)
# compute padding mask
encoder_padding_mask
=
src_tokens
.
eq
(
self
.
padding_idx
)
if
not
encoder_padding_mask
.
any
():
encoder_padding_mask
=
None
# encoder layers
for
layer
in
self
.
layers
:
x
=
layer
(
x
,
encoder_padding_mask
)
if
self
.
normalize
:
x
=
self
.
layer_norm
(
x
)
return
{
"encoder_out"
:
x
,
# T x B x C
"encoder_padding_mask"
:
encoder_padding_mask
,
# B x T
}
def
reorder_encoder_out
(
self
,
encoder_out
,
new_order
):
"""
Reorder encoder output according to *new_order*.
Args:
encoder_out: output from the ``forward()`` method
new_order (LongTensor): desired order
Returns:
*encoder_out* rearranged according to *new_order*
"""
if
encoder_out
[
"encoder_out"
]
is
not
None
:
encoder_out
[
"encoder_out"
]
=
encoder_out
[
"encoder_out"
].
index_select
(
1
,
new_order
)
if
encoder_out
[
"encoder_padding_mask"
]
is
not
None
:
encoder_out
[
"encoder_padding_mask"
]
=
encoder_out
[
"encoder_padding_mask"
].
index_select
(
0
,
new_order
)
return
encoder_out
def
max_positions
(
self
):
"""Maximum input length supported by the encoder."""
if
self
.
embed_positions
is
None
:
return
self
.
max_source_positions
return
min
(
self
.
max_source_positions
,
self
.
embed_positions
.
max_positions
)
class
LightConvDecoder
(
FairseqIncrementalDecoder
):
"""
LightConv decoder consisting of *args.decoder_layers* layers. Each layer
is a :class:`LightConvDecoderLayer`.
Args:
args (argparse.Namespace): parsed command-line arguments
dictionary (~fairseq.data.Dictionary): decoding dictionary
embed_tokens (torch.nn.Embedding): output embedding
no_encoder_attn (bool, optional): whether to attend to encoder outputs.
Default: ``False``
"""
def
__init__
(
self
,
args
,
dictionary
,
embed_tokens
,
no_encoder_attn
=
False
,
final_norm
=
True
):
super
().
__init__
(
dictionary
)
self
.
dropout_module
=
FairseqDropout
(
args
.
dropout
,
module_name
=
self
.
__class__
.
__name__
)
self
.
share_input_output_embed
=
args
.
share_decoder_input_output_embed
input_embed_dim
=
embed_tokens
.
embedding_dim
embed_dim
=
args
.
decoder_embed_dim
output_embed_dim
=
args
.
decoder_output_dim
padding_idx
=
embed_tokens
.
padding_idx
self
.
max_target_positions
=
args
.
max_target_positions
self
.
embed_tokens
=
embed_tokens
self
.
embed_scale
=
math
.
sqrt
(
embed_dim
)
# todo: try with input_embed_dim
self
.
project_in_dim
=
(
Linear
(
input_embed_dim
,
embed_dim
,
bias
=
False
)
if
embed_dim
!=
input_embed_dim
else
None
)
self
.
embed_positions
=
(
PositionalEmbedding
(
args
.
max_target_positions
,
embed_dim
,
padding_idx
,
learned
=
args
.
decoder_learned_pos
,
)
if
not
args
.
no_token_positional_embeddings
else
None
)
self
.
layers
=
nn
.
ModuleList
([])
self
.
layers
.
extend
(
[
LightConvDecoderLayer
(
args
,
no_encoder_attn
,
kernel_size
=
args
.
decoder_kernel_size_list
[
i
]
)
for
i
in
range
(
args
.
decoder_layers
)
]
)
self
.
adaptive_softmax
=
None
self
.
project_out_dim
=
(
Linear
(
embed_dim
,
output_embed_dim
,
bias
=
False
)
if
embed_dim
!=
output_embed_dim
and
not
args
.
tie_adaptive_weights
else
None
)
if
args
.
adaptive_softmax_cutoff
is
not
None
:
self
.
adaptive_softmax
=
AdaptiveSoftmax
(
len
(
dictionary
),
output_embed_dim
,
utils
.
eval_str_list
(
args
.
adaptive_softmax_cutoff
,
type
=
int
),
dropout
=
args
.
adaptive_softmax_dropout
,
adaptive_inputs
=
embed_tokens
if
args
.
tie_adaptive_weights
else
None
,
factor
=
args
.
adaptive_softmax_factor
,
tie_proj
=
args
.
tie_adaptive_proj
,
)
elif
not
self
.
share_input_output_embed
:
self
.
embed_out
=
nn
.
Parameter
(
torch
.
Tensor
(
len
(
dictionary
),
output_embed_dim
)
)
nn
.
init
.
normal_
(
self
.
embed_out
,
mean
=
0
,
std
=
output_embed_dim
**-
0.5
)
self
.
register_buffer
(
"version"
,
torch
.
Tensor
([
2
]))
self
.
normalize
=
args
.
decoder_normalize_before
and
final_norm
if
self
.
normalize
:
self
.
layer_norm
=
LayerNorm
(
embed_dim
)
def
forward
(
self
,
prev_output_tokens
,
encoder_out
=
None
,
incremental_state
=
None
,
**
kwargs
):
"""
Args:
prev_output_tokens (LongTensor): previous decoder outputs of shape
`(batch, tgt_len)`, for teacher forcing
encoder_out (Tensor, optional): output from the encoder, used for
encoder-side attention
incremental_state (dict): dictionary used for storing state during
:ref:`Incremental decoding`
Returns:
tuple:
- the last decoder layer's output of shape `(batch, tgt_len,
vocab)`
- the last decoder layer's attention weights of shape `(batch,
tgt_len, src_len)`
"""
# embed positions
positions
=
(
self
.
embed_positions
(
prev_output_tokens
,
incremental_state
=
incremental_state
,
)
if
self
.
embed_positions
is
not
None
else
None
)
if
incremental_state
is
not
None
:
prev_output_tokens
=
prev_output_tokens
[:,
-
1
:]
if
positions
is
not
None
:
positions
=
positions
[:,
-
1
:]
# embed tokens and positions
x
=
self
.
embed_scale
*
self
.
embed_tokens
(
prev_output_tokens
)
if
self
.
project_in_dim
is
not
None
:
x
=
self
.
project_in_dim
(
x
)
if
positions
is
not
None
:
x
+=
positions
x
=
self
.
dropout_module
(
x
)
# B x T x C -> T x B x C
x
=
x
.
transpose
(
0
,
1
)
attn
=
None
inner_states
=
[
x
]
# decoder layers
for
layer
in
self
.
layers
:
x
,
attn
=
layer
(
x
,
encoder_out
[
"encoder_out"
]
if
encoder_out
is
not
None
else
None
,
encoder_out
[
"encoder_padding_mask"
]
if
encoder_out
is
not
None
else
None
,
incremental_state
,
)
inner_states
.
append
(
x
)
if
self
.
normalize
:
x
=
self
.
layer_norm
(
x
)
# T x B x C -> B x T x C
x
=
x
.
transpose
(
0
,
1
)
if
self
.
project_out_dim
is
not
None
:
x
=
self
.
project_out_dim
(
x
)
if
self
.
adaptive_softmax
is
None
:
# project back to size of vocabulary
if
self
.
share_input_output_embed
:
x
=
F
.
linear
(
x
,
self
.
embed_tokens
.
weight
)
else
:
x
=
F
.
linear
(
x
,
self
.
embed_out
)
return
x
,
{
"attn"
:
attn
,
"inner_states"
:
inner_states
}
def
max_positions
(
self
):
"""Maximum output length supported by the decoder."""
if
self
.
embed_positions
is
None
:
return
self
.
max_target_positions
return
min
(
self
.
max_target_positions
,
self
.
embed_positions
.
max_positions
)
def
buffered_future_mask
(
self
,
tensor
):
dim
=
tensor
.
size
(
0
)
if
(
not
hasattr
(
self
,
"_future_mask"
)
or
self
.
_future_mask
is
None
or
self
.
_future_mask
.
device
!=
tensor
.
device
):
self
.
_future_mask
=
torch
.
triu
(
utils
.
fill_with_neg_inf
(
tensor
.
new
(
dim
,
dim
)),
1
)
if
self
.
_future_mask
.
size
(
0
)
<
dim
:
self
.
_future_mask
=
torch
.
triu
(
utils
.
fill_with_neg_inf
(
self
.
_future_mask
.
resize_
(
dim
,
dim
)),
1
)
return
self
.
_future_mask
[:
dim
,
:
dim
]
class
LightConvEncoderLayer
(
nn
.
Module
):
"""Encoder layer block.
Args:
args (argparse.Namespace): parsed command-line arguments
kernel_size: kernel size of the convolution
"""
def
__init__
(
self
,
args
,
kernel_size
=
0
):
super
().
__init__
()
self
.
embed_dim
=
args
.
encoder_embed_dim
self
.
conv_dim
=
args
.
encoder_conv_dim
padding_l
=
(
kernel_size
//
2
if
kernel_size
%
2
==
1
else
((
kernel_size
-
1
)
//
2
,
kernel_size
//
2
)
)
if
args
.
encoder_glu
:
self
.
linear1
=
Linear
(
self
.
embed_dim
,
2
*
self
.
conv_dim
)
self
.
act
=
nn
.
GLU
()
else
:
self
.
linear1
=
Linear
(
self
.
embed_dim
,
self
.
conv_dim
)
self
.
act
=
None
if
args
.
encoder_conv_type
==
"lightweight"
:
self
.
conv
=
LightweightConv
(
self
.
conv_dim
,
kernel_size
,
padding_l
=
padding_l
,
weight_softmax
=
args
.
weight_softmax
,
num_heads
=
args
.
encoder_attention_heads
,
weight_dropout
=
args
.
weight_dropout
,
)
elif
args
.
encoder_conv_type
==
"dynamic"
:
self
.
conv
=
DynamicConv
(
self
.
conv_dim
,
kernel_size
,
padding_l
=
padding_l
,
weight_softmax
=
args
.
weight_softmax
,
num_heads
=
args
.
encoder_attention_heads
,
weight_dropout
=
args
.
weight_dropout
,
)
else
:
raise
NotImplementedError
self
.
linear2
=
Linear
(
self
.
conv_dim
,
self
.
embed_dim
)
self
.
dropout_module
=
FairseqDropout
(
args
.
dropout
,
module_name
=
self
.
__class__
.
__name__
)
self
.
relu_dropout_module
=
FairseqDropout
(
args
.
relu_dropout
,
module_name
=
self
.
__class__
.
__name__
)
self
.
input_dropout_module
=
FairseqDropout
(
args
.
input_dropout
,
module_name
=
self
.
__class__
.
__name__
)
self
.
normalize_before
=
args
.
encoder_normalize_before
self
.
fc1
=
Linear
(
self
.
embed_dim
,
args
.
encoder_ffn_embed_dim
)
self
.
fc2
=
Linear
(
args
.
encoder_ffn_embed_dim
,
self
.
embed_dim
)
self
.
layer_norms
=
nn
.
ModuleList
([
LayerNorm
(
self
.
embed_dim
)
for
_
in
range
(
2
)])
def
forward
(
self
,
x
,
encoder_padding_mask
):
"""
Args:
x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_padding_mask (ByteTensor): binary ByteTensor of shape
`(batch, src_len)` where padding elements are indicated by ``1``.
Returns:
encoded output of shape `(batch, src_len, embed_dim)`
"""
residual
=
x
x
=
self
.
maybe_layer_norm
(
0
,
x
,
before
=
True
)
x
=
self
.
input_dropout_module
(
x
)
x
=
self
.
linear1
(
x
)
if
self
.
act
is
not
None
:
x
=
self
.
act
(
x
)
if
encoder_padding_mask
is
not
None
:
x
=
x
.
masked_fill
(
encoder_padding_mask
.
transpose
(
0
,
1
).
unsqueeze
(
2
),
0
)
x
=
self
.
conv
(
x
)
x
=
self
.
linear2
(
x
)
x
=
self
.
dropout_module
(
x
)
x
=
residual
+
x
x
=
self
.
maybe_layer_norm
(
0
,
x
,
after
=
True
)
residual
=
x
x
=
self
.
maybe_layer_norm
(
1
,
x
,
before
=
True
)
x
=
F
.
relu
(
self
.
fc1
(
x
))
x
=
self
.
relu_dropout_module
(
x
)
x
=
self
.
fc2
(
x
)
x
=
self
.
dropout_module
(
x
)
x
=
residual
+
x
x
=
self
.
maybe_layer_norm
(
1
,
x
,
after
=
True
)
return
x
def
maybe_layer_norm
(
self
,
i
,
x
,
before
=
False
,
after
=
False
):
assert
before
^
after
if
after
^
self
.
normalize_before
:
return
self
.
layer_norms
[
i
](
x
)
else
:
return
x
def
extra_repr
(
self
):
return
(
"dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}"
.
format
(
self
.
dropout_module
.
p
,
self
.
relu_dropout_module
.
p
,
self
.
input_dropout_module
.
p
,
self
.
normalize_before
,
)
)
class
LightConvDecoderLayer
(
nn
.
Module
):
"""Decoder layer block.
Args:
args (argparse.Namespace): parsed command-line arguments
no_encoder_attn (bool, optional): whether to attend to encoder outputs.
Default: ``False``
kernel_size: kernel size of the convolution
"""
def
__init__
(
self
,
args
,
no_encoder_attn
=
False
,
kernel_size
=
0
):
super
().
__init__
()
self
.
embed_dim
=
args
.
decoder_embed_dim
self
.
conv_dim
=
args
.
decoder_conv_dim
if
args
.
decoder_glu
:
self
.
linear1
=
Linear
(
self
.
embed_dim
,
2
*
self
.
conv_dim
)
self
.
act
=
nn
.
GLU
()
else
:
self
.
linear1
=
Linear
(
self
.
embed_dim
,
self
.
conv_dim
)
self
.
act
=
None
if
args
.
decoder_conv_type
==
"lightweight"
:
self
.
conv
=
LightweightConv
(
self
.
conv_dim
,
kernel_size
,
padding_l
=
kernel_size
-
1
,
weight_softmax
=
args
.
weight_softmax
,
num_heads
=
args
.
decoder_attention_heads
,
weight_dropout
=
args
.
weight_dropout
,
)
elif
args
.
decoder_conv_type
==
"dynamic"
:
self
.
conv
=
DynamicConv
(
self
.
conv_dim
,
kernel_size
,
padding_l
=
kernel_size
-
1
,
weight_softmax
=
args
.
weight_softmax
,
num_heads
=
args
.
decoder_attention_heads
,
weight_dropout
=
args
.
weight_dropout
,
)
else
:
raise
NotImplementedError
self
.
linear2
=
Linear
(
self
.
conv_dim
,
self
.
embed_dim
)
self
.
dropout_module
=
FairseqDropout
(
args
.
dropout
,
module_name
=
self
.
__class__
.
__name__
)
self
.
relu_dropout_module
=
FairseqDropout
(
args
.
relu_dropout
,
module_name
=
self
.
__class__
.
__name__
)
self
.
input_dropout_module
=
FairseqDropout
(
args
.
input_dropout
,
module_name
=
self
.
__class__
.
__name__
)
self
.
normalize_before
=
args
.
decoder_normalize_before
self
.
conv_layer_norm
=
LayerNorm
(
self
.
embed_dim
)
if
no_encoder_attn
:
self
.
encoder_attn
=
None
self
.
encoder_attn_layer_norm
=
None
else
:
self
.
encoder_attn
=
MultiheadAttention
(
self
.
embed_dim
,
args
.
decoder_attention_heads
,
dropout
=
args
.
attention_dropout
,
encoder_decoder_attention
=
True
,
)
self
.
encoder_attn_layer_norm
=
LayerNorm
(
self
.
embed_dim
)
self
.
fc1
=
Linear
(
self
.
embed_dim
,
args
.
decoder_ffn_embed_dim
)
self
.
fc2
=
Linear
(
args
.
decoder_ffn_embed_dim
,
self
.
embed_dim
)
self
.
final_layer_norm
=
LayerNorm
(
self
.
embed_dim
)
self
.
need_attn
=
True
def
forward
(
self
,
x
,
encoder_out
,
encoder_padding_mask
,
incremental_state
,
prev_conv_state
=
None
,
prev_attn_state
=
None
,
conv_mask
=
None
,
conv_padding_mask
=
None
,
):
"""
Args:
x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_padding_mask (ByteTensor): binary ByteTensor of shape
`(batch, src_len)` where padding elements are indicated by ``1``.
Returns:
encoded output of shape `(batch, src_len, embed_dim)`
"""
residual
=
x
x
=
self
.
maybe_layer_norm
(
self
.
conv_layer_norm
,
x
,
before
=
True
)
if
prev_conv_state
is
not
None
:
if
incremental_state
is
None
:
incremental_state
=
{}
self
.
conv
.
_set_input_buffer
(
incremental_state
,
prev_conv_state
)
x
=
self
.
input_dropout_module
(
x
)
x
=
self
.
linear1
(
x
)
if
self
.
act
is
not
None
:
x
=
self
.
act
(
x
)
x
=
self
.
conv
(
x
,
incremental_state
=
incremental_state
)
x
=
self
.
linear2
(
x
)
x
=
self
.
dropout_module
(
x
)
x
=
residual
+
x
x
=
self
.
maybe_layer_norm
(
self
.
conv_layer_norm
,
x
,
after
=
True
)
attn
=
None
if
self
.
encoder_attn
is
not
None
:
residual
=
x
x
=
self
.
maybe_layer_norm
(
self
.
encoder_attn_layer_norm
,
x
,
before
=
True
)
if
prev_attn_state
is
not
None
:
if
incremental_state
is
None
:
incremental_state
=
{}
prev_key
,
prev_value
=
prev_attn_state
saved_state
=
{
"prev_key"
:
prev_key
,
"prev_value"
:
prev_value
}
self
.
encoder_attn
.
_set_input_buffer
(
incremental_state
,
saved_state
)
x
,
attn
=
self
.
encoder_attn
(
query
=
x
,
key
=
encoder_out
,
value
=
encoder_out
,
key_padding_mask
=
encoder_padding_mask
,
incremental_state
=
incremental_state
,
static_kv
=
True
,
need_weights
=
(
not
self
.
training
and
self
.
need_attn
),
)
x
=
self
.
dropout_module
(
x
)
x
=
residual
+
x
x
=
self
.
maybe_layer_norm
(
self
.
encoder_attn_layer_norm
,
x
,
after
=
True
)
residual
=
x
x
=
self
.
maybe_layer_norm
(
self
.
final_layer_norm
,
x
,
before
=
True
)
x
=
F
.
relu
(
self
.
fc1
(
x
))
x
=
self
.
relu_dropout_module
(
x
)
x
=
self
.
fc2
(
x
)
x
=
self
.
dropout_module
(
x
)
x
=
residual
+
x
x
=
self
.
maybe_layer_norm
(
self
.
final_layer_norm
,
x
,
after
=
True
)
return
x
,
attn
def
maybe_layer_norm
(
self
,
layer_norm
,
x
,
before
=
False
,
after
=
False
):
assert
before
^
after
if
after
^
self
.
normalize_before
:
return
layer_norm
(
x
)
else
:
return
x
def
make_generation_fast_
(
self
,
need_attn
=
False
,
**
kwargs
):
self
.
need_attn
=
need_attn
def
extra_repr
(
self
):
return
(
"dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}"
.
format
(
self
.
dropout_module
.
p
,
self
.
relu_dropout_module
.
p
,
self
.
input_dropout_module
.
p
,
self
.
normalize_before
,
)
)
def
Embedding
(
num_embeddings
,
embedding_dim
,
padding_idx
):
m
=
nn
.
Embedding
(
num_embeddings
,
embedding_dim
,
padding_idx
=
padding_idx
)
nn
.
init
.
normal_
(
m
.
weight
,
mean
=
0
,
std
=
embedding_dim
**-
0.5
)
nn
.
init
.
constant_
(
m
.
weight
[
padding_idx
],
0
)
return
m
def
Linear
(
in_features
,
out_features
,
bias
=
True
):
m
=
nn
.
Linear
(
in_features
,
out_features
,
bias
)
nn
.
init
.
xavier_uniform_
(
m
.
weight
)
if
bias
:
nn
.
init
.
constant_
(
m
.
bias
,
0.0
)
return
m
@
register_model_architecture
(
"lightconv"
,
"lightconv"
)
def
base_architecture
(
args
):
args
.
encoder_embed_path
=
getattr
(
args
,
"encoder_embed_path"
,
None
)
args
.
encoder_embed_dim
=
getattr
(
args
,
"encoder_embed_dim"
,
512
)
args
.
encoder_ffn_embed_dim
=
getattr
(
args
,
"encoder_ffn_embed_dim"
,
2048
)
args
.
encoder_layers
=
getattr
(
args
,
"encoder_layers"
,
7
)
args
.
encoder_attention_heads
=
getattr
(
args
,
"encoder_attention_heads"
,
8
)
args
.
encoder_normalize_before
=
getattr
(
args
,
"encoder_normalize_before"
,
False
)
args
.
encoder_learned_pos
=
getattr
(
args
,
"encoder_learned_pos"
,
False
)
args
.
decoder_embed_path
=
getattr
(
args
,
"decoder_embed_path"
,
None
)
args
.
decoder_embed_dim
=
getattr
(
args
,
"decoder_embed_dim"
,
args
.
encoder_embed_dim
)
args
.
decoder_ffn_embed_dim
=
getattr
(
args
,
"decoder_ffn_embed_dim"
,
args
.
encoder_ffn_embed_dim
)
args
.
decoder_layers
=
getattr
(
args
,
"decoder_layers"
,
6
)
args
.
decoder_attention_heads
=
getattr
(
args
,
"decoder_attention_heads"
,
8
)
args
.
decoder_normalize_before
=
getattr
(
args
,
"decoder_normalize_before"
,
False
)
args
.
decoder_learned_pos
=
getattr
(
args
,
"decoder_learned_pos"
,
False
)
args
.
attention_dropout
=
getattr
(
args
,
"attention_dropout"
,
0.0
)
args
.
relu_dropout
=
getattr
(
args
,
"relu_dropout"
,
0.0
)
args
.
dropout
=
getattr
(
args
,
"dropout"
,
0.1
)
args
.
adaptive_softmax_cutoff
=
getattr
(
args
,
"adaptive_softmax_cutoff"
,
None
)
args
.
adaptive_softmax_dropout
=
getattr
(
args
,
"adaptive_softmax_dropout"
,
0
)
args
.
share_decoder_input_output_embed
=
getattr
(
args
,
"share_decoder_input_output_embed"
,
False
)
args
.
share_all_embeddings
=
getattr
(
args
,
"share_all_embeddings"
,
False
)
args
.
no_token_positional_embeddings
=
getattr
(
args
,
"no_token_positional_embeddings"
,
False
)
args
.
decoder_output_dim
=
getattr
(
args
,
"decoder_output_dim"
,
args
.
decoder_embed_dim
)
args
.
decoder_input_dim
=
getattr
(
args
,
"decoder_input_dim"
,
args
.
decoder_embed_dim
)
args
.
encoder_conv_dim
=
getattr
(
args
,
"encoder_conv_dim"
,
args
.
encoder_embed_dim
)
args
.
decoder_conv_dim
=
getattr
(
args
,
"decoder_conv_dim"
,
args
.
decoder_embed_dim
)
args
.
encoder_kernel_size_list
=
getattr
(
args
,
"encoder_kernel_size_list"
,
[
3
,
7
,
15
,
31
,
31
,
31
,
31
]
)
args
.
decoder_kernel_size_list
=
getattr
(
args
,
"decoder_kernel_size_list"
,
[
3
,
7
,
15
,
31
,
31
,
31
]
)
if
len
(
args
.
encoder_kernel_size_list
)
==
1
:
args
.
encoder_kernel_size_list
=
(
args
.
encoder_kernel_size_list
*
args
.
encoder_layers
)
if
len
(
args
.
decoder_kernel_size_list
)
==
1
:
args
.
decoder_kernel_size_list
=
(
args
.
decoder_kernel_size_list
*
args
.
decoder_layers
)
assert
(
len
(
args
.
encoder_kernel_size_list
)
==
args
.
encoder_layers
),
"encoder_kernel_size_list doesn't match encoder_layers"
assert
(
len
(
args
.
decoder_kernel_size_list
)
==
args
.
decoder_layers
),
"decoder_kernel_size_list doesn't match decoder_layers"
args
.
encoder_glu
=
getattr
(
args
,
"encoder_glu"
,
True
)
args
.
decoder_glu
=
getattr
(
args
,
"decoder_glu"
,
True
)
args
.
input_dropout
=
getattr
(
args
,
"input_dropout"
,
0.1
)
args
.
weight_dropout
=
getattr
(
args
,
"weight_dropout"
,
args
.
attention_dropout
)
@
register_model_architecture
(
"lightconv"
,
"lightconv_iwslt_de_en"
)
def
lightconv_iwslt_de_en
(
args
):
args
.
encoder_embed_dim
=
getattr
(
args
,
"encoder_embed_dim"
,
512
)
args
.
encoder_ffn_embed_dim
=
getattr
(
args
,
"encoder_ffn_embed_dim"
,
1024
)
args
.
encoder_attention_heads
=
getattr
(
args
,
"encoder_attention_heads"
,
4
)
args
.
encoder_layers
=
getattr
(
args
,
"encoder_layers"
,
7
)
args
.
decoder_embed_dim
=
getattr
(
args
,
"decoder_embed_dim"
,
512
)
args
.
decoder_ffn_embed_dim
=
getattr
(
args
,
"decoder_ffn_embed_dim"
,
1024
)
args
.
decoder_attention_heads
=
getattr
(
args
,
"decoder_attention_heads"
,
4
)
args
.
decoder_layers
=
getattr
(
args
,
"decoder_layers"
,
6
)
args
.
attention_dropout
=
getattr
(
args
,
"attention_dropout"
,
0.1
)
args
.
weight_dropout
=
getattr
(
args
,
"weight_dropout"
,
0.1
)
args
.
encoder_glu
=
getattr
(
args
,
"encoder_glu"
,
False
)
args
.
decoder_glu
=
getattr
(
args
,
"decoder_glu"
,
False
)
args
.
input_dropout
=
getattr
(
args
,
"input_dropout"
,
0.0
)
base_architecture
(
args
)
@
register_model_architecture
(
"lightconv"
,
"lightconv_wmt_en_de"
)
def
lightconv_wmt_en_de
(
args
):
base_architecture
(
args
)
@
register_model_architecture
(
"lightconv"
,
"lightconv_wmt_en_de_big"
)
def
lightconv_wmt_en_de_big
(
args
):
args
.
attention_dropout
=
getattr
(
args
,
"attention_dropout"
,
0.1
)
args
.
encoder_embed_dim
=
getattr
(
args
,
"encoder_embed_dim"
,
1024
)
args
.
encoder_ffn_embed_dim
=
getattr
(
args
,
"encoder_ffn_embed_dim"
,
4096
)
args
.
encoder_attention_heads
=
getattr
(
args
,
"encoder_attention_heads"
,
16
)
args
.
encoder_normalize_before
=
getattr
(
args
,
"encoder_normalize_before"
,
False
)
args
.
decoder_embed_dim
=
getattr
(
args
,
"decoder_embed_dim"
,
1024
)
args
.
decoder_ffn_embed_dim
=
getattr
(
args
,
"decoder_ffn_embed_dim"
,
4096
)
args
.
decoder_attention_heads
=
getattr
(
args
,
"decoder_attention_heads"
,
16
)
args
.
dropout
=
getattr
(
args
,
"dropout"
,
0.3
)
base_architecture
(
args
)
@
register_model_architecture
(
"lightconv"
,
"lightconv_wmt_en_fr_big"
)
def
lightconv_wmt_en_fr_big
(
args
):
args
.
dropout
=
getattr
(
args
,
"dropout"
,
0.1
)
lightconv_wmt_en_de_big
(
args
)
@
register_model_architecture
(
"lightconv"
,
"lightconv_wmt_zh_en_big"
)
def
lightconv_wmt_zh_en_big
(
args
):
args
.
dropout
=
getattr
(
args
,
"dropout"
,
0.2
)
args
.
attention_dropout
=
getattr
(
args
,
"attention_dropout"
,
0.2
)
args
.
weight_dropout
=
getattr
(
args
,
"weight_dropout"
,
0.2
)
lightconv_wmt_en_de_big
(
args
)
Prev
1
…
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment