Commit ac5fddfc authored by Mehdi Drissi, committed by Myle Ott

Fix up model defaults (#211)

parent f26b6aff
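The change applies one convention throughout: argparse options registered in each model's add_args() no longer carry defaults, and every default instead lives in the @register_model_architecture functions, applied with getattr so it only takes effect when the user did not set the flag. A minimal sketch of the pattern is below (illustrative only, not the fairseq code itself; argument_default=argparse.SUPPRESS stands in here for however fairseq keeps unset flags out of the parsed namespace):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group('Model-specific configuration',
                                  argument_default=argparse.SUPPRESS)  # unset flags stay absent
group.add_argument('--dropout', type=float, metavar='D',
                   help='dropout probability')
group.add_argument('--decoder-embed-dim', type=int, metavar='N',
                   help='decoder embedding dimension')


def base_architecture(args):
    # Each default applies only when the flag was absent from the command line.
    args.dropout = getattr(args, 'dropout', 0.1)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)


args = parser.parse_args(['--dropout', '0.3'])
base_architecture(args)
print(args.dropout, args.decoder_embed_dim)  # -> 0.3 512

With argparse defaults removed, the architecture function becomes the single source of truth for every value the user did not specify.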
@@ -31,17 +31,17 @@ class FConvModel(FairseqModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
                             help='encoder layers [(dim, kernel_size), ...]')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
                             help='decoder layers [(dim, kernel_size), ...]')
@@ -49,7 +49,7 @@ class FConvModel(FairseqModel):
                             help='decoder output embedding dimension')
         parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                             help='decoder attention [True, ...]')
-        parser.add_argument('--normalization-constant', type=float, default=0.5, metavar='D',
+        parser.add_argument('--normalization-constant', type=float, metavar='D',
                             help='multiplies the result of the residual block by sqrt(value)')
         parser.add_argument('--share-input-output-embed', action='store_true',
                             help='share input and output embeddings (requires'
@@ -104,7 +104,7 @@ class FConvLanguageModel(FairseqLanguageModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
@@ -117,7 +117,7 @@ class FConvLanguageModel(FairseqLanguageModel):
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                             help='decoder attention [True, ...]')
-        parser.add_argument('--normalization-constant', type=float, default=0.5, metavar='D',
+        parser.add_argument('--normalization-constant', type=float, metavar='D',
                             help='multiplies the result of the residual block by sqrt(value)')

     @classmethod
@@ -611,6 +611,7 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):

 @register_model_architecture('fconv_lm', 'fconv_lm')
 def base_lm_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
     args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
     args.decoder_attention = getattr(args, 'decoder_attention', 'False')
@@ -650,6 +651,7 @@ def fconv_lm_dauphin_gbw(args):

 @register_model_architecture('fconv', 'fconv')
 def base_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
     args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 20')
@@ -41,7 +41,7 @@ class FConvModelSelfAtt(FairseqModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
@@ -55,25 +55,25 @@ class FConvModelSelfAtt(FairseqModel):
                             help='decoder output embedding dimension')
         parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                             help='decoder attention [True, ...]')
-        parser.add_argument('--self-attention', default='False', type=str, metavar='EXPR',
+        parser.add_argument('--self-attention', type=str, metavar='EXPR',
                             help='decoder self-attention layers, ex: [True] + [False]*5')
-        parser.add_argument('--multihead-attention-nheads', default=1, type=int,
+        parser.add_argument('--multihead-attention-nheads', type=int,
                             help='Number of heads to use in attention')
-        parser.add_argument('--multihead-self-attention-nheads', default=1, type=int,
+        parser.add_argument('--multihead-self-attention-nheads', type=int,
                             help='Number of heads to use in self-attention')
-        parser.add_argument('--encoder-attention', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--encoder-attention', type=str, metavar='EXPR',
                             help='encoder attention [True, ...]')
-        parser.add_argument('--encoder-attention-nheads', default=1, type=int,
+        parser.add_argument('--encoder-attention-nheads', type=int,
                             help='Number of heads to use in encoder attention')
-        parser.add_argument('--project-input', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--project-input', type=str, metavar='EXPR',
                             help='Use projections in self-attention [True, ...]')
-        parser.add_argument('--gated-attention', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--gated-attention', type=str, metavar='EXPR',
                             help='Use GLU layers in self-attention projections [True, ...]')
-        parser.add_argument('--downsample', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--downsample', type=str, metavar='EXPR',
                             help='Use downsampling in self-attention [True, ...]')
-        parser.add_argument('--pretrained-checkpoint', metavar='DIR', default='',
+        parser.add_argument('--pretrained-checkpoint', metavar='DIR',
                             help='path to load checkpoint from pretrained model')
-        parser.add_argument('--pretrained', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--pretrained', type=str, metavar='EXPR',
                             help='use pretrained model when training [True, ...]')

     @classmethod
@@ -499,22 +499,34 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):

 @register_model_architecture('fconv_self_att', 'fconv_self_att')
 def base_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
     args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 3')
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
     args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 8')
     args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
     args.decoder_attention = getattr(args, 'decoder_attention', 'True')
+    args.self_attention = getattr(args, 'self_attention', 'False')
+    args.encoder_attention = getattr(args, 'encoder_attention', 'False')
+    args.multihead_attention_nheads = getattr(args, 'multihead_attention_nheads', 1)
+    args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 1)
+    args.encoder_attention_nheads = getattr(args, 'encoder_attention_nheads', 1)
+    args.project_input = getattr(args, 'project_input', 'False')
+    args.gated_attention = getattr(args, 'gated_attention', 'False')
+    args.downsample = getattr(args, 'downsample', 'False')
+    args.pretrained_checkpoint = getattr(args, 'pretrained_checkpoint', '')
+    args.pretrained = getattr(args, 'pretrained', 'False')


 @register_model_architecture('fconv_self_att', 'fconv_self_att_wp')
 def fconv_self_att_wp(args):
-    base_architecture(args)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
     args.encoder_layers = getattr(args, 'encoder_layers', '[(128, 3)] * 2 + [(512,3)] * 1')
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
     args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1')
     args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
-    args.multihead_attention_nheads = getattr(args, 'multihead_attention_nheads', 1)
-    args.encoder_attention_nheads = getattr(args, 'encoder_attention_nheads', 1)
+    args.self_attention = getattr(args, 'self_attention', 'True')
     args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 4)
+    args.project_input = getattr(args, 'project_input', 'True')
+    args.gated_attention = getattr(args, 'gated_attention', 'True')
+    args.downsample = getattr(args, 'downsample', 'True')
+    base_architecture(args)
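A note on the hunk above: once base_architecture() supplies a getattr default for every option, a derived architecture such as fconv_self_att_wp has to apply its own overrides first and call base_architecture(args) last; if the base defaults ran first, the attributes would already be set and the derived getattr calls would never take effect. A small sketch of that ordering (hypothetical names, not the fairseq code itself):

from argparse import Namespace


def base_architecture(args):
    # Shared defaults, applied only to attributes that are still unset.
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
    args.self_attention = getattr(args, 'self_attention', 'False')


def derived_architecture(args):
    # Architecture-specific overrides first ...
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
    args.self_attention = getattr(args, 'self_attention', 'True')
    # ... then the base defaults, which now only fill in what is missing.
    base_architecture(args)


args = Namespace()
derived_architecture(args)
print(args.encoder_embed_dim, args.self_attention)  # -> 256 True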
@@ -25,11 +25,11 @@ class LSTMModel(FairseqModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
                             help='encoder hidden size')
@@ -39,7 +39,7 @@ class LSTMModel(FairseqModel):
                             help='make all layers of encoder bidirectional')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
                             help='decoder hidden size')
@@ -415,6 +415,7 @@ def Linear(in_features, out_features, bias=True, dropout=0):

 @register_model_architecture('lstm', 'lstm')
 def base_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
     args.encoder_hidden_size = getattr(args, 'encoder_hidden_size', args.encoder_embed_dim)
@@ -434,6 +435,7 @@ def base_architecture(args):

 @register_model_architecture('lstm', 'lstm_wiseman_iwslt_de_en')
 def lstm_wiseman_iwslt_de_en(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
     args.encoder_dropout_in = getattr(args, 'encoder_dropout_in', 0)
     args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', 0)
@@ -48,9 +48,9 @@ class TransformerModel(FairseqModel):
                             help='num encoder layers')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                             help='num encoder attention heads')
-        parser.add_argument('--encoder-normalize-before', default=False, action='store_true',
+        parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
-        parser.add_argument('--encoder-learned-pos', default=False, action='store_true',
+        parser.add_argument('--encoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the encoder')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
@@ -62,13 +62,13 @@ class TransformerModel(FairseqModel):
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
-        parser.add_argument('--decoder-learned-pos', default=False, action='store_true',
+        parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
-        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
+        parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
-        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
+        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
-        parser.add_argument('--share-all-embeddings', default=False, action='store_true',
+        parser.add_argument('--share-all-embeddings', action='store_true',
                             help='share encoder, decoder and output embeddings'
                                  ' (requires shared dictionary and embed dim)')
@@ -422,14 +422,20 @@ def base_architecture(args):
     args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 2048)
     args.encoder_layers = getattr(args, 'encoder_layers', 6)
     args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 8)
+    args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
+    args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', False)
     args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
     args.decoder_layers = getattr(args, 'decoder_layers', 6)
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
+    args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False)
+    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
     args.attention_dropout = getattr(args, 'attention_dropout', 0.)
     args.relu_dropout = getattr(args, 'relu_dropout', 0.)
     args.dropout = getattr(args, 'dropout', 0.1)
+    args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
+    args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)


 @register_model_architecture('transformer', 'transformer_iwslt_de_en')