Commit ac5fddfc authored by Mehdi Drissi, committed by Myle Ott

Fix up model defaults (#211)

parent f26b6aff
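
The same fix is applied to every model touched below: hard-coded `default=` values are removed from `add_args()` and supplied instead inside the registered architecture functions via `getattr`, so an architecture preset can still pick its own value whenever the user does not pass the flag explicitly. A minimal, self-contained sketch of the pattern; the `--dropout` flag and the 0.1 fallback come straight from the diff, while the bare `ArgumentParser(argument_default=argparse.SUPPRESS)` is only a stand-in for fairseq's own model-argument plumbing:

import argparse

# Stand-in parser: options the user does not pass are left out of the namespace
# entirely (instead of defaulting to None), which is what the getattr pattern
# below relies on.
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

# As in the diff: no `default=` on the flag itself.
parser.add_argument('--dropout', type=float, metavar='D',
                    help='dropout probability')

def base_architecture(args):
    # Architecture-level default, applied only if --dropout was not given.
    args.dropout = getattr(args, 'dropout', 0.1)

args = parser.parse_args([])                      # nothing on the command line
base_architecture(args)
print(args.dropout)                               # 0.1, the architecture default

args = parser.parse_args(['--dropout', '0.3'])    # explicit user value
base_architecture(args)
print(args.dropout)                               # 0.3, not clobbered

With a parser-level `default=0.1`, `args.dropout` would always be set after parsing, so the `getattr` fallback in an architecture function could never take effect; that is the behaviour this commit removes.
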
@@ -31,17 +31,17 @@ class FConvModel(FairseqModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
                             help='encoder layers [(dim, kernel_size), ...]')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
                             help='decoder layers [(dim, kernel_size), ...]')
@@ -49,7 +49,7 @@ class FConvModel(FairseqModel):
                             help='decoder output embedding dimension')
         parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                             help='decoder attention [True, ...]')
-        parser.add_argument('--normalization-constant', type=float, default=0.5, metavar='D',
+        parser.add_argument('--normalization-constant', type=float, metavar='D',
                             help='multiplies the result of the residual block by sqrt(value)')
         parser.add_argument('--share-input-output-embed', action='store_true',
                             help='share input and output embeddings (requires'
@@ -104,7 +104,7 @@ class FConvLanguageModel(FairseqLanguageModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
@@ -117,7 +117,7 @@ class FConvLanguageModel(FairseqLanguageModel):
                                  'Must be used with adaptive_loss criterion')
         parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                             help='decoder attention [True, ...]')
-        parser.add_argument('--normalization-constant', type=float, default=0.5, metavar='D',
+        parser.add_argument('--normalization-constant', type=float, metavar='D',
                             help='multiplies the result of the residual block by sqrt(value)')

     @classmethod
@@ -611,6 +611,7 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
 @register_model_architecture('fconv_lm', 'fconv_lm')
 def base_lm_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
     args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
     args.decoder_attention = getattr(args, 'decoder_attention', 'False')
@@ -650,6 +651,7 @@ def fconv_lm_dauphin_gbw(args):
 @register_model_architecture('fconv', 'fconv')
 def base_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
     args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 20')
......
@@ -41,7 +41,7 @@ class FConvModelSelfAtt(FairseqModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
@@ -55,25 +55,25 @@ class FConvModelSelfAtt(FairseqModel):
                             help='decoder output embedding dimension')
         parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                             help='decoder attention [True, ...]')
-        parser.add_argument('--self-attention', default='False', type=str, metavar='EXPR',
+        parser.add_argument('--self-attention', type=str, metavar='EXPR',
                             help='decoder self-attention layers, ex: [True] + [False]*5')
-        parser.add_argument('--multihead-attention-nheads', default=1, type=int,
+        parser.add_argument('--multihead-attention-nheads', type=int,
                             help='Number of heads to use in attention')
-        parser.add_argument('--multihead-self-attention-nheads', default=1, type=int,
+        parser.add_argument('--multihead-self-attention-nheads', type=int,
                             help='Number of heads to use in self-attention')
-        parser.add_argument('--encoder-attention', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--encoder-attention', type=str, metavar='EXPR',
                             help='encoder attention [True, ...]')
-        parser.add_argument('--encoder-attention-nheads', default=1, type=int,
+        parser.add_argument('--encoder-attention-nheads', type=int,
                             help='Number of heads to use in encoder attention')
-        parser.add_argument('--project-input', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--project-input', type=str, metavar='EXPR',
                             help='Use projections in self-attention [True, ...]')
-        parser.add_argument('--gated-attention', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--gated-attention', type=str, metavar='EXPR',
                             help='Use GLU layers in self-attention projections [True, ...]')
-        parser.add_argument('--downsample', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--downsample', type=str, metavar='EXPR',
                             help='Use downsampling in self-attention [True, ...]')
-        parser.add_argument('--pretrained-checkpoint', metavar='DIR', default='',
+        parser.add_argument('--pretrained-checkpoint', metavar='DIR',
                             help='path to load checkpoint from pretrained model')
-        parser.add_argument('--pretrained', type=str, metavar='EXPR', default='False',
+        parser.add_argument('--pretrained', type=str, metavar='EXPR',
                             help='use pretrained model when training [True, ...]')

     @classmethod
@@ -499,22 +499,34 @@ def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
 @register_model_architecture('fconv_self_att', 'fconv_self_att')
 def base_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
     args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 3')
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
     args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 8')
     args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
     args.decoder_attention = getattr(args, 'decoder_attention', 'True')
+    args.self_attention = getattr(args, 'self_attention', 'False')
+    args.encoder_attention = getattr(args, 'encoder_attention', 'False')
+    args.multihead_attention_nheads = getattr(args, 'multihead_attention_nheads', 1)
+    args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 1)
+    args.encoder_attention_nheads = getattr(args, 'encoder_attention_nheads', 1)
+    args.project_input = getattr(args, 'project_input', 'False')
+    args.gated_attention = getattr(args, 'gated_attention', 'False')
+    args.downsample = getattr(args, 'downsample', 'False')
+    args.pretrained_checkpoint = getattr(args, 'pretrained_checkpoint', '')
+    args.pretrained = getattr(args, 'pretrained', 'False')

 @register_model_architecture('fconv_self_att', 'fconv_self_att_wp')
 def fconv_self_att_wp(args):
-    base_architecture(args)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
     args.encoder_layers = getattr(args, 'encoder_layers', '[(128, 3)] * 2 + [(512,3)] * 1')
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
     args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1')
     args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
     args.multihead_attention_nheads = getattr(args, 'multihead_attention_nheads', 1)
     args.encoder_attention_nheads = getattr(args, 'encoder_attention_nheads', 1)
     args.self_attention = getattr(args, 'self_attention', 'True')
     args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 4)
     args.project_input = getattr(args, 'project_input', 'True')
     args.gated_attention = getattr(args, 'gated_attention', 'True')
     args.downsample = getattr(args, 'downsample', 'True')
+    base_architecture(args)
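
The reordering inside fconv_self_att_wp above follows from the same getattr semantics: whichever function runs first wins for any attribute both of them set, so the preset has to apply its own defaults before delegating to base_architecture. A stripped-down illustration, using only the encoder_embed_dim values that appear in the diff (a hypothetical standalone snippet, not the fairseq module itself):

from argparse import Namespace

def base_architecture(args):
    # generic default, as in the diff
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)

def fconv_self_att_wp(args):
    # preset-specific default is applied first ...
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
    # ... and only then fall back to the base architecture for everything else
    base_architecture(args)

args = Namespace()               # user did not pass --encoder-embed-dim
fconv_self_att_wp(args)
print(args.encoder_embed_dim)    # 256; calling base_architecture() first would leave 512
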
@@ -25,11 +25,11 @@ class LSTMModel(FairseqModel):
     @staticmethod
     def add_args(parser):
         """Add model-specific arguments to the parser."""
-        parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
+        parser.add_argument('--dropout', type=float, metavar='D',
                             help='dropout probability')
         parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                             help='encoder embedding dimension')
-        parser.add_argument('--encoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained encoder embedding')
         parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
                             help='encoder hidden size')
@@ -39,7 +39,7 @@ class LSTMModel(FairseqModel):
                             help='make all layers of encoder bidirectional')
         parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                             help='decoder embedding dimension')
-        parser.add_argument('--decoder-embed-path', default=None, type=str, metavar='STR',
+        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
         parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
                             help='decoder hidden size')
@@ -415,6 +415,7 @@ def Linear(in_features, out_features, bias=True, dropout=0):
 @register_model_architecture('lstm', 'lstm')
 def base_architecture(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
     args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
     args.encoder_hidden_size = getattr(args, 'encoder_hidden_size', args.encoder_embed_dim)
@@ -434,6 +435,7 @@ def base_architecture(args):
 @register_model_architecture('lstm', 'lstm_wiseman_iwslt_de_en')
 def lstm_wiseman_iwslt_de_en(args):
+    args.dropout = getattr(args, 'dropout', 0.1)
     args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
     args.encoder_dropout_in = getattr(args, 'encoder_dropout_in', 0)
     args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', 0)
......
@@ -48,9 +48,9 @@ class TransformerModel(FairseqModel):
                             help='num encoder layers')
         parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                             help='num encoder attention heads')
-        parser.add_argument('--encoder-normalize-before', default=False, action='store_true',
+        parser.add_argument('--encoder-normalize-before', action='store_true',
                             help='apply layernorm before each encoder block')
-        parser.add_argument('--encoder-learned-pos', default=False, action='store_true',
+        parser.add_argument('--encoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the encoder')
         parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                             help='path to pre-trained decoder embedding')
@@ -62,13 +62,13 @@ class TransformerModel(FairseqModel):
                             help='num decoder layers')
         parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                             help='num decoder attention heads')
-        parser.add_argument('--decoder-learned-pos', default=False, action='store_true',
+        parser.add_argument('--decoder-learned-pos', action='store_true',
                             help='use learned positional embeddings in the decoder')
-        parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
+        parser.add_argument('--decoder-normalize-before', action='store_true',
                             help='apply layernorm before each decoder block')
-        parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
+        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                             help='share decoder input and output embeddings')
-        parser.add_argument('--share-all-embeddings', default=False, action='store_true',
+        parser.add_argument('--share-all-embeddings', action='store_true',
                             help='share encoder, decoder and output embeddings'
                                  ' (requires shared dictionary and embed dim)')
@@ -422,14 +422,20 @@ def base_architecture(args):
     args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 2048)
     args.encoder_layers = getattr(args, 'encoder_layers', 6)
     args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 8)
+    args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
+    args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', False)
     args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
     args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim)
     args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
     args.decoder_layers = getattr(args, 'decoder_layers', 6)
     args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
+    args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False)
+    args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
     args.attention_dropout = getattr(args, 'attention_dropout', 0.)
     args.relu_dropout = getattr(args, 'relu_dropout', 0.)
     args.dropout = getattr(args, 'dropout', 0.1)
+    args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
+    args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)

 @register_model_architecture('transformer', 'transformer_iwslt_de_en')
......