Commit 23211c45 authored by Alexei Baevski's avatar Alexei Baevski Committed by Myle Ott
Browse files

make attn dropout 0.1 default for big en-de transformer

parent d85b61d6
...@@ -423,8 +423,9 @@ def transformer_wmt_en_de(args): ...@@ -423,8 +423,9 @@ def transformer_wmt_en_de(args):
args.decoder_attention_heads = 8 args.decoder_attention_heads = 8
@register_model_architecture('transformer', 'transformer_wmt_en_de_big') # parameters used in the "Attention Is All You Need" paper (Vaswani, et al, 2017)
def transformer_wmt_en_de_big(args): @register_model_architecture('transformer', 'transformer_vaswani_wmt_en_de_big')
def transformer_vaswani_wmt_en_de_big(args):
base_architecture(args) base_architecture(args)
args.encoder_embed_dim = 1024 args.encoder_embed_dim = 1024
args.encoder_ffn_embed_dim = 4096 args.encoder_ffn_embed_dim = 4096
...@@ -436,18 +437,17 @@ def transformer_wmt_en_de_big(args): ...@@ -436,18 +437,17 @@ def transformer_wmt_en_de_big(args):
args.decoder_attention_heads = 16 args.decoder_attention_heads = 16
@register_model_architecture('transformer', 'transformer_wmt_en_de_big')
def transformer_wmt_en_de_big(args):
    """Big en-de transformer: the Vaswani big architecture defaults, with
    attention dropout raised to 0.1 (set after the base call so it wins)."""
    transformer_vaswani_wmt_en_de_big(args)
    args.attention_dropout = 0.1
# default parameters used in tensor2tensor implementation
@register_model_architecture('transformer', 'transformer_wmt_en_de_big_t2t')
def transformer_wmt_en_de_big_t2t(args):
    """Big transformer matching the tensor2tensor defaults: starts from the
    Vaswani big architecture, then enables pre-norm on both encoder and
    decoder and applies 0.1 dropout to attention and ReLU activations."""
    transformer_vaswani_wmt_en_de_big(args)
    args.encoder_normalize_before = True
    args.decoder_normalize_before = True
    args.attention_dropout = 0.1
    args.relu_dropout = 0.1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment