"examples/translation/t5/README.md" did not exist on "b4fb94fe6d831b17c0df364b2848c80ef3add154"
Commit 9e8a8c05 authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqEncoder
class CompositeEncoder(FairseqEncoder):
"""
Encoder class that forwards on multiple encoders, for example for a fusion model or question-answering
Accepts a dictionary of encoder, the first encoder's dictionary is used for initialization
"""
def __init__(self, encoders):
super().__init__(next(iter(encoders.values())).dictionary)
self.encoders = encoders
for key in self.encoders:
self.add_module(key, self.encoders[key])
def forward(self, src_tokens, src_lengths):
encoder_out = {}
for key in self.encoders:
encoder_out[key] = self.encoders[key](src_tokens, src_lengths)
return encoder_out
def reorder_encoder_out(self, encoder_out, new_order):
"""Reorder encoder output according to new_order."""
for key in self.encoders:
encoder_out[key] = self.encoders[key].reorder_encoder_out(encoder_out[key], new_order)
return encoder_out
def max_positions(self):
return min([self.encoders[key].max_positions() for key in self.encoders])
def upgrade_state_dict(self, state_dict):
for key in self.encoders:
self.encoders[key].upgrade_state_dict(state_dict)
return state_dict
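# Illustrative usage sketch (editor addition, not part of the original file):
# assembling a CompositeEncoder from two encoders that share a dictionary.
# `encoder_a`, `encoder_b`, `src_tokens` and `src_lengths` are assumed inputs.
def _composite_encoder_example(encoder_a, encoder_b, src_tokens, src_lengths):
    composite = CompositeEncoder({'a': encoder_a, 'b': encoder_b})
    out = composite(src_tokens, src_lengths)  # one output per key: {'a': ..., 'b': ...}
    # max_positions() is the most restrictive of the child encoders
    assert composite.max_positions() == min(encoder_a.max_positions(),
                                            encoder_b.max_positions())
    return out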
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch
import torch.nn as nn
import torch.nn.functional as F
class FairseqDecoder(nn.Module):
"""Base class for decoders."""
def __init__(self, dictionary):
super().__init__()
self.dictionary = dictionary
def forward(self, prev_output_tokens, encoder_out):
raise NotImplementedError
def get_normalized_probs(self, net_output, log_probs, sample):
"""Get normalized probabilities (or log probs) from a net's output."""
if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None:
assert sample is not None and 'target' in sample
out = self.adaptive_softmax.get_log_prob(net_output[0], sample['target'])
return out.exp_() if not log_probs else out
        logits = net_output[0]
if log_probs:
return F.log_softmax(logits, dim=-1, dtype=torch.float32)
else:
return F.softmax(logits, dim=-1, dtype=torch.float32)
def max_positions(self):
"""Maximum input length supported by the decoder."""
raise NotImplementedError
def upgrade_state_dict(self, state_dict):
return state_dict
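# Numeric sketch (editor addition) of the default path above: without an
# adaptive softmax, get_normalized_probs reduces to a (log-)softmax over the
# vocabulary dimension, computed in float32 for fp16 stability.
def _normalized_probs_example():
    logits = torch.randn(2, 5, 7)  # B x T x V dummy decoder output
    probs = F.softmax(logits, dim=-1, dtype=torch.float32)
    log_probs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    # probabilities sum to one over the vocabulary dimension
    assert torch.allclose(probs.sum(dim=-1), torch.ones(2, 5), atol=1e-5)
    # log_probs matches probs.log() up to numerical error
    assert torch.allclose(log_probs, probs.log(), atol=1e-4)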
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch.nn as nn
class FairseqEncoder(nn.Module):
"""Base class for encoders."""
def __init__(self, dictionary):
super().__init__()
self.dictionary = dictionary
def forward(self, src_tokens, src_lengths):
raise NotImplementedError
def reorder_encoder_out(self, encoder_out, new_order):
"""Reorder encoder output according to new_order."""
raise NotImplementedError
def max_positions(self):
"""Maximum input length supported by the encoder."""
raise NotImplementedError
def upgrade_state_dict(self, state_dict):
return state_dict
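# Minimal subclass sketch (editor addition, hypothetical): the three methods a
# concrete encoder must provide. `dictionary` is assumed to be a fairseq
# Dictionary exposing __len__() and pad().
class _ToyEncoder(FairseqEncoder):
    def __init__(self, dictionary, embed_dim=8):
        super().__init__(dictionary)
        self.embed = nn.Embedding(len(dictionary), embed_dim, padding_idx=dictionary.pad())
    def forward(self, src_tokens, src_lengths):
        # B x T -> B x T x C; real encoders return richer structures
        return {'encoder_out': self.embed(src_tokens)}
    def reorder_encoder_out(self, encoder_out, new_order):
        encoder_out['encoder_out'] = encoder_out['encoder_out'].index_select(0, new_order)
        return encoder_out
    def max_positions(self):
        return 1024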
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from . import FairseqDecoder
class FairseqIncrementalDecoder(FairseqDecoder):
"""Base class for incremental decoders."""
def __init__(self, dictionary):
super().__init__(dictionary)
def forward(self, prev_output_tokens, encoder_out, incremental_state=None):
raise NotImplementedError
def reorder_incremental_state(self, incremental_state, new_order):
"""Reorder incremental state.
This should be called when the order of the input has changed from the
previous time step. A typical use case is beam search, where the input
order changes between time steps based on the selection of beams.
"""
def apply_reorder_incremental_state(module):
if module != self and hasattr(module, 'reorder_incremental_state'):
module.reorder_incremental_state(
incremental_state,
new_order,
)
self.apply(apply_reorder_incremental_state)
def set_beam_size(self, beam_size):
"""Sets the beam size in the decoder and all children."""
if getattr(self, '_beam_size', -1) != beam_size:
def apply_set_beam_size(module):
if module != self and hasattr(module, 'set_beam_size'):
module.set_beam_size(beam_size)
self.apply(apply_set_beam_size)
self._beam_size = beam_size
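# Sketch (editor addition) of the beam-search contract behind
# reorder_incremental_state: when beam k at step t extends the hypothesis that
# lived at beam j at step t-1, new_order maps position k -> j so that cached
# states follow their hypotheses. `decoder` and `incremental_state` are
# assumed to come from an in-progress incremental generation.
def _reorder_example(decoder, incremental_state):
    import torch
    # one sentence, beam size 3: beams 0 and 2 both extend what was beam 1 at
    # the previous step, while beam 1 extends old beam 0
    new_order = torch.LongTensor([1, 0, 1])
    decoder.reorder_incremental_state(incremental_state, new_order)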
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch.nn as nn
from . import FairseqDecoder, FairseqEncoder
class BaseFairseqModel(nn.Module):
"""Base class for fairseq models."""
def __init__(self):
super().__init__()
self._is_generation_fast = False
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
pass
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
raise NotImplementedError
def get_targets(self, sample, net_output):
"""Get targets from either the sample or the net's output."""
return sample['target']
def get_normalized_probs(self, net_output, log_probs, sample=None):
"""Get normalized probabilities (or log probs) from a net's output."""
return self.decoder.get_normalized_probs(net_output, log_probs, sample)
def max_positions(self):
"""Maximum length supported by the model."""
raise NotImplementedError
def max_decoder_positions(self):
"""Maximum length supported by the decoder."""
return self.decoder.max_positions()
def load_state_dict(self, state_dict, strict=True):
"""Copies parameters and buffers from state_dict into this module and
its descendants.
Overrides the method in nn.Module; compared with that method this
additionally "upgrades" state_dicts from old checkpoints.
"""
self.upgrade_state_dict(state_dict)
super().load_state_dict(state_dict, strict)
def upgrade_state_dict(self, state_dict):
assert state_dict is not None
def do_upgrade(m):
if m != self and hasattr(m, 'upgrade_state_dict'):
m.upgrade_state_dict(state_dict)
self.apply(do_upgrade)
def make_generation_fast_(self, **kwargs):
"""Optimize model for faster generation."""
if self._is_generation_fast:
return # only apply once
self._is_generation_fast = True
# remove weight norm from all modules in the network
def apply_remove_weight_norm(module):
try:
nn.utils.remove_weight_norm(module)
except ValueError: # this module didn't have weight norm
return
self.apply(apply_remove_weight_norm)
def apply_make_generation_fast_(module):
if module != self and hasattr(module, 'make_generation_fast_'):
module.make_generation_fast_(**kwargs)
self.apply(apply_make_generation_fast_)
        def train(mode=True):
if mode:
raise RuntimeError('cannot train after make_generation_fast')
# this model should no longer be used for training
self.eval()
self.train = train
class FairseqModel(BaseFairseqModel):
"""Base class for encoder-decoder models."""
def __init__(self, encoder, decoder):
super().__init__()
self.encoder = encoder
self.decoder = decoder
assert isinstance(self.encoder, FairseqEncoder)
assert isinstance(self.decoder, FairseqDecoder)
def forward(self, src_tokens, src_lengths, prev_output_tokens):
encoder_out = self.encoder(src_tokens, src_lengths)
decoder_out = self.decoder(prev_output_tokens, encoder_out)
return decoder_out
def max_positions(self):
"""Maximum length supported by the model."""
return (self.encoder.max_positions(), self.decoder.max_positions())
class FairseqLanguageModel(BaseFairseqModel):
"""Base class for decoder-only models."""
def __init__(self, decoder):
super().__init__()
self.decoder = decoder
assert isinstance(self.decoder, FairseqDecoder)
def forward(self, src_tokens):
return self.decoder(src_tokens)
def max_positions(self):
"""Maximum length supported by the model."""
return self.decoder.max_positions()
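# End-to-end wiring sketch (editor addition): how FairseqModel composes the
# pieces above in a training step. `encoder`/`decoder` are assumed concrete
# FairseqEncoder/FairseqDecoder instances and `sample` a fairseq batch dict.
def _seq2seq_step_example(encoder, decoder, sample):
    model = FairseqModel(encoder, decoder)
    net_output = model(
        sample['net_input']['src_tokens'],
        sample['net_input']['src_lengths'],
        sample['net_input']['prev_output_tokens'],
    )
    # criterion-side usage: normalized log-probs plus the gold targets
    lprobs = model.get_normalized_probs(net_output, log_probs=True)
    targets = model.get_targets(sample, net_output)
    return lprobs, targets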
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from fairseq.modules import (
AdaptiveSoftmax, BeamableMM, GradMultiply, LearnedPositionalEmbedding,
LinearizedConvolution,
)
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel,
FairseqLanguageModel, register_model, register_model_architecture,
)
@register_model('fconv')
class FConvModel(FairseqModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
self.encoder.num_attention_layers = sum(layer is not None for layer in decoder.attention)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--normalization-constant', type=float, metavar='D',
help='multiplies the result of the residual block by sqrt(value)')
parser.add_argument('--share-input-output-embed', action='store_true',
help='share input and output embeddings (requires'
' --decoder-out-embed-dim and --decoder-embed-dim'
' to be equal)')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure that all args are properly defaulted (in case there are any new ones)
base_architecture(args)
encoder_embed_dict = None
if args.encoder_embed_path:
encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path)
utils.print_embed_overlap(encoder_embed_dict, task.source_dictionary)
decoder_embed_dict = None
if args.decoder_embed_path:
decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path)
utils.print_embed_overlap(decoder_embed_dict, task.target_dictionary)
encoder = FConvEncoder(
dictionary=task.source_dictionary,
embed_dim=args.encoder_embed_dim,
embed_dict=encoder_embed_dict,
convolutions=eval(args.encoder_layers),
dropout=args.dropout,
max_positions=args.max_source_positions,
normalization_constant=args.normalization_constant,
)
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
embed_dict=decoder_embed_dict,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_out_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.max_target_positions,
share_embed=args.share_input_output_embed,
normalization_constant=args.normalization_constant,
)
return FConvModel(encoder, decoder)
@register_model('fconv_lm')
class FConvLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--normalization-constant', type=float, metavar='D',
help='multiplies the result of the residual block by sqrt(value)')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if hasattr(args, 'max_target_positions'):
args.tokens_per_sample = args.max_target_positions
decoder = FConvDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.tokens_per_sample,
share_embed=False,
positional_embeddings=False,
adaptive_softmax_cutoff=(
options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
if args.criterion == 'adaptive_loss' else None
),
normalization_constant=args.normalization_constant,
)
return FConvLanguageModel(decoder)
class FConvEncoder(FairseqEncoder):
"""Convolutional encoder"""
def __init__(
self, dictionary, embed_dim=512, embed_dict=None, max_positions=1024,
convolutions=((512, 3),) * 20, dropout=0.1, normalization_constant=0.5,
left_pad=True,
):
super().__init__(dictionary)
self.dropout = dropout
self.normalization_constant = normalization_constant
self.left_pad = left_pad
self.num_attention_layers = None
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
if embed_dict:
self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
self.padding_idx,
left_pad=self.left_pad,
)
convolutions = extend_conv_spec(convolutions)
in_channels = convolutions[0][0]
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.residuals = []
layer_in_channels = [in_channels]
for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
if residual == 0:
residual_dim = out_channels
else:
residual_dim = layer_in_channels[-residual]
self.projections.append(Linear(residual_dim, out_channels)
if residual_dim != out_channels else None)
if kernel_size % 2 == 1:
padding = kernel_size // 2
else:
padding = 0
self.convolutions.append(
ConvTBC(in_channels, out_channels * 2, kernel_size,
dropout=dropout, padding=padding)
)
self.residuals.append(residual)
in_channels = out_channels
layer_in_channels.append(out_channels)
self.fc2 = Linear(in_channels, embed_dim)
def forward(self, src_tokens, src_lengths):
# embed tokens and positions
x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
x = F.dropout(x, p=self.dropout, training=self.training)
input_embedding = x
# project to size of convolution
x = self.fc1(x)
# used to mask padding in input
encoder_padding_mask = src_tokens.eq(self.padding_idx).t() # -> T x B
if not encoder_padding_mask.any():
encoder_padding_mask = None
# B x T x C -> T x B x C
x = x.transpose(0, 1)
residuals = [x]
# temporal convolutions
for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
if res_layer > 0:
residual = residuals[-res_layer]
residual = residual if proj is None else proj(residual)
else:
residual = None
if encoder_padding_mask is not None:
x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
x = F.dropout(x, p=self.dropout, training=self.training)
if conv.kernel_size[0] % 2 == 1:
# padding is implicit in the conv
x = conv(x)
else:
padding_l = (conv.kernel_size[0] - 1) // 2
padding_r = conv.kernel_size[0] // 2
x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
x = conv(x)
x = F.glu(x, dim=2)
if residual is not None:
x = (x + residual) * math.sqrt(self.normalization_constant)
residuals.append(x)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of embedding
x = self.fc2(x)
if encoder_padding_mask is not None:
encoder_padding_mask = encoder_padding_mask.t() # -> B x T
x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)
# scale gradients (this only affects backward, not forward)
x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
# add output to input embedding for attention
y = (x + input_embedding) * math.sqrt(self.normalization_constant)
return {
'encoder_out': (x, y),
'encoder_padding_mask': encoder_padding_mask, # B x T
}
def reorder_encoder_out(self, encoder_out, new_order):
if encoder_out['encoder_out'] is not None:
encoder_out['encoder_out'] = (
encoder_out['encoder_out'][0].index_select(0, new_order),
encoder_out['encoder_out'][1].index_select(0, new_order),
)
if encoder_out['encoder_padding_mask'] is not None:
encoder_out['encoder_padding_mask'] = \
encoder_out['encoder_padding_mask'].index_select(0, new_order)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
return self.embed_positions.max_positions()
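# Shape note (editor addition): FConvEncoder.forward returns
#   x: B x T x C, the top-of-stack encoder states (used to score attention), and
#   y: B x T x C, x combined with the input embeddings (used as attention values),
# packaged as encoder_out=(x, y) alongside an optional B x T padding mask.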
class AttentionLayer(nn.Module):
def __init__(self, conv_channels, embed_dim, normalization_constant=0.5, bmm=None):
super().__init__()
self.normalization_constant = normalization_constant
# projects from output of convolution to embedding dimension
self.in_projection = Linear(conv_channels, embed_dim)
# projects from embedding dimension to convolution size
self.out_projection = Linear(embed_dim, conv_channels)
self.bmm = bmm if bmm is not None else torch.bmm
def forward(self, x, target_embedding, encoder_out, encoder_padding_mask):
residual = x
# attention
x = (self.in_projection(x) + target_embedding) * math.sqrt(self.normalization_constant)
x = self.bmm(x, encoder_out[0])
# don't attend over padding
if encoder_padding_mask is not None:
x = x.float().masked_fill(
encoder_padding_mask.unsqueeze(1),
float('-inf')
).type_as(x) # FP16 support: cast to float and back
# softmax over last dim
sz = x.size()
x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1)
x = x.view(sz)
attn_scores = x
x = self.bmm(x, encoder_out[1])
# scale attention output (respecting potentially different lengths)
s = encoder_out[1].size(1)
if encoder_padding_mask is None:
x = x * (s * math.sqrt(1.0 / s))
else:
s = s - encoder_padding_mask.type_as(x).sum(dim=1, keepdim=True) # exclude padding
s = s.unsqueeze(-1)
x = x * (s * s.rsqrt())
# project back
x = (self.out_projection(x) + residual) * math.sqrt(self.normalization_constant)
return x, attn_scores
def make_generation_fast_(self, beamable_mm_beam_size=None, **kwargs):
"""Replace torch.bmm with BeamableMM."""
if beamable_mm_beam_size is not None:
del self.bmm
self.add_module('bmm', BeamableMM(beamable_mm_beam_size))
class FConvDecoder(FairseqIncrementalDecoder):
"""Convolutional decoder"""
def __init__(
self, dictionary, embed_dim=512, embed_dict=None, out_embed_dim=256,
max_positions=1024, convolutions=((512, 3),) * 20, attention=True,
dropout=0.1, share_embed=False, positional_embeddings=True,
adaptive_softmax_cutoff=None, normalization_constant=0.5,
left_pad=False,
):
super().__init__(dictionary)
self.register_buffer('version', torch.Tensor([2]))
self.dropout = dropout
self.normalization_constant = normalization_constant
self.left_pad = left_pad
self.need_attn = True
convolutions = extend_conv_spec(convolutions)
in_channels = convolutions[0][0]
if isinstance(attention, bool):
# expand True into [True, True, ...] and do the same with False
attention = [attention] * len(convolutions)
if not isinstance(attention, list) or len(attention) != len(convolutions):
raise ValueError('Attention is expected to be a list of booleans of '
'length equal to the number of layers.')
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
if embed_dict:
self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
padding_idx,
left_pad=self.left_pad,
) if positional_embeddings else None
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
self.residuals = []
layer_in_channels = [in_channels]
for i, (out_channels, kernel_size, residual) in enumerate(convolutions):
if residual == 0:
residual_dim = out_channels
else:
residual_dim = layer_in_channels[-residual]
self.projections.append(Linear(residual_dim, out_channels)
if residual_dim != out_channels else None)
self.convolutions.append(
LinearizedConv1d(in_channels, out_channels * 2, kernel_size,
padding=(kernel_size - 1), dropout=dropout)
)
self.attention.append(AttentionLayer(out_channels, embed_dim, self.normalization_constant)
if attention[i] else None)
self.residuals.append(residual)
in_channels = out_channels
layer_in_channels.append(out_channels)
self.adaptive_softmax = None
self.fc2 = self.fc3 = None
if adaptive_softmax_cutoff is not None:
assert not share_embed
self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, in_channels, adaptive_softmax_cutoff,
dropout=dropout)
else:
self.fc2 = Linear(in_channels, out_embed_dim)
if share_embed:
assert out_embed_dim == embed_dim, \
"Shared embed weights implies same dimensions " \
" out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
self.fc3 = nn.Linear(out_embed_dim, num_embeddings)
self.fc3.weight = self.embed_tokens.weight
else:
self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
def forward(self, prev_output_tokens, encoder_out_dict=None, incremental_state=None):
if encoder_out_dict is not None:
encoder_out = encoder_out_dict['encoder_out']
encoder_padding_mask = encoder_out_dict['encoder_padding_mask']
# split and transpose encoder outputs
encoder_a, encoder_b = self._split_encoder_out(encoder_out, incremental_state)
if self.embed_positions is not None:
pos_embed = self.embed_positions(prev_output_tokens, incremental_state)
else:
pos_embed = 0
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
x = self._embed_tokens(prev_output_tokens, incremental_state)
# embed tokens and combine with positional embeddings
x += pos_embed
x = F.dropout(x, p=self.dropout, training=self.training)
target_embedding = x
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = self._transpose_if_training(x, incremental_state)
# temporal convolutions
avg_attn_scores = None
num_attn_layers = len(self.attention)
residuals = [x]
for proj, conv, attention, res_layer in zip(self.projections, self.convolutions, self.attention,
self.residuals):
if res_layer > 0:
residual = residuals[-res_layer]
residual = residual if proj is None else proj(residual)
else:
residual = None
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x, incremental_state)
x = F.glu(x, dim=2)
# attention
if attention is not None:
x = self._transpose_if_training(x, incremental_state)
x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask)
if not self.training and self.need_attn:
attn_scores = attn_scores / num_attn_layers
if avg_attn_scores is None:
avg_attn_scores = attn_scores
else:
avg_attn_scores.add_(attn_scores)
x = self._transpose_if_training(x, incremental_state)
# residual
if residual is not None:
x = (x + residual) * math.sqrt(self.normalization_constant)
residuals.append(x)
# T x B x C -> B x T x C
x = self._transpose_if_training(x, incremental_state)
# project back to size of vocabulary if not using adaptive softmax
if self.fc2 is not None and self.fc3 is not None:
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = self.fc3(x)
return x, avg_attn_scores
def reorder_incremental_state(self, incremental_state, new_order):
super().reorder_incremental_state(incremental_state, new_order)
encoder_out = utils.get_incremental_state(self, incremental_state, 'encoder_out')
if encoder_out is not None:
encoder_out = tuple(eo.index_select(0, new_order) for eo in encoder_out)
utils.set_incremental_state(self, incremental_state, 'encoder_out', encoder_out)
def max_positions(self):
"""Maximum output length supported by the decoder."""
return self.embed_positions.max_positions() if self.embed_positions is not None else float('inf')
def upgrade_state_dict(self, state_dict):
if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
# old models use incorrect weight norm dimension
for i, conv in enumerate(self.convolutions):
# reconfigure weight norm
nn.utils.remove_weight_norm(conv)
self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
state_dict['decoder.version'] = torch.Tensor([1])
return state_dict
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def _embed_tokens(self, tokens, incremental_state):
if incremental_state is not None:
# keep only the last token for incremental forward pass
tokens = tokens[:, -1:]
return self.embed_tokens(tokens)
def _split_encoder_out(self, encoder_out, incremental_state):
"""Split and transpose encoder outputs.
This is cached when doing incremental inference.
"""
cached_result = utils.get_incremental_state(self, incremental_state, 'encoder_out')
if cached_result is not None:
return cached_result
# transpose only once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(1, 2).contiguous()
result = (encoder_a, encoder_b)
if incremental_state is not None:
utils.set_incremental_state(self, incremental_state, 'encoder_out', result)
return result
def _transpose_if_training(self, x, incremental_state):
if incremental_state is None:
x = x.transpose(0, 1)
return x
def extend_conv_spec(convolutions):
"""
Extends convolutional spec that is a list of tuples of 2 or 3 parameters
(kernel size, dim size and optionally how many layers behind to look for residual)
to default the residual propagation param if it is not specified
"""
extended = []
for spec in convolutions:
if len(spec) == 3:
extended.append(spec)
elif len(spec) == 2:
extended.append(spec + (1,))
else:
            raise ValueError('invalid number of parameters in convolution spec ' + str(spec) + '. expected 2 or 3')
return tuple(extended)
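# Worked example (editor addition): 2-tuples receive the default residual
# distance of 1, while 3-tuples pass through unchanged.
def _extend_conv_spec_example():
    assert extend_conv_spec([(512, 3), (1024, 3, 0)]) == ((512, 3, 1), (1024, 3, 0))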
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.normal_(m.weight, 0, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad):
m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad)
nn.init.normal_(m.weight, 0, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def Linear(in_features, out_features, dropout=0):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features)
nn.init.normal_(m.weight, mean=0, std=math.sqrt((1 - dropout) / in_features))
nn.init.constant_(m.bias, 0)
return nn.utils.weight_norm(m)
def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer optimized for decoding"""
m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
nn.init.normal_(m.weight, mean=0, std=std)
nn.init.constant_(m.bias, 0)
return nn.utils.weight_norm(m, dim=2)
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer"""
from fairseq.modules import ConvTBC
m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
nn.init.normal_(m.weight, mean=0, std=std)
nn.init.constant_(m.bias, 0)
return nn.utils.weight_norm(m, dim=2)
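# Editor note: the std used in the two conv initializers above follows the
# variance-preserving scheme from the convolutional seq2seq paper; for a
# layer followed by GLU, std = sqrt(4 * (1 - dropout) / (kernel_size * in_channels)).
def _init_std_example():
    # e.g. a kernel-3 convolution with 512 input channels and dropout 0.1
    std = math.sqrt((4 * (1.0 - 0.1)) / (3 * 512))
    assert abs(std - 0.0484) < 1e-3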
@register_model_architecture('fconv_lm', 'fconv_lm')
def base_lm_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', '[(1268, 4)] * 13')
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.normalization_constant = getattr(args, 'normalization_constant', 0.5)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_wikitext103')
def fconv_lm_dauphin_wikitext103(args):
layers = '[(850, 6)] * 3'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 5)] * 4'
layers += ' + [(850, 1)] * 1'
layers += ' + [(850, 4)] * 3'
layers += ' + [(1024, 4)] * 1'
layers += ' + [(2048, 4)] * 1'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 280)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,20000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv_lm', 'fconv_lm_dauphin_gbw')
def fconv_lm_dauphin_gbw(args):
layers = '[(512, 5)]'
layers += ' + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3'
layers += ' + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6'
layers += ' + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]'
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
args.decoder_layers = getattr(args, 'decoder_layers', layers)
args.decoder_attention = getattr(args, 'decoder_attention', 'False')
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', '10000,50000,200000')
base_lm_architecture(args)
@register_model_architecture('fconv', 'fconv')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 20')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 20')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.decoder_attention = getattr(args, 'decoder_attention', 'True')
args.share_input_output_embed = getattr(args, 'share_input_output_embed', False)
args.normalization_constant = getattr(args, 'normalization_constant', 0.5)
@register_model_architecture('fconv', 'fconv_iwslt_de_en')
def fconv_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
args.encoder_layers = getattr(args, 'encoder_layers', '[(256, 3)] * 4')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
args.decoder_layers = getattr(args, 'decoder_layers', '[(256, 3)] * 3')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
base_architecture(args)
@register_model_architecture('fconv', 'fconv_wmt_en_ro')
def fconv_wmt_en_ro(args):
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
base_architecture(args)
@register_model_architecture('fconv', 'fconv_wmt_en_de')
def fconv_wmt_en_de(args):
convs = '[(512, 3)] * 9' # first 9 layers have 512 units
convs += ' + [(1024, 3)] * 4' # next 4 layers have 1024 units
convs += ' + [(2048, 1)] * 2' # final 2 layers use 1x1 convolutions
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
args.encoder_layers = getattr(args, 'encoder_layers', convs)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
args.decoder_layers = getattr(args, 'decoder_layers', convs)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
base_architecture(args)
@register_model_architecture('fconv', 'fconv_wmt_en_fr')
def fconv_wmt_en_fr(args):
convs = '[(512, 3)] * 6' # first 6 layers have 512 units
convs += ' + [(768, 3)] * 4' # next 4 layers have 768 units
convs += ' + [(1024, 3)] * 3' # next 3 layers have 1024 units
convs += ' + [(2048, 1)] * 1' # next 1 layer uses 1x1 convolutions
convs += ' + [(4096, 1)] * 1' # final 1 layer uses 1x1 convolutions
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 768)
args.encoder_layers = getattr(args, 'encoder_layers', convs)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 768)
args.decoder_layers = getattr(args, 'decoder_layers', convs)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
base_architecture(args)
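# Sketch (editor addition) of how the architecture functions compose: the
# named variants pin a few fields and then call base_architecture(), which
# fills whatever is still missing via getattr(args, name, default), so user
# overrides always win.
def _architecture_defaults_example():
    import argparse
    args = argparse.Namespace(dropout=0.2)
    base_architecture(args)
    assert args.dropout == 0.2              # user override survives
    assert args.encoder_embed_dim == 512    # unset field gets the default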
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq.modules import (
DownsampledMultiHeadAttention, GradMultiply, LearnedPositionalEmbedding,
LinearizedConvolution,
)
from fairseq import utils
from . import (
FairseqEncoder, CompositeEncoder, FairseqDecoder, FairseqModel,
register_model, register_model_architecture,
)
@register_model('fconv_self_att')
class FConvModelSelfAtt(FairseqModel):
def __init__(self, encoder, decoder, pretrained_encoder=None):
super().__init__(encoder, decoder)
self.encoder.num_attention_layers = sum(layer is not None for layer in decoder.attention)
self.pretrained_encoder = pretrained_encoder
if self.pretrained_encoder is None:
encoders = {'encoder': encoder}
else:
encoders = {'encoder': encoder, 'pretrained': self.pretrained_encoder}
# for fusion model, CompositeEncoder contains both pretrained and training encoders
# these are forwarded and then combined in the decoder
self.encoder = CompositeEncoder(encoders)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--self-attention', type=str, metavar='EXPR',
help='decoder self-attention layers, ex: [True] + [False]*5')
parser.add_argument('--multihead-attention-nheads', type=int,
help='Number of heads to use in attention')
parser.add_argument('--multihead-self-attention-nheads', type=int,
help='Number of heads to use in self-attention')
parser.add_argument('--encoder-attention', type=str, metavar='EXPR',
help='encoder attention [True, ...]')
parser.add_argument('--encoder-attention-nheads', type=int,
help='Number of heads to use in encoder attention')
parser.add_argument('--project-input', type=str, metavar='EXPR',
help='Use projections in self-attention [True, ...]')
parser.add_argument('--gated-attention', type=str, metavar='EXPR',
help='Use GLU layers in self-attention projections [True, ...]')
parser.add_argument('--downsample', type=str, metavar='EXPR',
help='Use downsampling in self-attention [True, ...]')
parser.add_argument('--pretrained-checkpoint', metavar='DIR',
help='path to load checkpoint from pretrained model')
parser.add_argument('--pretrained', type=str, metavar='EXPR',
help='use pretrained model when training [True, ...]')
@classmethod
def build_model(cls, args, task):
        """Build a new model instance."""
        trained_encoder, trained_decoder = None, None
pretrained = eval(args.pretrained)
if pretrained:
print("| loading pretrained model")
trained_model = utils.load_ensemble_for_inference(
# not actually for inference, but loads pretrained model parameters
filenames=[args.pretrained_checkpoint],
task=task,
)[0][0]
trained_decoder = list(trained_model.children())[1]
trained_encoder = list(trained_model.children())[0]
# freeze pretrained model
for param in trained_decoder.parameters():
param.requires_grad = False
for param in trained_encoder.parameters():
param.requires_grad = False
"""Build a new model instance."""
encoder = FConvEncoder(
task.source_dictionary,
embed_dim=args.encoder_embed_dim,
convolutions=eval(args.encoder_layers),
dropout=args.dropout,
max_positions=args.max_source_positions,
attention=eval(args.encoder_attention),
attention_nheads=args.encoder_attention_nheads
)
decoder = FConvDecoder(
task.target_dictionary,
embed_dim=args.decoder_embed_dim,
convolutions=eval(args.decoder_layers),
out_embed_dim=args.decoder_out_embed_dim,
attention=eval(args.decoder_attention),
dropout=args.dropout,
max_positions=args.max_target_positions,
selfattention=eval(args.self_attention),
attention_nheads=args.multihead_attention_nheads,
selfattention_nheads=args.multihead_self_attention_nheads,
project_input=eval(args.project_input),
gated_attention=eval(args.gated_attention),
downsample=eval(args.downsample),
pretrained=pretrained,
trained_decoder=trained_decoder
)
model = FConvModelSelfAtt(encoder, decoder, trained_encoder)
return model
@property
def pretrained(self):
return self.pretrained_encoder is not None
class FConvEncoder(FairseqEncoder):
"""Convolutional encoder"""
def __init__(
self, dictionary, embed_dim=512, max_positions=1024,
convolutions=((512, 3),) * 20, dropout=0.1, attention=False,
attention_nheads=1, left_pad=True,
):
super().__init__(dictionary)
self.dropout = dropout
self.num_attention_layers = None
self.left_pad = left_pad
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
self.padding_idx,
left_pad=self.left_pad,
)
def expand_bool_array(val):
if isinstance(val, bool):
# expand True into [True, True, ...] and do the same with False
return [val] * len(convolutions)
return val
attention = expand_bool_array(attention)
in_channels = convolutions[0][0]
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
self.attproj = nn.ModuleList()
for i, (out_channels, kernel_size) in enumerate(convolutions):
self.projections.append(
Linear(in_channels, out_channels) if in_channels != out_channels else None
)
self.convolutions.append(
ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
)
self.attention.append(
SelfAttention(out_channels, embed_dim, attention_nheads) if attention[i] else None
)
in_channels = out_channels
self.fc2 = Linear(in_channels, embed_dim)
def forward(self, src_tokens, src_lengths):
# embed tokens and positions
x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
x = F.dropout(x, p=self.dropout, training=self.training)
input_embedding = x.transpose(0, 1)
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
padding_l = (conv.kernel_size[0] - 1) // 2
padding_r = conv.kernel_size[0] // 2
x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
x = conv(x)
x = F.glu(x, dim=2)
if attention is not None:
x = attention(x)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# project back to size of embedding
x = self.fc2(x)
# scale gradients (this only affects backward, not forward)
x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))
# add output to input embedding for attention
y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)
return {
'encoder_out': (x, y),
}
def reorder_encoder_out(self, encoder_out, new_order):
encoder_out['encoder_out'] = tuple(
eo.index_select(0, new_order) for eo in encoder_out['encoder_out']
)
if 'pretrained' in encoder_out:
encoder_out['pretrained']['encoder_out'] = tuple(
eo.index_select(0, new_order)
for eo in encoder_out['pretrained']['encoder_out']
)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
return self.embed_positions.max_positions()
class FConvDecoder(FairseqDecoder):
"""Convolutional decoder"""
def __init__(
self, dictionary, embed_dim=512, out_embed_dim=256, max_positions=1024,
convolutions=((512, 3),) * 8, attention=True, dropout=0.1,
selfattention=False, attention_nheads=1, selfattention_nheads=1,
project_input=False, gated_attention=False, downsample=False,
pretrained=False, trained_decoder=None, left_pad=False,
):
super().__init__(dictionary)
self.register_buffer('version', torch.Tensor([2]))
self.pretrained = pretrained
self.pretrained_decoder = trained_decoder
self.dropout = dropout
self.left_pad = left_pad
self.need_attn = True
in_channels = convolutions[0][0]
def expand_bool_array(val):
if isinstance(val, bool):
# expand True into [True, True, ...] and do the same with False
return [val] * len(convolutions)
return val
attention = expand_bool_array(attention)
selfattention = expand_bool_array(selfattention)
if not isinstance(attention, list) or len(attention) != len(convolutions):
raise ValueError('Attention is expected to be a list of booleans of '
'length equal to the number of layers.')
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
self.embed_positions = PositionalEmbedding(
max_positions,
embed_dim,
padding_idx,
left_pad=self.left_pad,
)
self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
self.projections = nn.ModuleList()
self.convolutions = nn.ModuleList()
self.attention = nn.ModuleList()
self.selfattention = nn.ModuleList()
self.attproj = nn.ModuleList()
for i, (out_channels, kernel_size) in enumerate(convolutions):
self.projections.append(
Linear(in_channels, out_channels) if in_channels != out_channels else None
)
self.convolutions.append(
LinearizedConv1d(
in_channels, out_channels * 2, kernel_size,
padding=(kernel_size - 1), dropout=dropout,
)
)
self.attention.append(
DownsampledMultiHeadAttention(
out_channels, embed_dim, attention_nheads,
project_input=project_input, gated=False, downsample=False,
) if attention[i] else None
)
self.attproj.append(
Linear(out_channels, embed_dim, dropout=dropout) if attention[i] else None
)
self.selfattention.append(
SelfAttention(
out_channels, embed_dim, selfattention_nheads,
project_input=project_input, gated=gated_attention,
downsample=downsample,
) if selfattention[i] else None
)
in_channels = out_channels
self.fc2 = Linear(in_channels, out_embed_dim)
self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)
# model fusion
if self.pretrained:
# independent gates are learned from the concatenated input
self.gate1 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
self.gate2 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
# pretrained and trained models are joined
self.joining = nn.Sequential(
Linear(out_embed_dim*2, out_embed_dim*2),
nn.LayerNorm(out_embed_dim*2),
nn.GLU(),
Linear(out_embed_dim, out_embed_dim*2),
nn.LayerNorm(out_embed_dim*2),
nn.GLU(),
Linear(out_embed_dim, out_embed_dim),
nn.LayerNorm(out_embed_dim)
)
            # the pretrained model's output layer maps nhid -> vocab size,
            # but the two models are fused in their hidden states, so a
            # forward hook stores the pretrained decoder's fc2 output
self.pretrained_outputs = {}
def save_output():
def hook(a, b, output):
self.pretrained_outputs["out"] = output
return hook
self.pretrained_decoder.fc2.register_forward_hook(save_output())
def forward(self, prev_output_tokens, encoder_out_dict):
encoder_out = encoder_out_dict['encoder']['encoder_out']
trained_encoder_out = encoder_out_dict['pretrained'] if self.pretrained else None
encoder_a, encoder_b = self._split_encoder_out(encoder_out)
# embed positions
positions = self.embed_positions(prev_output_tokens)
# embed tokens and positions
x = self.embed_tokens(prev_output_tokens) + positions
x = F.dropout(x, p=self.dropout, training=self.training)
target_embedding = x.transpose(0, 1)
# project to size of convolution
x = self.fc1(x)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# temporal convolutions
avg_attn_scores = None
for proj, conv, attention, selfattention, attproj in zip(
self.projections, self.convolutions, self.attention, self.selfattention, self.attproj
):
residual = x if proj is None else proj(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = conv(x)
x = F.glu(x, dim=2)
# attention
if attention is not None:
r = x
x, attn_scores = attention(attproj(x) + target_embedding, encoder_a, encoder_b)
x = x + r
if not self.training and self.need_attn:
if avg_attn_scores is None:
avg_attn_scores = attn_scores
else:
avg_attn_scores.add_(attn_scores)
if selfattention is not None:
x = selfattention(x)
x = (x + residual) * math.sqrt(0.5)
# T x B x C -> B x T x C
x = x.transpose(0, 1)
# project back to size of vocabulary
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
if not self.pretrained:
x = self.fc3(x)
# fusion gating
if self.pretrained:
trained_x, _ = self.pretrained_decoder.forward(prev_output_tokens, trained_encoder_out)
y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
gate1 = self.gate1(y)
gate2 = self.gate2(y)
gated_x1 = gate1 * x
gated_x2 = gate2 * self.pretrained_outputs["out"]
fusion = torch.cat([gated_x1, gated_x2], dim=-1)
fusion = self.joining(fusion)
fusion_output = self.fc3(fusion)
return fusion_output, avg_attn_scores
else:
return x, avg_attn_scores
def max_positions(self):
"""Maximum output length supported by the decoder."""
return self.embed_positions.max_positions()
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def _split_encoder_out(self, encoder_out):
"""Split and transpose encoder outputs."""
# transpose only once to speed up attention layers
encoder_a, encoder_b = encoder_out
encoder_a = encoder_a.transpose(0, 1).contiguous()
encoder_b = encoder_b.transpose(0, 1).contiguous()
result = (encoder_a, encoder_b)
return result
class SelfAttention(nn.Module):
def __init__(self, out_channels, embed_dim, num_heads, project_input=False, gated=False, downsample=False):
super().__init__()
self.attention = DownsampledMultiHeadAttention(
out_channels, embed_dim, num_heads, dropout=0, bias=True,
project_input=project_input, gated=gated, downsample=downsample,
)
self.in_proj_q = Linear(out_channels, embed_dim)
self.in_proj_k = Linear(out_channels, embed_dim)
self.in_proj_v = Linear(out_channels, embed_dim)
self.ln = nn.LayerNorm(out_channels)
def forward(self, x):
residual = x
query = self.in_proj_q(x)
key = self.in_proj_k(x)
value = self.in_proj_v(x)
x, _ = self.attention(query, key, value, mask_future_timesteps=True, use_scalar_bias=True)
return self.ln(x + residual)
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
m.weight.data.normal_(0, 0.1)
return m
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad):
m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad)
m.weight.data.normal_(0, 0.1)
return m
def Linear(in_features, out_features, dropout=0.):
"""Weight-normalized Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features)
m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
m.bias.data.zero_()
return m
def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0., **kwargs):
"""Weight-normalized Conv1d layer optimized for decoding"""
m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return m
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
"""Weight-normalized Conv1d layer"""
from fairseq.modules import ConvTBC
m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
m.weight.data.normal_(mean=0, std=std)
m.bias.data.zero_()
return m
@register_model_architecture('fconv_self_att', 'fconv_self_att')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_layers = getattr(args, 'encoder_layers', '[(512, 3)] * 3')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 3)] * 8')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.decoder_attention = getattr(args, 'decoder_attention', 'True')
args.self_attention = getattr(args, 'self_attention', 'False')
args.encoder_attention = getattr(args, 'encoder_attention', 'False')
args.multihead_attention_nheads = getattr(args, 'multihead_attention_nheads', 1)
args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 1)
args.encoder_attention_nheads = getattr(args, 'encoder_attention_nheads', 1)
args.project_input = getattr(args, 'project_input', 'False')
args.gated_attention = getattr(args, 'gated_attention', 'False')
args.downsample = getattr(args, 'downsample', 'False')
args.pretrained_checkpoint = getattr(args, 'pretrained_checkpoint', '')
args.pretrained = getattr(args, 'pretrained', 'False')
@register_model_architecture('fconv_self_att', 'fconv_self_att_wp')
def fconv_self_att_wp(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
args.encoder_layers = getattr(args, 'encoder_layers', '[(128, 3)] * 2 + [(512,3)] * 1')
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
args.decoder_layers = getattr(args, 'decoder_layers', '[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1')
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.self_attention = getattr(args, 'self_attention', 'True')
args.multihead_self_attention_nheads = getattr(args, 'multihead_self_attention_nheads', 4)
args.project_input = getattr(args, 'project_input', 'True')
args.gated_attention = getattr(args, 'gated_attention', 'True')
args.downsample = getattr(args, 'downsample', 'True')
base_architecture(args)
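# Worked sketch (editor addition) of the fusion gating in FConvDecoder above:
# two sigmoid gates computed from the concatenated hidden states scale the
# trained and pretrained streams before the joining network combines them.
def _fusion_gating_example():
    B, T, C = 2, 4, 8
    x = torch.randn(B, T, C)                 # trained decoder state
    pre = torch.randn(B, T, C)               # hooked pretrained fc2 output
    gate1 = nn.Sequential(nn.Linear(2 * C, C), nn.Sigmoid())
    gate2 = nn.Sequential(nn.Linear(2 * C, C), nn.Sigmoid())
    y = torch.cat([x, pre], dim=-1)          # B x T x 2C
    fused = torch.cat([gate1(y) * x, gate2(y) * pre], dim=-1)
    return fused                             # B x T x 2C, input to `joining`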
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import options, utils
from . import (
FairseqEncoder, FairseqIncrementalDecoder, FairseqModel, register_model,
register_model_architecture,
)
@register_model('lstm')
class LSTMModel(FairseqModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
help='encoder hidden size')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='number of encoder layers')
parser.add_argument('--encoder-bidirectional', action='store_true',
help='make all layers of encoder bidirectional')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
help='decoder hidden size')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='number of decoder layers')
parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
help='decoder attention')
# Granular dropout settings (if not specified these default to --dropout)
parser.add_argument('--encoder-dropout-in', type=float, metavar='D',
help='dropout probability for encoder input embedding')
parser.add_argument('--encoder-dropout-out', type=float, metavar='D',
help='dropout probability for encoder output')
parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
help='dropout probability for decoder input embedding')
parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
help='dropout probability for decoder output')
parser.add_argument('--share-decoder-input-output-embed', default=False,
action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--share-all-embeddings', default=False, action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure that all args are properly defaulted (in case there are any new ones)
base_architecture(args)
def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
embed_dict = utils.parse_embedding(embed_path)
utils.print_embed_overlap(embed_dict, dictionary)
return utils.load_embedding(embed_dict, dictionary, embed_tokens)
if args.encoder_embed_path:
pretrained_encoder_embed = load_pretrained_embedding_from_file(
args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim)
else:
num_embeddings = len(task.source_dictionary)
pretrained_encoder_embed = Embedding(
num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
)
if args.share_all_embeddings:
# double check all parameters combinations are valid
if task.source_dictionary != task.target_dictionary:
raise RuntimeError('--share-all-embeddings requires a joint dictionary')
if args.decoder_embed_path and (
args.decoder_embed_path != args.encoder_embed_path):
raise RuntimeError(
                    '--share-all-embeddings not compatible with --decoder-embed-path'
)
if args.encoder_embed_dim != args.decoder_embed_dim:
raise RuntimeError(
'--share-all-embeddings requires --encoder-embed-dim to '
'match --decoder-embed-dim'
)
pretrained_decoder_embed = pretrained_encoder_embed
args.share_decoder_input_output_embed = True
else:
# separate decoder input embeddings
pretrained_decoder_embed = None
if args.decoder_embed_path:
pretrained_decoder_embed = load_pretrained_embedding_from_file(
args.decoder_embed_path,
task.target_dictionary,
args.decoder_embed_dim
)
# one last double check of parameter combinations
if args.share_decoder_input_output_embed and (
args.decoder_embed_dim != args.decoder_out_embed_dim):
raise RuntimeError(
                '--share-decoder-input-output-embed requires '
'--decoder-embed-dim to match --decoder-out-embed-dim'
)
encoder = LSTMEncoder(
dictionary=task.source_dictionary,
embed_dim=args.encoder_embed_dim,
hidden_size=args.encoder_hidden_size,
num_layers=args.encoder_layers,
dropout_in=args.encoder_dropout_in,
dropout_out=args.encoder_dropout_out,
bidirectional=args.encoder_bidirectional,
pretrained_embed=pretrained_encoder_embed,
)
decoder = LSTMDecoder(
dictionary=task.target_dictionary,
embed_dim=args.decoder_embed_dim,
hidden_size=args.decoder_hidden_size,
out_embed_dim=args.decoder_out_embed_dim,
num_layers=args.decoder_layers,
dropout_in=args.decoder_dropout_in,
dropout_out=args.decoder_dropout_out,
attention=options.eval_bool(args.decoder_attention),
encoder_embed_dim=args.encoder_embed_dim,
encoder_output_units=encoder.output_units,
pretrained_embed=pretrained_decoder_embed,
share_input_output_embed=args.share_decoder_input_output_embed,
)
return cls(encoder, decoder)
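# A minimal sketch (not part of the model code, never called) of the sharing
# enforced above, with toy sizes: when --share-all-embeddings is set, the
# encoder input, decoder input and, via --share-decoder-input-output-embed,
# the decoder output projection all reuse one weight matrix.
def _shared_embedding_sketch():
    shared = Embedding(num_embeddings=1000, embedding_dim=8, padding_idx=1)
    pretrained_encoder_embed = shared
    pretrained_decoder_embed = shared
    assert pretrained_encoder_embed.weight is pretrained_decoder_embed.weight
    # the decoder then scores with F.linear(x, shared.weight) instead of fc_out
    return shared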
class LSTMEncoder(FairseqEncoder):
"""LSTM encoder."""
def __init__(
self, dictionary, embed_dim=512, hidden_size=512, num_layers=1,
dropout_in=0.1, dropout_out=0.1, bidirectional=False,
left_pad=True, pretrained_embed=None, padding_value=0.,
):
super().__init__(dictionary)
self.num_layers = num_layers
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.bidirectional = bidirectional
self.hidden_size = hidden_size
num_embeddings = len(dictionary)
self.padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
else:
self.embed_tokens = pretrained_embed
self.lstm = LSTM(
input_size=embed_dim,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=self.dropout_out if num_layers > 1 else 0.,
bidirectional=bidirectional,
)
self.left_pad = left_pad
self.padding_value = padding_value
self.output_units = hidden_size
if bidirectional:
self.output_units *= 2
def forward(self, src_tokens, src_lengths):
if self.left_pad:
# convert left-padding to right-padding
src_tokens = utils.convert_padding_direction(
src_tokens,
self.padding_idx,
left_to_right=True,
)
bsz, seqlen = src_tokens.size()
# embed tokens
x = self.embed_tokens(src_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# pack embedded source tokens into a PackedSequence
packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist())
# apply LSTM
if self.bidirectional:
state_size = 2 * self.num_layers, bsz, self.hidden_size
else:
state_size = self.num_layers, bsz, self.hidden_size
h0 = x.data.new(*state_size).zero_()
c0 = x.data.new(*state_size).zero_()
packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))
# unpack outputs and apply dropout
x, _ = nn.utils.rnn.pad_packed_sequence(packed_outs, padding_value=self.padding_value)
x = F.dropout(x, p=self.dropout_out, training=self.training)
assert list(x.size()) == [seqlen, bsz, self.output_units]
if self.bidirectional:
def combine_bidir(outs):
return torch.cat([
torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(1, bsz, self.output_units)
for i in range(self.num_layers)
], dim=0)
final_hiddens = combine_bidir(final_hiddens)
final_cells = combine_bidir(final_cells)
encoder_padding_mask = src_tokens.eq(self.padding_idx).t()
return {
'encoder_out': (x, final_hiddens, final_cells),
'encoder_padding_mask': encoder_padding_mask if encoder_padding_mask.any() else None
}
def reorder_encoder_out(self, encoder_out, new_order):
encoder_out['encoder_out'] = tuple(
eo.index_select(1, new_order)
for eo in encoder_out['encoder_out']
)
if encoder_out['encoder_padding_mask'] is not None:
encoder_out['encoder_padding_mask'] = \
encoder_out['encoder_padding_mask'].index_select(1, new_order)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
return int(1e5) # an arbitrary large number
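# A standalone sketch (illustration only, never called) of the
# pack -> LSTM -> pad pipeline used in LSTMEncoder.forward above, with toy
# sizes (T=5, B=2). Batches must be sorted by decreasing length, as fairseq
# provides them.
def _pack_pad_sketch():
    T, B, C, H = 5, 2, 4, 3
    x = torch.randn(T, B, C)  # T x B x C, right-padded
    lengths = [5, 3]
    packed = nn.utils.rnn.pack_padded_sequence(x, lengths)
    lstm = nn.LSTM(C, H)
    packed_outs, (h_n, c_n) = lstm(packed)
    out, _ = nn.utils.rnn.pad_packed_sequence(packed_outs, padding_value=0.)
    assert list(out.size()) == [T, B, H]  # steps past each length are zeros
    return out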
class AttentionLayer(nn.Module):
def __init__(self, input_embed_dim, output_embed_dim):
super().__init__()
self.input_proj = Linear(input_embed_dim, output_embed_dim, bias=False)
self.output_proj = Linear(input_embed_dim + output_embed_dim, output_embed_dim, bias=False)
def forward(self, input, source_hids, encoder_padding_mask):
# input: bsz x input_embed_dim
# source_hids: srclen x bsz x output_embed_dim
# x: bsz x output_embed_dim
x = self.input_proj(input)
# compute attention
attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)
# don't attend over padding
if encoder_padding_mask is not None:
attn_scores = attn_scores.float().masked_fill_(
encoder_padding_mask,
float('-inf')
).type_as(attn_scores) # FP16 support: cast to float and back
attn_scores = F.softmax(attn_scores, dim=0) # srclen x bsz
# sum weighted sources
x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)
x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1)))
return x, attn_scores
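# A shape walk-through of the attention above with toy sizes (srclen=3, bsz=2,
# input dim 4, output dim 5). Illustration only, never called.
def _attention_shape_check():
    attn = AttentionLayer(input_embed_dim=4, output_embed_dim=5)
    query = torch.randn(2, 4)             # bsz x input_embed_dim
    source_hids = torch.randn(3, 2, 5)    # srclen x bsz x output_embed_dim
    out, scores = attn(query, source_hids, encoder_padding_mask=None)
    assert out.size() == (2, 5) and scores.size() == (3, 2)
    # attention weights normalize over the source length dimension
    assert torch.allclose(scores.sum(dim=0), torch.ones(2))
    return out, scores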
class LSTMDecoder(FairseqIncrementalDecoder):
"""LSTM decoder."""
def __init__(
self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
encoder_embed_dim=512, encoder_output_units=512, pretrained_embed=None,
share_input_output_embed=False,
):
super().__init__(dictionary)
self.dropout_in = dropout_in
self.dropout_out = dropout_out
self.hidden_size = hidden_size
self.share_input_output_embed = share_input_output_embed
self.need_attn = True
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
if pretrained_embed is None:
self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
else:
self.embed_tokens = pretrained_embed
self.encoder_output_units = encoder_output_units
assert encoder_output_units == hidden_size, \
'encoder_output_units ({}) != hidden_size ({})'.format(encoder_output_units, hidden_size)
# TODO: add another Linear layer to project if the dimensions are not equal
self.layers = nn.ModuleList([
LSTMCell(
input_size=encoder_output_units + embed_dim if layer == 0 else hidden_size,
hidden_size=hidden_size,
)
for layer in range(num_layers)
])
self.attention = AttentionLayer(encoder_output_units, hidden_size) if attention else None
if hidden_size != out_embed_dim:
self.additional_fc = Linear(hidden_size, out_embed_dim)
if not self.share_input_output_embed:
self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
def forward(self, prev_output_tokens, encoder_out_dict, incremental_state=None):
encoder_out = encoder_out_dict['encoder_out']
encoder_padding_mask = encoder_out_dict['encoder_padding_mask']
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
bsz, seqlen = prev_output_tokens.size()
# get outputs from encoder
encoder_outs, _, _ = encoder_out[:3]
srclen = encoder_outs.size(0)
# embed tokens
x = self.embed_tokens(prev_output_tokens)
x = F.dropout(x, p=self.dropout_in, training=self.training)
# B x T x C -> T x B x C
x = x.transpose(0, 1)
# initialize previous states (or get from cache during incremental generation)
cached_state = utils.get_incremental_state(self, incremental_state, 'cached_state')
if cached_state is not None:
prev_hiddens, prev_cells, input_feed = cached_state
else:
_, encoder_hiddens, encoder_cells = encoder_out[:3]
num_layers = len(self.layers)
prev_hiddens = [encoder_hiddens[i] for i in range(num_layers)]
prev_cells = [encoder_cells[i] for i in range(num_layers)]
input_feed = x.data.new(bsz, self.encoder_output_units).zero_()
attn_scores = x.data.new(srclen, seqlen, bsz).zero_()
outs = []
for j in range(seqlen):
# input feeding: concatenate context vector from previous time step
input = torch.cat((x[j, :, :], input_feed), dim=1)
for i, rnn in enumerate(self.layers):
# recurrent cell
hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i]))
# hidden state becomes the input to the next layer
input = F.dropout(hidden, p=self.dropout_out, training=self.training)
# save state for next time step
prev_hiddens[i] = hidden
prev_cells[i] = cell
# apply attention using the last layer's hidden state
if self.attention is not None:
out, attn_scores[:, j, :] = self.attention(hidden, encoder_outs, encoder_padding_mask)
else:
out = hidden
out = F.dropout(out, p=self.dropout_out, training=self.training)
# input feeding
input_feed = out
# save final output
outs.append(out)
# cache previous states (no-op except during incremental generation)
utils.set_incremental_state(
self, incremental_state, 'cached_state', (prev_hiddens, prev_cells, input_feed))
# collect outputs across time steps
x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size)
# T x B x C -> B x T x C
x = x.transpose(1, 0)
# srclen x tgtlen x bsz -> bsz x tgtlen x srclen
if not self.training and self.need_attn:
attn_scores = attn_scores.transpose(0, 2)
else:
attn_scores = None
# project back to size of vocabulary
if hasattr(self, 'additional_fc'):
x = self.additional_fc(x)
x = F.dropout(x, p=self.dropout_out, training=self.training)
if self.share_input_output_embed:
x = F.linear(x, self.embed_tokens.weight)
else:
x = self.fc_out(x)
return x, attn_scores
def reorder_incremental_state(self, incremental_state, new_order):
super().reorder_incremental_state(incremental_state, new_order)
cached_state = utils.get_incremental_state(self, incremental_state, 'cached_state')
if cached_state is None:
return
def reorder_state(state):
if isinstance(state, list):
return [reorder_state(state_i) for state_i in state]
return state.index_select(0, new_order)
new_state = tuple(map(reorder_state, cached_state))
utils.set_incremental_state(self, incremental_state, 'cached_state', new_state)
def max_positions(self):
"""Maximum output length supported by the decoder."""
return int(1e5) # an arbitrary large number
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
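# A toy restatement (illustration only, never called) of the input-feeding
# recurrence implemented in LSTMDecoder.forward above: the context vector from
# step j-1 is concatenated to the token embedding at step j before the first
# cell. Uses a plain nn.LSTMCell and random stand-in embeddings.
def _input_feeding_sketch():
    E, H, B, T = 4, 4, 2, 3
    cell = nn.LSTMCell(E + H, H)          # input = embedding ++ previous context
    h = torch.zeros(B, H)
    c = torch.zeros(B, H)
    input_feed = torch.zeros(B, H)        # context from the previous time step
    for j in range(T):
        emb = torch.randn(B, E)           # stand-in for the embedded token j
        h, c = cell(torch.cat((emb, input_feed), dim=1), (h, c))
        input_feed = h                    # without attention, feed hidden back
    return h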
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.uniform_(m.weight, -0.1, 0.1)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def LSTM(input_size, hidden_size, **kwargs):
m = nn.LSTM(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if 'weight' in name or 'bias' in name:
param.data.uniform_(-0.1, 0.1)
return m
def LSTMCell(input_size, hidden_size, **kwargs):
m = nn.LSTMCell(input_size, hidden_size, **kwargs)
for name, param in m.named_parameters():
if 'weight' in name or 'bias' in name:
param.data.uniform_(-0.1, 0.1)
return m
def Linear(in_features, out_features, bias=True, dropout=0):
"""Linear layer (input: N x T x C)"""
m = nn.Linear(in_features, out_features, bias=bias)
m.weight.data.uniform_(-0.1, 0.1)
if bias:
m.bias.data.uniform_(-0.1, 0.1)
return m
@register_model_architecture('lstm', 'lstm')
def base_architecture(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
args.encoder_hidden_size = getattr(args, 'encoder_hidden_size', args.encoder_embed_dim)
args.encoder_layers = getattr(args, 'encoder_layers', 1)
args.encoder_bidirectional = getattr(args, 'encoder_bidirectional', False)
args.encoder_dropout_in = getattr(args, 'encoder_dropout_in', args.dropout)
args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', args.dropout)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
args.decoder_hidden_size = getattr(args, 'decoder_hidden_size', args.decoder_embed_dim)
args.decoder_layers = getattr(args, 'decoder_layers', 1)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 512)
args.decoder_attention = getattr(args, 'decoder_attention', '1')
args.decoder_dropout_in = getattr(args, 'decoder_dropout_in', args.dropout)
args.decoder_dropout_out = getattr(args, 'decoder_dropout_out', args.dropout)
args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)
@register_model_architecture('lstm', 'lstm_wiseman_iwslt_de_en')
def lstm_wiseman_iwslt_de_en(args):
args.dropout = getattr(args, 'dropout', 0.1)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256)
args.encoder_dropout_in = getattr(args, 'encoder_dropout_in', 0)
args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', 0)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 256)
args.decoder_dropout_in = getattr(args, 'decoder_dropout_in', 0)
args.decoder_dropout_out = getattr(args, 'decoder_dropout_out', args.dropout)
base_architecture(args)
@register_model_architecture('lstm', 'lstm_luong_wmt_en_de')
def lstm_luong_wmt_en_de(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1000)
args.encoder_layers = getattr(args, 'encoder_layers', 4)
args.encoder_dropout_out = getattr(args, 'encoder_dropout_out', 0)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1000)
args.decoder_layers = getattr(args, 'decoder_layers', 4)
args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 1000)
args.decoder_dropout_out = getattr(args, 'decoder_dropout_out', 0)
base_architecture(args)
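# A hypothetical example of extending the registry above with a custom preset;
# the name 'lstm_tiny_example' and its sizes are made up for illustration.
@register_model_architecture('lstm', 'lstm_tiny_example')
def lstm_tiny_example(args):
    args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 128)
    args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 128)
    args.decoder_out_embed_dim = getattr(args, 'decoder_out_embed_dim', 128)
    base_architecture(args)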
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.set_printoptions(threshold=500000000, linewidth=1024)
from fairseq import options
from fairseq import utils
from apex.contrib.multihead_attn import SelfMultiheadAttn
from apex.contrib.multihead_attn import EncdecMultiheadAttn
from fairseq.modules import (
AdaptiveSoftmax, LearnedPositionalEmbedding, SinusoidalPositionalEmbedding,
)
from . import (
FairseqIncrementalDecoder, FairseqDecoder, FairseqEncoder, FairseqLanguageModel, FairseqModel, register_model,
register_model_architecture,
)
from apex.normalization.fused_layer_norm import FusedLayerNorm
@torch.jit.script
def jit_dropout_add(x, residual, prob, is_training):
# type: (Tensor, Tensor, float, bool) -> Tensor
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
#out = F.dropout(x, p=prob, training=is_training)
out = F.dropout(x, p=prob, training=True)
out = residual + out
return out
# TODO: Remove this workaround (added to enable ReLU+dropout fusion)
# once the underlying bug is fixed
if hasattr(torch._C, '_jit_set_profiling_executor'):
torch._C._jit_set_profiling_executor(False)
if hasattr(torch._C, '_jit_set_profiling_mode'):
torch._C._jit_set_profiling_mode(False)
@torch.jit.script
def jit_relu_dropout(x, prob, is_training):
# type: (Tensor, float, bool) -> Tensor
out = F.threshold(x, 0., 0.)
out = F.dropout(out, p=prob, training=True)
return out
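# A sanity-check sketch for the two scripted helpers above, relying only on
# this file's imports. With p=0 both reduce to their unfused equivalents, so
# the asserts hold regardless of the dropout RNG. Never called; illustration
# only.
def _check_jit_helpers():
    x = torch.randn(8, 16)
    residual = torch.randn(8, 16)
    out = jit_dropout_add(x, residual, 0.0, True)   # p=0 -> plain residual add
    assert torch.allclose(out, x + residual)
    out = jit_relu_dropout(x, 0.0, True)            # p=0 -> plain ReLU
    assert torch.allclose(out, F.threshold(x, 0., 0.))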
@register_model('transformer')
class TransformerModel(FairseqModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
help='path to pre-trained encoder embedding')
parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
help='encoder embedding dimension for FFN')
parser.add_argument('--encoder-layers', type=int, metavar='N',
help='num encoder layers')
parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
help='num encoder attention heads')
parser.add_argument('--encoder-normalize-before', action='store_true',
help='apply layernorm before each encoder block')
parser.add_argument('--encoder-learned-pos', action='store_true',
help='use learned positional embeddings in the encoder')
parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
help='path to pre-trained decoder embedding')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-learned-pos', action='store_true',
help='use learned positional embeddings in the decoder')
parser.add_argument('--decoder-normalize-before', action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--share-decoder-input-output-embed', action='store_true',
help='share decoder input and output embeddings')
parser.add_argument('--share-all-embeddings', action='store_true',
help='share encoder, decoder and output embeddings'
' (requires shared dictionary and embed dim)')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = 256
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = 256
src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
def build_embedding(dictionary, embed_dim, path=None):
num_embeddings = len(dictionary)
padding_idx = dictionary.pad()
emb = Embedding(num_embeddings, embed_dim, padding_idx)
# if provided, load from preloaded dictionaries
if path:
embed_dict = utils.parse_embedding(path)
utils.load_embedding(embed_dict, dictionary, emb)
return emb
if args.share_all_embeddings:
if src_dict != tgt_dict:
raise RuntimeError('--share-all-embeddings requires a joined dictionary')
if args.encoder_embed_dim != args.decoder_embed_dim:
raise RuntimeError(
'--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim')
if args.decoder_embed_path and (
args.decoder_embed_path != args.encoder_embed_path):
raise RuntimeError('--share-all-embeddings not compatible with --decoder-embed-path')
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = encoder_embed_tokens
args.share_decoder_input_output_embed = True
else:
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = build_embedding(
tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
)
encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens)
decoder = TransformerDecoder(args, tgt_dict, decoder_embed_tokens)
return TransformerModel(encoder, decoder)
@register_model('transformer_lm')
class TransformerLanguageModel(FairseqLanguageModel):
def __init__(self, decoder):
super().__init__(decoder)
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
help='dropout probability for attention weights')
parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
help='dropout probability after ReLU in FFN')
parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
help='decoder embedding dimension for FFN')
parser.add_argument('--decoder-layers', type=int, metavar='N',
help='num decoder layers')
parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
help='num decoder attention heads')
parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
help='apply layernorm before each decoder block')
parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
help='comma separated list of adaptive softmax cutoff points. '
'Must be used with adaptive_loss criterion')
parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
help='if set, disables positional embeddings (outside self attention)')
parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true',
help='share decoder input and output embeddings')
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_lm_architecture(args)
if not hasattr(args, 'max_source_positions'):
args.max_source_positions = args.tokens_per_sample
if not hasattr(args, 'max_target_positions'):
args.max_target_positions = args.tokens_per_sample
embed_tokens = Embedding(len(task.dictionary), args.decoder_embed_dim, task.dictionary.pad())
decoder = TransformerDecoder(args, task.dictionary, embed_tokens, no_encoder_attn=True)
return TransformerLanguageModel(decoder)
class TransformerEncoder(FairseqEncoder):
"""Transformer encoder."""
def __init__(self, args, dictionary, embed_tokens, left_pad=True):
super().__init__(dictionary)
self.dropout = args.dropout
embed_dim = embed_tokens.embedding_dim
self.padding_idx = embed_tokens.padding_idx
self.max_source_positions = args.max_source_positions
self.embed_tokens = embed_tokens
self.embed_scale = math.sqrt(embed_dim)
self.embed_positions = PositionalEmbedding(
args.max_source_positions, embed_dim, self.padding_idx,
left_pad=left_pad,
learned=args.encoder_learned_pos,
) if not args.no_token_positional_embeddings else None
self.layers = nn.ModuleList([])
self.layers.extend([
TransformerEncoderLayer(args)
for i in range(args.encoder_layers)
])
self.normalize = args.encoder_normalize_before
if self.normalize:
self.layer_norm = FusedLayerNorm(embed_dim)
def forward(self, src_tokens, src_lengths):
# embed tokens and positions
x = self.embed_scale * self.embed_tokens(src_tokens)
if self.embed_positions is not None:
x += self.embed_positions(src_tokens)
x = F.dropout(x, p=self.dropout, training=self.training)
# B x T x C -> T x B x C
# The tensor needs to be copied after the transpose because
# fused dropout cannot handle strided data
x = x.transpose(0, 1)
# TODO: KJS Remove this contiguous() call once the C++ multihead attention can accommodate strided inputs
# Is it a bug that .contiguous() doesn't normalize the strides when the tensor is already contiguous?
if x.size(1) == 1 :
if x.is_contiguous() :
x = x.view(x.size(0), x.size(1), x.size(2))
else :
x = x.contiguous()
else :
x = x.contiguous()
# compute padding mask
encoder_padding_mask = src_tokens.eq(self.padding_idx).to(torch.uint8)
# encoder layers
for layer in self.layers:
x = layer(x, encoder_padding_mask)
if self.normalize:
x = self.layer_norm(x)
return {
'encoder_out': x, # T x B x C
'encoder_padding_mask': encoder_padding_mask, # B x T
}
def reorder_encoder_out(self, encoder_out, new_order):
if encoder_out['encoder_out'] is not None:
encoder_out['encoder_out'] = \
encoder_out['encoder_out'].index_select(1, new_order)
if encoder_out['encoder_padding_mask'] is not None:
encoder_out['encoder_padding_mask'] = \
encoder_out['encoder_padding_mask'].index_select(0, new_order)
return encoder_out
def max_positions(self):
"""Maximum input length supported by the encoder."""
if self.embed_positions is None:
return self.max_source_positions
return min(self.max_source_positions, self.embed_positions.max_positions())
def upgrade_state_dict(self, state_dict):
if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
if 'encoder.embed_positions.weights' in state_dict:
del state_dict['encoder.embed_positions.weights']
state_dict['encoder.embed_positions._float_tensor'] = torch.FloatTensor(1)
return state_dict
#class TransformerDecoder(FairseqIncrementalDecoder):
class TransformerDecoder(FairseqDecoder):
"""Transformer decoder."""
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False):
super().__init__(dictionary)
self.dropout = args.dropout
self.share_input_output_embed = args.share_decoder_input_output_embed
embed_dim = embed_tokens.embedding_dim
padding_idx = embed_tokens.padding_idx
self.max_target_positions = args.max_target_positions
self.embed_tokens = embed_tokens
self.embed_scale = math.sqrt(embed_dim)
self.embed_positions = PositionalEmbedding(
args.max_target_positions, embed_dim, padding_idx,
left_pad=left_pad,
learned=args.decoder_learned_pos,
) if not args.no_token_positional_embeddings else None
self.layers = nn.ModuleList([])
self.layers.extend([
TransformerDecoderLayer(args, no_encoder_attn)
for _ in range(args.decoder_layers)
])
self.adaptive_softmax = None
if args.adaptive_softmax_cutoff is not None:
self.adaptive_softmax = AdaptiveSoftmax(
len(dictionary), args.decoder_embed_dim,
options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
dropout=args.dropout
)
elif not self.share_input_output_embed:
self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), embed_dim))
nn.init.normal_(self.embed_out, mean=0, std=embed_dim ** -0.5)
self.normalize = args.decoder_normalize_before
if self.normalize:
self.layer_norm = FusedLayerNorm(embed_dim)
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
# embed positions
positions = self.embed_positions(
prev_output_tokens,
incremental_state=incremental_state,
) if self.embed_positions is not None else None
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
if positions is not None:
positions = positions[:, -1:]
# embed tokens and positions
x = self.embed_scale * self.embed_tokens(prev_output_tokens)
if positions is not None:
x += positions
x = F.dropout(x, p=self.dropout, training=self.training)
# B x T x C -> T x B x C
# The tensor needs to be copied after the transpose because
# fused dropout cannot handle strided data
x = x.transpose(0, 1)
# TODO: KJS Remove this contiguous() call once the C++ multihead attention can accommodate strided inputs
# Is it a bug that .contiguous() doesn't normalize the strides when the tensor is already contiguous?
if x.size(1) == 1 :
if x.is_contiguous() :
x = x.view(x.size(0), x.size(1), x.size(2))
else :
x = x.contiguous()
else :
x = x.contiguous()
attn = None
# decoder layers
for layer in self.layers:
x, attn = layer(
x,
encoder_out['encoder_out'] if encoder_out is not None else None,
torch.triu(torch.ones(x.size(0), x.size(0), device=x.device, dtype=torch.uint8), 1),
encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
incremental_state,
)
if self.normalize:
x = self.layer_norm(x)
# T x B x C -> B x T x C
x = x.transpose(0, 1)
if self.adaptive_softmax is None:
# project back to size of vocabulary
if self.share_input_output_embed:
x = F.linear(x, self.embed_tokens.weight)
else:
x = F.linear(x, self.embed_out)
return x, attn
def max_positions(self):
"""Maximum output length supported by the decoder."""
if self.embed_positions is None:
return self.max_target_positions
return min(self.max_target_positions, self.embed_positions.max_positions())
def upgrade_state_dict(self, state_dict):
if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
if 'decoder.embed_positions.weights' in state_dict:
del state_dict['decoder.embed_positions.weights']
state_dict['decoder.embed_positions._float_tensor'] = torch.FloatTensor(1)
for i in range(len(self.layers)):
# update layer norms
layer_norm_map = {
'0': 'self_attn_layer_norm',
'1': 'encoder_attn_layer_norm',
'2': 'final_layer_norm'
}
for old, new in layer_norm_map.items():
for m in ('weight', 'bias'):
k = 'decoder.layers.{}.layer_norms.{}.{}'.format(i, old, m)
if k in state_dict:
state_dict['decoder.layers.{}.{}.{}'.format(i, new, m)] = state_dict[k]
del state_dict[k]
return state_dict
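# Illustration (never called) of the causal self-attention mask that
# TransformerDecoder.forward builds inline above with torch.triu(..., 1);
# ones mark the future positions that self-attention must not see.
def _causal_mask_example(T=4):
    mask = torch.triu(torch.ones(T, T, dtype=torch.uint8), 1)
    # tensor([[0, 1, 1, 1],
    #         [0, 0, 1, 1],
    #         [0, 0, 0, 1],
    #         [0, 0, 0, 0]], dtype=torch.uint8)
    return mask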
class TransformerEncoderLayer(nn.Module):
"""Encoder layer block.
In the original paper each operation (multi-head attention or FFN) is
postprocessed with: dropout -> add residual -> layernorm.
In the tensor2tensor code they suggest that learning is more robust when
preprocessing each layer with layernorm and postprocessing with:
dropout -> add residual.
We default to the approach in the paper, but the tensor2tensor approach can
be enabled by setting `normalize_before=True`.
"""
def __init__(self, args):
super().__init__()
self.embed_dim = args.encoder_embed_dim
self.multihead_attn_impl = args.multihead_attn_impl
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=True,
impl='fast',
)
elif args.multihead_attn_impl == 'fast' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='fast',
)
else :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='default',
)
# in_proj_weight has shape [3 * hidden, hidden] but it should be
# initialized like a [hidden, hidden] matrix.
# sqrt(6 / (hidden + hidden)) / sqrt(6 / (3 * hidden + hidden)) = sqrt(2)
# therefore xavier_uniform gain should be set to sqrt(2).
torch.nn.init.xavier_uniform_(self.self_attn.in_proj_weight, gain=math.sqrt(2))
self.dropout = args.dropout
self.relu_dropout = args.relu_dropout
self.normalize_before = args.encoder_normalize_before
self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.layer_norms = nn.ModuleList([FusedLayerNorm(self.embed_dim) for i in range(1)])
else :
self.layer_norms = nn.ModuleList([FusedLayerNorm(self.embed_dim) for i in range(2)])
def forward(self, x, encoder_padding_mask):
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
else :
residual = x
x = self.maybe_layer_norm(0, x, before=True)
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(0, x, after=True)
residual = x
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x = self.maybe_layer_norm(0, x, before=True)
else :
x = self.maybe_layer_norm(1, x, before=True)
# Fuse Bias to GEMMs by making Tensors 2D
sent_len, sents, hid_dim = x.size()
x = x.view(sent_len * sents, hid_dim)
if self.training :
x = jit_relu_dropout(self.fc1(x), self.relu_dropout, self.training)
else :
x = self.fc1(x)
x = F.threshold(x, 0., 0.)
x = F.dropout(x, p=self.relu_dropout, training=False)
#x = F.threshold(self.fc1(x),0,0)
#x = F.dropout(x, p=self.relu_dropout, training=self.training)
x = self.fc2(x)
x = x.view(sent_len, sents, hid_dim)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x = self.maybe_layer_norm(0, x, after=True)
else :
x = self.maybe_layer_norm(1, x, after=True)
return x
def maybe_layer_norm(self, i, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return self.layer_norms[i](x)
else:
return x
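# A framework-free sketch of the two residual orderings that the
# TransformerEncoderLayer docstring contrasts, written with plain callables
# instead of this layer's fused kernels. Illustration only, never called.
def _post_norm_block(x, sublayer, norm, p, training):
    # paper ordering: sublayer -> dropout -> residual add -> layernorm
    return norm(x + F.dropout(sublayer(x), p=p, training=training))
def _pre_norm_block(x, sublayer, norm, p, training):
    # tensor2tensor ordering: layernorm -> sublayer -> dropout -> residual add
    return x + F.dropout(sublayer(norm(x)), p=p, training=training)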
class TransformerDecoderLayer(nn.Module):
"""Decoder layer block."""
def __init__(self, args, no_encoder_attn=False):
super().__init__()
self.embed_dim = args.decoder_embed_dim
self.multihead_attn_impl = args.multihead_attn_impl
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=True,
impl='fast',
)
elif args.multihead_attn_impl == 'fast' :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='fast',
)
else :
self.self_attn = SelfMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='default',
)
# in_proj_weight has shape [3 * hidden, hidden] but it should be
# initialized like a [hidden, hidden] matrix.
# sqrt(6 / (hidden + hidden)) / sqrt(6 / (3 * hidden + hidden)) = sqrt(2)
# therefore xavier_uniform gain should be set to sqrt(2).
torch.nn.init.xavier_uniform_(self.self_attn.in_proj_weight, gain=math.sqrt(2))
self.dropout = args.dropout
self.relu_dropout = args.relu_dropout
self.normalize_before = args.decoder_normalize_before
if not (args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd') :
self.self_attn_layer_norm = FusedLayerNorm(self.embed_dim)
if no_encoder_attn:
self.encoder_attn = None
if not (args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd') :
self.encoder_attn_layer_norm = None
else:
if args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
self.encoder_attn = EncdecMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=True,
impl='fast',
)
elif args.multihead_attn_impl == 'fast' :
self.encoder_attn = EncdecMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='fast',
)
else :
self.encoder_attn = EncdecMultiheadAttn(
self.embed_dim, args.decoder_attention_heads,
dropout=args.attention_dropout,
bias=False,
include_norm_add=False,
impl='default',
)
# in_proj_weight_kv has shape [2 * hidden, hidden] but it should be
# initialized like a [hidden, hidden] matrix.
# sqrt(6 / (hidden + hidden)) / sqrt(6 / (2 * hidden + hidden)) = sqrt(1.5)
# therefore xavier_uniform gain should be set to sqrt(1.5).
torch.nn.init.xavier_uniform_(self.encoder_attn.in_proj_weight_kv, gain=math.sqrt(1.5))
if not (args.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd') :
self.encoder_attn_layer_norm = FusedLayerNorm(self.embed_dim)
self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
self.final_layer_norm = FusedLayerNorm(self.embed_dim)
self.need_attn = True
def forward(self, x, encoder_out, time_padding_mask, encoder_padding_mask, incremental_state):
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=None,
need_weights=False,
attn_mask=time_padding_mask,
is_training=self.training,
)
else :
residual = x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
x, _ = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=None,
need_weights=False,
attn_mask=time_padding_mask,
is_training=self.training,
)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
attn = None
if self.encoder_attn is not None:
if self.multihead_attn_impl == 'fast_with_lyrnrm_and_dropoutadd' :
x, attn = self.encoder_attn(
query=x,
key=encoder_out,
value=encoder_out,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
else :
residual = x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
x, attn = self.encoder_attn(
query=x,
key=encoder_out,
value=encoder_out,
key_padding_mask=encoder_padding_mask,
need_weights=False,
attn_mask=None,
is_training=self.training,
)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)
residual = x
x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
# Fuse Bias to GEMMs by making Tensors 2D
sent_len, sents, hid_dim = x.size()
x = x.view(sent_len * sents, hid_dim)
if self.training :
x = jit_relu_dropout(self.fc1(x), self.relu_dropout, self.training)
else :
x = self.fc1(x)
x = F.threshold(x, 0., 0.)
x = F.dropout(x, p=self.relu_dropout, training=False)
#x = F.threshold(self.fc1(x),0,0)
#x = F.dropout(x, p=self.relu_dropout, training=self.training)
x = self.fc2(x)
x = x.view(sent_len, sents, hid_dim)
# TODO: Bug in PyTorch 19.07 prevents propagation of the "training" argument
if self.training :
x = jit_dropout_add(x, residual, self.dropout, self.training)
else :
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
return x, attn
def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return layer_norm(x)
else:
return x
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
nn.init.constant_(m.weight[padding_idx], 0)
return m
def LayerNorm(embedding_dim):
m = FusedLayerNorm(embedding_dim)
return m
def Linear(in_features, out_features, bias=True):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)
nn.init.constant_(m.bias, 0.)
return m
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad, learned=False):
if learned:
m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim, padding_idx, left_pad)
nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
nn.init.constant_(m.weight[padding_idx], 0)
else:
m = SinusoidalPositionalEmbedding(embedding_dim, padding_idx, left_pad, num_embeddings + padding_idx + 1)
return m
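# Note on the sizing above: the "num_embeddings + padding_idx + 1" reserves
# rows so that real positions can be numbered starting at padding_idx + 1,
# while index padding_idx stays a dedicated (zeroed) pad slot. With
# padding_idx=1, token positions map to embedding rows 2, 3, 4, ... and pads
# map to row 1.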
@register_model_architecture('transformer_lm', 'transformer_lm')
def base_lm_architecture(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
# The model training is not stable without this
args.decoder_normalize_before = True
@register_model_architecture('transformer_lm', 'transformer_lm_big')
def transformer_lm_big(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_wiki103')
def transformer_lm_wiki103(args):
args.dropout = getattr(args, 'dropout', 0.3)
base_lm_architecture(args)
@register_model_architecture('transformer_lm', 'transformer_lm_gbw')
def transformer_lm_gbw(args):
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.dropout = getattr(args, 'dropout', 0.1)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
transformer_lm_big(args)
@register_model_architecture('transformer', 'transformer')
def base_architecture(args):
args.encoder_embed_path = getattr(args, 'encoder_embed_path', None)
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 2048)
args.encoder_layers = getattr(args, 'encoder_layers', 6)
args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 8)
args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', False)
args.decoder_embed_path = getattr(args, 'decoder_embed_path', None)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8)
args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False)
args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False)
args.attention_dropout = getattr(args, 'attention_dropout', 0.)
args.relu_dropout = getattr(args, 'relu_dropout', 0.)
args.dropout = getattr(args, 'dropout', 0.1)
args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None)
args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False)
args.share_all_embeddings = getattr(args, 'share_all_embeddings', False)
args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False)
@register_model_architecture('transformer', 'transformer_iwslt_de_en')
def transformer_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512)
args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 1024)
args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 4)
args.encoder_layers = getattr(args, 'encoder_layers', 6)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 1024)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 4)
args.decoder_layers = getattr(args, 'decoder_layers', 6)
base_architecture(args)
@register_model_architecture('transformer', 'transformer_wmt_en_de')
def transformer_wmt_en_de(args):
base_architecture(args)
# parameters used in the "Attention Is All You Need" paper (Vaswani, et al, 2017)
@register_model_architecture('transformer', 'transformer_vaswani_wmt_en_de_big')
def transformer_vaswani_wmt_en_de_big(args):
args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024)
args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4096)
args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16)
args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024)
args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096)
args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16)
args.dropout = getattr(args, 'dropout', 0.3)
base_architecture(args)
@register_model_architecture('transformer', 'transformer_vaswani_wmt_en_fr_big')
def transformer_vaswani_wmt_en_fr_big(args):
args.dropout = getattr(args, 'dropout', 0.1)
transformer_vaswani_wmt_en_de_big(args)
@register_model_architecture('transformer', 'transformer_wmt_en_de_big')
def transformer_wmt_en_de_big(args):
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
transformer_vaswani_wmt_en_de_big(args)
# default parameters used in tensor2tensor implementation
@register_model_architecture('transformer', 'transformer_wmt_en_de_big_t2t')
def transformer_wmt_en_de_big_t2t(args):
args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', True)
args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', True)
args.attention_dropout = getattr(args, 'attention_dropout', 0.1)
args.relu_dropout = getattr(args, 'relu_dropout', 0.1)
transformer_vaswani_wmt_en_de_big(args)
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from .adaptive_softmax import AdaptiveSoftmax
from .beamable_mm import BeamableMM
from .conv_tbc import ConvTBC
from .downsampled_multihead_attention import DownsampledMultiHeadAttention
from .grad_multiply import GradMultiply
from .learned_positional_embedding import LearnedPositionalEmbedding
from .linearized_convolution import LinearizedConvolution
from .scalar_bias import ScalarBias
from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
__all__ = [
'AdaptiveSoftmax',
'BeamableMM',
'ConvTBC',
'DownsampledMultiHeadAttention',
'GradMultiply',
'LearnedPositionalEmbedding',
'LinearizedConvolution',
'ScalarBias',
'SinusoidalPositionalEmbedding',
]