Commit 72f9364c authored by Dmytro Okhonko's avatar Dmytro Okhonko Committed by Facebook Github Bot

Asr initial push (#810)

Summary:
Initial code for speech recognition task.
Right now only one ASR model added - https://arxiv.org/abs/1904.11660

Unit testing:
python -m unittest discover tests

Also ran model training with this code and obtained
5.0 test_clean | 13.4 test_other
on librispeech with pytorch/audio features
Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/810

Reviewed By: cpuhrsch

Differential Revision: D16706659

Pulled By: okhonko

fbshipit-source-id: 89a5f9883e50bc0e548234287aa0ea73f7402514
parent 9a1038f6
# Speech Recognition
`examples/speech_recognition` implements the ASR task in fairseq, along with the features, datasets, models and loss functions needed to train and run inference with the model described in [Transformers with convolutional context for ASR (Abdelrahman Mohamed et al., 2019)](https://arxiv.org/abs/1904.11660).
## Additional dependencies
On top of the main fairseq dependencies, there are a couple of additional requirements.
1) Please follow the instructions to install [torchaudio](https://github.com/pytorch/audio). This is required to compute the audio fbank features; a short sketch of the feature computation is shown after this list.
2) [Sclite](http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/sclite.htm#sclite_name_0) is used to measure WER. Sclite can be built from the sctk package sources available [here](http://www.openslr.org/4/). Training and inference do not require Sclite.
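
For reference, the fbank features consumed by this example are computed with torchaudio's Kaldi-compatible frontend. The sketch below mirrors what `AsrDataset.__getitem__` does internally (the wav path is a placeholder):
```
import torchaudio
import torchaudio.compliance.kaldi as kaldi

# load a mono wav file (placeholder path) and compute 80-dim fbank features,
# matching the AsrDataset defaults (25 ms windows, 10 ms shift)
sound, sample_rate = torchaudio.load_wav("/path/to/utterance.wav")
features = kaldi.fbank(
    sound, num_mel_bins=80, frame_length=25.0, frame_shift=10.0
)  # tensor of shape (num_frames, 80)
```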
## Preparing librispeech data
```
./examples/speech_recognition/datasets/prepare-librispeech.sh $DIR_TO_SAVE_RAW_DATA $DIR_FOR_PREPROCESSED_DATA
```
## Training librispeech data
```
python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 80 --task speech_recognition --arch vggtransformer_2 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0 --max-tokens 5000 --log-format json --log-interval 1 --criterion cross_entropy_acc --user-dir examples/speech_recognition/
```
## Inference for librispeech
`$SET` can be `test_clean` or `test_other`.
Any checkpoint in `$MODEL_PATH` can be selected; this example uses `checkpoint_last.pt`.
```
python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --max-tokens 25000 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --beam 20 --results-path $RES_DIR --batch-size 40 --gen-subset $SET --user-dir examples/speech_recognition/
```
## Computing WER with Sclite
```
sclite -r ${RES_DIR}/ref.word-checkpoint_last.pt-${SET}.txt -h ${RES_DIR}/hypo.word-checkpoint_last.pt-${SET}.txt -i rm -o all stdout > $RES_REPORT
```
The `Sum/Avg` row of the first table in the report contains the WER.
from . import tasks, criterions, models # noqa
import importlib
import os
for file in os.listdir(os.path.dirname(__file__)):
if file.endswith('.py') and not file.startswith('_'):
criterion_name = file[:file.find('.py')]
importlib.import_module('examples.speech_recognition.criterions.' + criterion_name)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import torch
import torch.nn.functional as F
from fairseq import utils
from fairseq.criterions import FairseqCriterion, register_criterion
@register_criterion("cross_entropy_acc")
class CrossEntropyWithAccCriterion(FairseqCriterion):
def __init__(self, args, task):
super().__init__(args, task)
def compute_loss(self, model, net_output, target, reduction, log_probs):
# N, T -> N * T
target = target.view(-1)
lprobs = model.get_normalized_probs(net_output, log_probs=log_probs)
if not hasattr(lprobs, "batch_first"):
            logging.warning(
                "ERROR: we need to know whether the net output is batch-first; "
                "you need to set the batch_first attribute on the return value "
                "of model.get_normalized_probs. For now we assume it is "
                "batch-first, but in the future an exception will be raised "
                "instead."
            )
batch_first = getattr(lprobs, "batch_first", True)
if not batch_first:
lprobs = lprobs.transpose(0, 1)
# N, T, D -> N * T, D
lprobs = lprobs.view(-1, lprobs.size(-1))
loss = F.nll_loss(
lprobs, target, ignore_index=self.padding_idx, reduction=reduction
)
return lprobs, loss
def get_logging_output(self, sample, target, lprobs, loss):
target = target.view(-1)
mask = target != self.padding_idx
correct = torch.sum(
lprobs.argmax(1).masked_select(mask) == target.masked_select(mask)
)
total = torch.sum(mask)
sample_size = (
sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"]
)
logging_output = {
"loss": utils.item(loss.data), # * sample['ntokens'],
"ntokens": sample["ntokens"],
"nsentences": sample["target"].size(0),
"sample_size": sample_size,
"correct": utils.item(correct.data),
"total": utils.item(total.data),
"nframes": torch.sum(sample["net_input"]["src_lengths"]).item(),
}
return sample_size, logging_output
def forward(self, model, sample, reduction="sum", log_probs=True):
"""Computes the cross entropy with accuracy metric for the given sample.
This is similar to CrossEntropyCriterion in fairseq, but also
computes accuracy metrics as part of logging
Args:
logprobs (Torch.tensor) of shape N, T, D i.e.
batchsize, timesteps, dimensions
targets (Torch.tensor) of shape N, T i.e batchsize, timesteps
Returns:
tuple: With three elements:
1) the loss
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
TODO:
* Currently this Criterion will only work with LSTMEncoderModels or
FairseqModels which have decoder, or Models which return TorchTensor
as net_output.
We need to make a change to support all FairseqEncoder models.
"""
net_output = model(**sample["net_input"])
target = model.get_targets(sample, net_output)
lprobs, loss = self.compute_loss(
model, net_output, target, reduction, log_probs
)
sample_size, logging_output = self.get_logging_output(
sample, target, lprobs, loss
)
return loss, sample_size, logging_output
@staticmethod
def aggregate_logging_outputs(logging_outputs):
"""Aggregate logging outputs from data parallel training."""
correct_sum = sum(log.get("correct", 0) for log in logging_outputs)
total_sum = sum(log.get("total", 0) for log in logging_outputs)
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
nframes = sum(log.get("nframes", 0) for log in logging_outputs)
agg_output = {
"loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0,
# if args.sentence_avg, then sample_size is nsentences, then loss
# is per-sentence loss; else sample_size is ntokens, the loss
# becomes per-output token loss
"ntokens": ntokens,
"nsentences": nsentences,
"nframes": nframes,
"sample_size": sample_size,
"acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0,
"correct": correct_sum,
"total": total_sum,
            # total is the number of evaluated (non-padding) target tokens
}
        if sample_size != ntokens:
            agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
            # in this case (--sentence-avg) "loss" above is the per-sentence
            # loss, while "nll_loss" is the per output token loss
return agg_output
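# Worked example (illustrative values): given logging outputs from two workers,
#   [{"loss": 10.0, "ntokens": 8, "sample_size": 8, "correct": 6, "total": 8},
#    {"loss": 6.0, "ntokens": 4, "sample_size": 4, "correct": 3, "total": 4}]
# aggregate_logging_outputs reports acc = (6 + 3) / (8 + 4) * 100 = 75.0 and
# loss = 16.0 / 12 / ln(2) ~= 1.92 per output token (in bits); nll_loss is only
# added when sample_size != ntokens, i.e. when --sentence-avg is set.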
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from .asr_dataset import AsrDataset
__all__ = [
'AsrDataset',
]
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
from fairseq.data import FairseqDataset
from . import data_utils
from .collaters import Seq2SeqCollater
class AsrDataset(FairseqDataset):
"""
A dataset representing speech and corresponding transcription.
Args:
aud_paths: (List[str]): A list of str with paths to audio files.
aud_durations_ms (List[int]): A list of int containing the durations of
audio files.
tgt (List[torch.LongTensor]): A list of LongTensors containing the indices
of target transcriptions.
tgt_dict (~fairseq.data.Dictionary): target vocabulary.
ids (List[str]): A list of utterance IDs.
speakers (List[str]): A list of speakers corresponding to utterances.
num_mel_bins (int): Number of triangular mel-frequency bins (default: 80)
frame_length (float): Frame length in milliseconds (default: 25.0)
frame_shift (float): Frame shift in milliseconds (default: 10.0)
"""
def __init__(
self, aud_paths, aud_durations_ms, tgt,
tgt_dict, ids, speakers,
num_mel_bins=80, frame_length=25.0, frame_shift=10.0
):
assert frame_length > 0
assert frame_shift > 0
assert all(x > frame_length for x in aud_durations_ms)
self.frame_sizes = [
int(1 + (d - frame_length) / frame_shift)
for d in aud_durations_ms
]
assert len(aud_paths) > 0
assert len(aud_paths) == len(aud_durations_ms)
assert len(aud_paths) == len(tgt)
assert len(aud_paths) == len(ids)
assert len(aud_paths) == len(speakers)
self.aud_paths = aud_paths
self.tgt_dict = tgt_dict
self.tgt = tgt
self.ids = ids
self.speakers = speakers
self.num_mel_bins = num_mel_bins
self.frame_length = frame_length
self.frame_shift = frame_shift
def __getitem__(self, index):
import torchaudio
import torchaudio.compliance.kaldi as kaldi
tgt_item = self.tgt[index] if self.tgt is not None else None
path = self.aud_paths[index]
if not os.path.exists(path):
raise FileNotFoundError("Audio file not found: {}".format(path))
sound, sample_rate = torchaudio.load_wav(path)
output = kaldi.fbank(
sound,
num_mel_bins=self.num_mel_bins,
frame_length=self.frame_length,
frame_shift=self.frame_shift
)
output_cmvn = data_utils.apply_mv_norm(output)
self.collater = Seq2SeqCollater(
0, 1, pad_index=self.tgt_dict.pad(),
eos_index=self.tgt_dict.eos(), move_eos_to_beginning=True
)
return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
def __len__(self):
return len(self.aud_paths)
def collater(self, samples):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[int]): sample indices to collate
Returns:
dict: a mini-batch suitable for forwarding with a Model
"""
return self.collater.collate(samples)
def num_tokens(self, index):
return self.frame_sizes[index]
def size(self, index):
"""Return an example's size as a float or tuple. This value is used when
filtering a dataset with ``--max-positions``."""
return (
self.frame_sizes[index],
len(self.tgt[index]) if self.tgt is not None else 0,
)
def ordered_indices(self):
"""Return an ordered list of indices. Batches will be constructed based
on this order."""
return np.arange(len(self))
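# Shape note (illustrative): for a 1 second, 16 kHz utterance with the default
# num_mel_bins=80, frame_length=25.0 and frame_shift=10.0, __getitem__ returns
# {"id": index, "data": [fbank, tgt]} where fbank holds roughly
# 1 + (1000 - 25) / 10 ~= 98 frames of 80-dim CMVN-normalized features and tgt
# is a LongTensor of target token ids (or None when no transcript is given).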
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This module contains a collection of classes that implement collate
functionality for various tasks. Collaters should know what data to expect
for each sample and should pack / collate them into batches.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import torch
from fairseq.data import data_utils as fairseq_data_utils
class Seq2SeqCollater(object):
"""
    Implements a collate function mainly for seq2seq tasks.
    This expects each sample to contain features (src_tokens) and
    targets.
    This collater is also used for the aligned training task.
"""
def __init__(
self,
feature_index=0,
label_index=1,
pad_index=1,
eos_index=2,
move_eos_to_beginning=True,
):
self.feature_index = feature_index
self.label_index = label_index
self.pad_index = pad_index
self.eos_index = eos_index
self.move_eos_to_beginning = move_eos_to_beginning
def _collate_frames(self, frames):
"""Convert a list of 2d frames into a padded 3d tensor
Args:
        frames (list): list of 2d frames of size L[i]*f_dim, where L[i] is
            the length of the i-th frame and f_dim is the static feature dimension
Returns:
3d tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
"""
len_max = max(frame.size(0) for frame in frames)
f_dim = frames[0].size(1)
res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0)
for i, v in enumerate(frames):
res[i, : v.size(0)] = v
return res
def collate(self, samples):
"""
utility function to collate samples into batch for speech recognition.
"""
if len(samples) == 0:
return {}
# parse samples into torch tensors
parsed_samples = []
for s in samples:
# skip invalid samples
if s["data"][self.feature_index] is None:
continue
source = s["data"][self.feature_index]
if isinstance(source, (np.ndarray, np.generic)):
source = torch.from_numpy(source)
target = s["data"][self.label_index]
if isinstance(target, (np.ndarray, np.generic)):
target = torch.from_numpy(target).long()
parsed_sample = {"id": s["id"], "source": source, "target": target}
parsed_samples.append(parsed_sample)
samples = parsed_samples
id = torch.LongTensor([s["id"] for s in samples])
frames = self._collate_frames([s["source"] for s in samples])
# sort samples by descending number of frames
frames_lengths = torch.LongTensor([s["source"].size(0) for s in samples])
frames_lengths, sort_order = frames_lengths.sort(descending=True)
id = id.index_select(0, sort_order)
frames = frames.index_select(0, sort_order)
target = None
target_lengths = None
prev_output_tokens = None
if samples[0].get("target", None) is not None:
ntokens = sum(len(s["target"]) for s in samples)
target = fairseq_data_utils.collate_tokens(
[s["target"] for s in samples],
self.pad_index,
self.eos_index,
left_pad=False,
move_eos_to_beginning=False,
)
target = target.index_select(0, sort_order)
target_lengths = torch.LongTensor(
[s["target"].size(0) for s in samples]
).index_select(0, sort_order)
prev_output_tokens = fairseq_data_utils.collate_tokens(
[s["target"] for s in samples],
self.pad_index,
self.eos_index,
left_pad=False,
move_eos_to_beginning=self.move_eos_to_beginning,
)
prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
else:
ntokens = sum(len(s["source"]) for s in samples)
batch = {
"id": id,
"ntokens": ntokens,
"net_input": {"src_tokens": frames, "src_lengths": frames_lengths},
"target": target,
"target_lengths": target_lengths,
"nsentences": len(samples),
}
if prev_output_tokens is not None:
batch["net_input"]["prev_output_tokens"] = prev_output_tokens
return batch
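# Collation example (illustrative): two parsed samples with sources of 5 and 3
# frames and targets of lengths 4 and 2 produce src_tokens of shape
# (2, 5, f_dim) zero-padded along time, src_lengths = [5, 3] (sorted in
# descending order), target of shape (2, 4) padded with pad_index, and
# prev_output_tokens with the eos token moved to the first position for
# teacher forcing.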
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
def calc_mean_invstddev(feature):
if len(feature.size()) != 2:
raise ValueError("We expect the input feature to be 2-D tensor")
mean = feature.mean(0)
var = feature.var(0)
# avoid division by ~zero
eps = 1e-8
if (var < eps).any():
return mean, 1.0 / (torch.sqrt(var) + eps)
return mean, 1.0 / torch.sqrt(var)
def apply_mv_norm(features):
mean, invstddev = calc_mean_invstddev(features)
res = (features - mean) * invstddev
return res
def lengths_to_encoder_padding_mask(lengths, batch_first=False):
"""
convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor
Args:
lengths: a (B, )-shaped tensor
Return:
        encoder_padding_mask: a (max_length, B) binary mask (or (B, max_length)
            if batch_first=True), where [t, b] = 0 for t < lengths[b] and 1
            otherwise
        max_length: maximum length of the B sequences
TODO:
kernelize this function if benchmarking shows this function is slow
"""
max_lengths = torch.max(lengths).item()
bsz = lengths.size(0)
encoder_padding_mask = torch.arange(
max_lengths
).to( # a (T, ) tensor with [0, ..., T-1]
lengths.device
).view( # move to the right device
1, max_lengths
).expand( # reshape to (1, T)-shaped tensor
bsz, -1
) >= lengths.view( # expand to (B, T)-shaped tensor
bsz, 1
).expand(
-1, max_lengths
)
if not batch_first:
return encoder_padding_mask.t(), max_lengths
else:
return encoder_padding_mask, max_lengths
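# Example (illustrative): lengths = torch.LongTensor([4, 2]) with
# batch_first=True yields max_lengths = 4 and the mask
# [[0, 0, 0, 0], [0, 0, 1, 1]], i.e. positions at or beyond each sequence's
# length are marked as padding; with batch_first=False the mask is transposed
# to shape (4, 2).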
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import namedtuple
import concurrent.futures
from itertools import chain
import argparse
import os
import json
import sentencepiece as spm
import multiprocessing
import torchaudio
from fairseq.data import Dictionary
MILLISECONDS_TO_SECONDS = 0.001
def process_sample(aud_path, label, utt_id, sp, tgt_dict):
    input = {}
    output = {}
    si, ei = torchaudio.info(aud_path)
    input["length_ms"] = int(si.length / si.channels / si.rate / MILLISECONDS_TO_SECONDS)
    input["path"] = aud_path
    token = " ".join(sp.EncodeAsPieces(label))
    ids = tgt_dict.encode_line(token, append_eos=False)
    output["text"] = label
    output["token"] = token
    output["tokenid"] = ', '.join(map(str, [t.tolist() for t in ids]))
    return {utt_id: {"input": input, "output": output}}
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--audio-dirs", nargs="+", default=['-'], required=True,
help="input directories with audio files")
parser.add_argument("--labels", required=True,
help="aggregated input labels with format <ID LABEL> per line",
type=argparse.FileType('r', encoding='UTF-8'))
parser.add_argument("--spm-model", required=True,
help="sentencepiece model to use for encoding",
type=argparse.FileType('r', encoding='UTF-8'))
parser.add_argument("--dictionary", required=True,
help="file to load fairseq dictionary from",
type=argparse.FileType('r', encoding='UTF-8'))
parser.add_argument("--audio-format", choices=["flac", "wav"], default="wav")
parser.add_argument("--output", required=True, type=argparse.FileType('w'),
help="path to save json output")
args = parser.parse_args()
sp = spm.SentencePieceProcessor()
sp.Load(args.spm_model.name)
tgt_dict = Dictionary.load(args.dictionary)
labels = {}
for line in args.labels:
(utt_id, label) = line.split(" ", 1)
labels[utt_id] = label
    if len(labels) == 0:
        raise Exception('No labels found in ', args.labels.name)
Sample = namedtuple('Sample', 'aud_path utt_id')
samples = []
for path, _, files in chain.from_iterable(os.walk(path) for path in args.audio_dirs):
for f in files:
if f.endswith(args.audio_format):
if len(os.path.splitext(f)) != 2:
raise Exception('Expect <utt_id.extension> file name. Got: ', f)
utt_id = os.path.splitext(f)[0]
if utt_id not in labels:
continue
samples.append(Sample(os.path.join(path, f), utt_id))
utts = {}
num_cpu = multiprocessing.cpu_count()
with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpu) as executor:
future_to_sample = {executor.submit(process_sample, s.aud_path, labels[s.utt_id], s.utt_id, sp, tgt_dict): s for s in samples}
for future in concurrent.futures.as_completed(future_to_sample):
try:
data = future.result()
except Exception as exc:
print('generated an exception: ', exc)
else:
utts.update(data)
json.dump({"utts": utts}, args.output, indent=4)
if __name__ == "__main__":
main()
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Prepare librispeech dataset
base_url=www.openslr.org/resources/12
train_dir=train_960
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <download_dir> <out_dir>"
echo "e.g.: $0 /tmp/librispeech_raw/ ~/data/librispeech_final"
exit 1
fi
download_dir=${1%/}
out_dir=${2%/}
fairseq_root=~/fairseq-py/
mkdir -p ${out_dir}
cd ${out_dir} || exit
nbpe=5000
bpemode=unigram
if [ ! -d "$fairseq_root" ]; then
echo "$0: Please set correct fairseq_root"
exit 1
fi
echo "Data Download"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
url=$base_url/$part.tar.gz
if ! wget -P $download_dir $url; then
echo "$0: wget failed for $url"
exit 1
fi
if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then
echo "$0: error un-tarring archive $download_dir/$part.tar.gz"
exit 1
fi
done
echo "Merge all train packs into one"
mkdir -p ${download_dir}/LibriSpeech/${train_dir}/
for part in train-clean-100 train-clean-360 train-other-500; do
mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/
done
echo "Merge train text"
find ${download_dir}/LibriSpeech/${train_dir}/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/${train_dir}/text
# Use combined dev-clean and dev-other as validation set
find ${download_dir}/LibriSpeech/dev-clean/ ${download_dir}/LibriSpeech/dev-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/valid_text
find ${download_dir}/LibriSpeech/test-clean/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-clean/text
find ${download_dir}/LibriSpeech/test-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-other/text
dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_units.txt
encoded=data/lang_char/${train_dir}_${bpemode}${nbpe}_encoded.txt
fairseq_dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_fairseq_dict.txt
bpemodel=data/lang_char/${train_dir}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
echo "Dictionary preparation"
mkdir -p data/lang_char/
echo "<unk> 3" > ${dict}
echo "</s> 2" >> ${dict}
echo "<pad> 1" >> ${dict}
cut -f 2- -d" " ${download_dir}/LibriSpeech/${train_dir}/text > data/lang_char/input.txt
spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --unk_id=3 --eos_id=2 --pad_id=1 --bos_id=-1 --character_coverage=1
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt > ${encoded}
cat ${encoded} | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+3}' >> ${dict}
cat ${encoded} | tr ' ' '\n' | sort | uniq -c | awk '{print $2 " " $1}' > ${fairseq_dict}
wc -l ${dict}
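# Note: ${fairseq_dict} ends up with one "<piece> <count>" pair per line, which
# is the format expected by fairseq's Dictionary.load used by the ASR task.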
echo "Prepare train and test jsons"
for part in train_960 test-other test-clean; do
python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/${part} --labels ${download_dir}/LibriSpeech/${part}/text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output ${part}.json
done
# fairseq expects to find train.json and valid.json during training
mv train_960.json train.json
echo "Prepare valid json"
python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/dev-clean ${download_dir}/LibriSpeech/dev-other --labels ${download_dir}/LibriSpeech/valid_text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output valid.json
cp ${fairseq_dict} ./dict.txt
cp ${bpemodel}.model ./spm.model
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Run inference for pre-processed data with a trained model.
"""
import logging
import os
import sentencepiece as spm
import torch
from fairseq import options, progress_bar, utils, tasks
from fairseq.meters import StopwatchMeter, TimeMeter
from fairseq.utils import import_user_module
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def add_asr_eval_argument(parser):
parser.add_argument("--ctc", action="store_true", help="decode a ctc model")
parser.add_argument("--rnnt", default=False, help="decode a rnnt model")
parser.add_argument("--kspmodel", default=None, help="sentence piece model")
    parser.add_argument(
        "--wfstlm", default=None, help="wfstlm on dictionary output units"
    )
    parser.add_argument(
        "--rnnt_decoding_type",
        default="greedy",
        help="rnnt decoding type (e.g. greedy)",
    )
parser.add_argument(
"--lm_weight",
default=0.2,
help="weight for wfstlm while interpolating\
with neural score",
)
parser.add_argument(
"--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level"
)
return parser
def check_args(args):
assert args.path is not None, "--path required for generation!"
assert args.results_path is not None, "--results_path required for generation!"
assert (
not args.sampling or args.nbest == args.beam
), "--sampling requires --nbest to be equal to --beam"
assert (
args.replace_unk is None or args.raw_text
), "--replace-unk requires a raw text dataset (--raw-text)"
def get_dataset_itr(args, task):
return task.get_batch_iterator(
dataset=task.dataset(args.gen_subset),
max_tokens=args.max_tokens,
max_sentences=args.max_sentences,
max_positions=(1000000.0, 1000000.0),
ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
required_batch_size_multiple=args.required_batch_size_multiple,
num_shards=args.num_shards,
shard_id=args.shard_id,
num_workers=args.num_workers,
).next_epoch_itr(shuffle=False)
def process_predictions(args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id):
for hypo in hypos[: min(len(hypos), args.nbest)]:
hyp_pieces = tgt_dict.string(hypo["tokens"].int().cpu())
hyp_words = sp.DecodePieces(hyp_pieces.split())
print(
"{} ({}-{})".format(hyp_pieces, speaker, id),
file=res_files["hypo.units"],
)
print(
"{} ({}-{})".format(hyp_words, speaker, id),
file=res_files["hypo.words"],
)
tgt_pieces = tgt_dict.string(target_tokens)
tgt_words = sp.DecodePieces(tgt_pieces.split())
print(
"{} ({}-{})".format(tgt_pieces, speaker, id),
file=res_files["ref.units"],
)
print(
"{} ({}-{})".format(tgt_words, speaker, id),
file=res_files["ref.words"],
)
# only score top hypothesis
if not args.quiet:
logger.debug("HYPO:" + hyp_words)
logger.debug("TARGET:" + tgt_words)
logger.debug("___________________")
def prepare_result_files(args):
def get_res_file(file_prefix):
path = os.path.join(
args.results_path,
"{}-{}-{}.txt".format(
file_prefix, os.path.basename(args.path), args.gen_subset
),
)
return open(path, "w", buffering=1)
return {
"hypo.words": get_res_file("hypo.word"),
"hypo.units": get_res_file("hypo.units"),
"ref.words": get_res_file("ref.word"),
"ref.units": get_res_file("ref.units"),
}
def optimize_models(args, use_cuda, models):
"""Optimize ensemble for generation
"""
for model in models:
model.make_generation_fast_(
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
need_attn=args.print_alignment,
)
if args.fp16:
model.half()
if use_cuda:
model.cuda()
def main(args):
check_args(args)
import_user_module(args)
if args.max_tokens is None and args.max_sentences is None:
args.max_tokens = 30000
logger.info(args)
use_cuda = torch.cuda.is_available() and not args.cpu
# Load dataset splits
task = tasks.setup_task(args)
task.load_dataset(args.gen_subset)
logger.info(
"| {} {} {} examples".format(
args.data, args.gen_subset, len(task.dataset(args.gen_subset))
)
)
# Set dictionary
tgt_dict = task.target_dictionary
if args.ctc or args.rnnt:
tgt_dict.add_symbol("<ctc_blank>")
if args.ctc:
logger.info("| decoding a ctc model")
if args.rnnt:
logger.info("| decoding a rnnt model")
# Load ensemble
logger.info("| loading model(s) from {}".format(args.path))
models, _model_args = utils.load_ensemble_for_inference(
args.path.split(":"),
task,
model_arg_overrides=eval(args.model_overrides), # noqa
)
optimize_models(args, use_cuda, models)
# Load dataset (possibly sharded)
itr = get_dataset_itr(args, task)
# Initialize generator
gen_timer = StopwatchMeter()
generator = task.build_generator(args)
num_sentences = 0
if not os.path.exists(args.results_path):
os.makedirs(args.results_path)
sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(args.data, 'spm.model'))
res_files = prepare_result_files(args)
with progress_bar.build_progress_bar(args, itr) as t:
wps_meter = TimeMeter()
for sample in t:
sample = utils.move_to_cuda(sample) if use_cuda else sample
if "net_input" not in sample:
continue
prefix_tokens = None
if args.prefix_size > 0:
prefix_tokens = sample["target"][:, : args.prefix_size]
gen_timer.start()
hypos = task.inference_step(generator, models, sample, prefix_tokens)
num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
gen_timer.stop(num_generated_tokens)
for i, sample_id in enumerate(sample['id'].tolist()):
speaker = task.dataset(args.gen_subset).speakers[int(sample_id)]
id = task.dataset(args.gen_subset).ids[int(sample_id)]
target_tokens = (
utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu()
)
# Process top predictions
process_predictions(
args, hypos[i], sp, tgt_dict, target_tokens, res_files, speaker, id
)
wps_meter.update(num_generated_tokens)
t.log({"wps": round(wps_meter.avg)})
num_sentences += sample["nsentences"]
logger.info(
"| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}"
"sentences/s, {:.2f} tokens/s)".format(
num_sentences,
gen_timer.n,
gen_timer.sum,
num_sentences / gen_timer.sum,
1.0 / gen_timer.avg,
)
)
logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam))
def cli_main():
parser = options.get_generation_parser()
parser = add_asr_eval_argument(parser)
args = options.parse_args_and_arch(parser)
main(args)
if __name__ == "__main__":
cli_main()
import importlib
import os
for file in os.listdir(os.path.dirname(__file__)):
if file.endswith('.py') and not file.startswith('_'):
model_name = file[:file.find('.py')]
importlib.import_module('examples.speech_recognition.models.' + model_name)
import importlib
import os
for file in os.listdir(os.path.dirname(__file__)):
if file.endswith('.py') and not file.startswith('_'):
task_name = file[:file.find('.py')]
importlib.import_module('examples.speech_recognition.tasks.' + task_name)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import json
import os
import re
import torch
from fairseq.data import Dictionary
from fairseq.tasks import FairseqTask, register_task
from examples.speech_recognition.data import AsrDataset
def get_asr_dataset_from_json(data_json_path, tgt_dict):
"""
Parse data json and create dataset.
    See examples/speech_recognition/datasets/asr_prep_json.py, which packs the json from raw files
Json example:
{
"utts": {
"4771-29403-0025": {
"input": {
"length_ms": 170,
"path": "/tmp/file1.flac"
},
"output": {
"text": "HELLO \n",
"token": "HE LLO",
"tokenid": "4815, 861"
}
},
"1564-142299-0096": {
...
}
}
"""
if not os.path.isfile(data_json_path):
raise FileNotFoundError("Dataset not found: {}".format(data_json_path))
with open(data_json_path, "rb") as f:
data_samples = json.load(f)["utts"]
assert len(data_samples) != 0
sorted_samples = sorted(
data_samples.items(),
key=lambda sample: int(sample[1]["input"]["length_ms"]),
reverse=True,
)
aud_paths = [s[1]["input"]["path"] for s in sorted_samples]
ids = [s[0] for s in sorted_samples]
speakers = []
for s in sorted_samples:
m = re.search("(.+?)-(.+?)-(.+?)", s[0])
speakers.append(m.group(1) + "_" + m.group(2))
frame_sizes = [s[1]["input"]["length_ms"] for s in sorted_samples]
tgt = [
torch.LongTensor(
[int(i) for i in s[1]["output"]["tokenid"].split(", ")]
)
for s in sorted_samples
]
# append eos
tgt = [torch.cat([t, torch.LongTensor([tgt_dict.eos()])]) for t in tgt]
return AsrDataset(
aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers
)
@register_task("speech_recognition")
class SpeechRecognitionTask(FairseqTask):
"""
Task for training speech recognition model.
"""
@staticmethod
def add_args(parser):
"""Add task-specific arguments to the parser."""
parser.add_argument("data", help="path to data directory")
def __init__(self, args, tgt_dict):
super().__init__(args)
self.tgt_dict = tgt_dict
@classmethod
def setup_task(cls, args, **kwargs):
"""Setup the task (e.g., load dictionaries)."""
dict_path = os.path.join(args.data, "dict.txt")
if not os.path.isfile(dict_path):
raise FileNotFoundError("Dict not found: {}".format(dict_path))
tgt_dict = Dictionary.load(dict_path)
print("| dictionary: {} types".format(len(tgt_dict)))
return cls(args, tgt_dict)
def load_dataset(self, split, combine=False, **kwargs):
"""Load a given dataset split.
Args:
split (str): name of the split (e.g., train, valid, test)
"""
data_json_path = os.path.join(self.args.data, "{}.json".format(split))
self.datasets[split] = get_asr_dataset_from_json(
data_json_path, self.tgt_dict)
@property
def target_dictionary(self):
"""Return the :class:`~fairseq.data.Dictionary` for the language
model."""
return self.tgt_dict
@property
def source_dictionary(self):
"""Return the source :class:`~fairseq.data.Dictionary` (if applicable
for this task)."""
return None
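# Illustrative usage (paths are placeholders); the task expects dict.txt and
# {split}.json inside the data directory produced by prepare-librispeech.sh:
#
#   import argparse
#   args = argparse.Namespace(data="/path/to/preprocessed")
#   task = SpeechRecognitionTask.setup_task(args)
#   task.load_dataset("train")
#   print(len(task.dataset("train")), len(task.target_dictionary))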
@@ -23,6 +23,8 @@ from fairseq.modules import (
    MultiheadAttention,
    PositionalEmbedding,
    SinusoidalPositionalEmbedding,
    TransformerDecoderLayer,
    TransformerEncoderLayer,
)
DEFAULT_MAX_SOURCE_POSITIONS = 1024
@@ -504,253 +506,6 @@ class TransformerDecoder(FairseqIncrementalDecoder):
        return state_dict
class TransformerEncoderLayer(nn.Module):
"""Encoder layer block.
In the original paper each operation (multi-head attention or FFN) is
postprocessed with: `dropout -> add residual -> layernorm`. In the
tensor2tensor code they suggest that learning is more robust when
preprocessing each layer with layernorm and postprocessing with:
`dropout -> add residual`. We default to the approach in the paper, but the
tensor2tensor approach can be enabled by setting
*args.encoder_normalize_before* to ``True``.
Args:
args (argparse.Namespace): parsed command-line arguments
"""
def __init__(self, args):
super().__init__()
self.embed_dim = args.encoder_embed_dim
self.self_attn = MultiheadAttention(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout, self_attention=True
)
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
self.dropout = args.dropout
self.activation_fn = utils.get_activation_fn(
activation=getattr(args, 'activation_fn', 'relu')
)
self.activation_dropout = getattr(args, 'activation_dropout', 0)
if self.activation_dropout == 0:
# for backwards compatibility with models that use args.relu_dropout
self.activation_dropout = getattr(args, 'relu_dropout', 0)
self.normalize_before = args.encoder_normalize_before
self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
self.final_layer_norm = LayerNorm(self.embed_dim)
def upgrade_state_dict_named(self, state_dict, name):
"""
Rename layer norm states from `...layer_norms.0.weight` to
`...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
`...final_layer_norm.weight`
"""
layer_norm_map = {
'0': 'self_attn_layer_norm',
'1': 'final_layer_norm'
}
for old, new in layer_norm_map.items():
for m in ('weight', 'bias'):
k = '{}.layer_norms.{}.{}'.format(name, old, m)
if k in state_dict:
state_dict[
'{}.{}.{}'.format(name, new, m)
] = state_dict[k]
del state_dict[k]
def forward(self, x, encoder_padding_mask):
"""
Args:
x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_padding_mask (ByteTensor): binary ByteTensor of shape
`(batch, src_len)` where padding elements are indicated by ``1``.
Returns:
encoded output of shape `(seq_len, batch, embed_dim)`
"""
residual = x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
residual = x
x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
x = self.activation_fn(self.fc1(x))
x = F.dropout(x, p=self.activation_dropout, training=self.training)
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
return x
def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return layer_norm(x)
else:
return x
class TransformerDecoderLayer(nn.Module):
"""Decoder layer block.
In the original paper each operation (multi-head attention, encoder
attention or FFN) is postprocessed with: `dropout -> add residual ->
layernorm`. In the tensor2tensor code they suggest that learning is more
robust when preprocessing each layer with layernorm and postprocessing with:
`dropout -> add residual`. We default to the approach in the paper, but the
tensor2tensor approach can be enabled by setting
*args.decoder_normalize_before* to ``True``.
Args:
args (argparse.Namespace): parsed command-line arguments
no_encoder_attn (bool, optional): whether to attend to encoder outputs
(default: False).
"""
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
super().__init__()
self.embed_dim = args.decoder_embed_dim
self.self_attn = MultiheadAttention(
embed_dim=self.embed_dim,
num_heads=args.decoder_attention_heads,
dropout=args.attention_dropout,
add_bias_kv=add_bias_kv,
add_zero_attn=add_zero_attn,
self_attention=True
)
self.dropout = args.dropout
self.activation_fn = utils.get_activation_fn(
activation=getattr(args, 'activation_fn', 'relu')
)
self.activation_dropout = getattr(args, 'activation_dropout', 0)
if self.activation_dropout == 0:
# for backwards compatibility with models that use args.relu_dropout
self.activation_dropout = getattr(args, 'relu_dropout', 0)
self.normalize_before = args.decoder_normalize_before
# use layerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determine this.
# TODO remove this once we update apex with the fix
export = getattr(args, 'char_inputs', False)
self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
if no_encoder_attn:
self.encoder_attn = None
self.encoder_attn_layer_norm = None
else:
self.encoder_attn = MultiheadAttention(
self.embed_dim,
args.decoder_attention_heads,
kdim=getattr(args, 'encoder_embed_dim', None),
vdim=getattr(args, 'encoder_embed_dim', None),
dropout=args.attention_dropout,
encoder_decoder_attention=True,
)
self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
self.need_attn = True
self.onnx_trace = False
def prepare_for_onnx_export_(self):
self.onnx_trace = True
def forward(
self,
x,
encoder_out=None,
encoder_padding_mask=None,
incremental_state=None,
prev_self_attn_state=None,
prev_attn_state=None,
self_attn_mask=None,
self_attn_padding_mask=None,
):
"""
Args:
x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_padding_mask (ByteTensor): binary ByteTensor of shape
`(batch, src_len)` where padding elements are indicated by ``1``.
Returns:
encoded output of shape `(seq_len, batch, embed_dim)`
"""
residual = x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
if prev_self_attn_state is not None:
if incremental_state is None:
incremental_state = {}
prev_key, prev_value = prev_self_attn_state
saved_state = {"prev_key": prev_key, "prev_value": prev_value}
self.self_attn._set_input_buffer(incremental_state, saved_state)
x, attn = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=self_attn_padding_mask,
incremental_state=incremental_state,
need_weights=False,
attn_mask=self_attn_mask,
)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
if self.encoder_attn is not None:
residual = x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
if prev_attn_state is not None:
if incremental_state is None:
incremental_state = {}
prev_key, prev_value = prev_attn_state
saved_state = {"prev_key": prev_key, "prev_value": prev_value}
self.encoder_attn._set_input_buffer(incremental_state, saved_state)
x, attn = self.encoder_attn(
query=x,
key=encoder_out,
value=encoder_out,
key_padding_mask=encoder_padding_mask,
incremental_state=incremental_state,
static_kv=True,
need_weights=(not self.training and self.need_attn),
)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)
residual = x
x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
x = self.activation_fn(self.fc1(x))
x = F.dropout(x, p=self.activation_dropout, training=self.training)
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
if self.onnx_trace and incremental_state is not None:
saved_state = self.self_attn._get_input_buffer(incremental_state)
self_attn_state = saved_state["prev_key"], saved_state["prev_value"]
return x, attn, self_attn_state
return x, attn
def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return layer_norm(x)
else:
return x
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
...
@@ -26,6 +26,8 @@ from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer
from .transformer_sentence_encoder import TransformerSentenceEncoder
from .unfold import unfold1d
from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer
from .vggblock import VGGBlock
__all__ = [
    'AdaptiveInput',
@@ -51,5 +53,8 @@ __all__ = [
    'SinusoidalPositionalEmbedding',
    'TransformerSentenceEncoderLayer',
    'TransformerSentenceEncoder',
    'TransformerDecoderLayer',
    'TransformerEncoderLayer',
    'VGGBlock',
    'unfold1d',
]
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch.nn as nn
import torch.nn.functional as F
from fairseq import utils
from fairseq.modules import LayerNorm, MultiheadAttention
class TransformerEncoderLayer(nn.Module):
"""Encoder layer block.
In the original paper each operation (multi-head attention or FFN) is
postprocessed with: `dropout -> add residual -> layernorm`. In the
tensor2tensor code they suggest that learning is more robust when
preprocessing each layer with layernorm and postprocessing with:
`dropout -> add residual`. We default to the approach in the paper, but the
tensor2tensor approach can be enabled by setting
*args.encoder_normalize_before* to ``True``.
Args:
args (argparse.Namespace): parsed command-line arguments
"""
def __init__(self, args):
super().__init__()
self.embed_dim = args.encoder_embed_dim
self.self_attn = MultiheadAttention(
self.embed_dim, args.encoder_attention_heads,
dropout=args.attention_dropout, self_attention=True
)
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
self.dropout = args.dropout
self.activation_fn = utils.get_activation_fn(
activation=getattr(args, 'activation_fn', 'relu')
)
self.activation_dropout = getattr(args, 'activation_dropout', 0)
if self.activation_dropout == 0:
# for backwards compatibility with models that use args.relu_dropout
self.activation_dropout = getattr(args, 'relu_dropout', 0)
self.normalize_before = args.encoder_normalize_before
self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
self.final_layer_norm = LayerNorm(self.embed_dim)
def upgrade_state_dict_named(self, state_dict, name):
"""
Rename layer norm states from `...layer_norms.0.weight` to
`...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
`...final_layer_norm.weight`
"""
layer_norm_map = {
'0': 'self_attn_layer_norm',
'1': 'final_layer_norm'
}
for old, new in layer_norm_map.items():
for m in ('weight', 'bias'):
k = '{}.layer_norms.{}.{}'.format(name, old, m)
if k in state_dict:
state_dict[
'{}.{}.{}'.format(name, new, m)
] = state_dict[k]
del state_dict[k]
def forward(self, x, encoder_padding_mask, attn_mask=None):
"""
Args:
x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_padding_mask (ByteTensor): binary ByteTensor of shape
`(batch, src_len)` where padding elements are indicated by ``1``.
attn_mask (ByteTensor): binary tensor of shape (T_tgt, T_src), where
T_tgt is the length of query, while T_src is the length of key,
though here both query and key is x here,
attn_mask[t_tgt, t_src] = 1 means when calculating embedding
for t_tgt, t_src is excluded (or masked out), =0 means it is
included in attention
Returns:
encoded output of shape `(seq_len, batch, embed_dim)`
"""
residual = x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
if attn_mask is not None:
attn_mask = attn_mask.masked_fill(attn_mask.byte(), -1e8)
# anything in original attn_mask = 1, becomes -1e8
# anything in original attn_mask = 0, becomes 0
# Note that we cannot use -inf here, because at some edge cases,
# the attention weight (before softmax) for some padded element in query
# will become -inf, which results in NaN in model parameters
# TODO: to formally solve this problem, we need to change fairseq's
# MultiheadAttention. We will do this later on.
x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
residual = x
x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
x = self.activation_fn(self.fc1(x))
x = F.dropout(x, p=self.activation_dropout, training=self.training)
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
return x
def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return layer_norm(x)
else:
return x
class TransformerDecoderLayer(nn.Module):
"""Decoder layer block.
In the original paper each operation (multi-head attention, encoder
attention or FFN) is postprocessed with: `dropout -> add residual ->
layernorm`. In the tensor2tensor code they suggest that learning is more
robust when preprocessing each layer with layernorm and postprocessing with:
`dropout -> add residual`. We default to the approach in the paper, but the
tensor2tensor approach can be enabled by setting
*args.decoder_normalize_before* to ``True``.
Args:
args (argparse.Namespace): parsed command-line arguments
no_encoder_attn (bool, optional): whether to attend to encoder outputs
(default: False).
"""
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
super().__init__()
self.embed_dim = args.decoder_embed_dim
self.self_attn = MultiheadAttention(
embed_dim=self.embed_dim,
num_heads=args.decoder_attention_heads,
dropout=args.attention_dropout,
add_bias_kv=add_bias_kv,
add_zero_attn=add_zero_attn,
self_attention=True
)
self.dropout = args.dropout
self.activation_fn = utils.get_activation_fn(
activation=getattr(args, 'activation_fn', 'relu')
)
self.activation_dropout = getattr(args, 'activation_dropout', 0)
if self.activation_dropout == 0:
# for backwards compatibility with models that use args.relu_dropout
self.activation_dropout = getattr(args, 'relu_dropout', 0)
self.normalize_before = args.decoder_normalize_before
# use layerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determine this.
# TODO remove this once we update apex with the fix
export = getattr(args, 'char_inputs', False)
self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
if no_encoder_attn:
self.encoder_attn = None
self.encoder_attn_layer_norm = None
else:
self.encoder_attn = MultiheadAttention(
self.embed_dim,
args.decoder_attention_heads,
kdim=getattr(args, 'encoder_embed_dim', None),
vdim=getattr(args, 'encoder_embed_dim', None),
dropout=args.attention_dropout,
encoder_decoder_attention=True,
)
self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)
self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
self.need_attn = True
self.onnx_trace = False
def prepare_for_onnx_export_(self):
self.onnx_trace = True
def forward(
self,
x,
encoder_out=None,
encoder_padding_mask=None,
incremental_state=None,
prev_self_attn_state=None,
prev_attn_state=None,
self_attn_mask=None,
self_attn_padding_mask=None,
):
"""
Args:
x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_padding_mask (ByteTensor): binary ByteTensor of shape
`(batch, src_len)` where padding elements are indicated by ``1``.
Returns:
encoded output of shape `(seq_len, batch, embed_dim)`
"""
residual = x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
if prev_self_attn_state is not None:
if incremental_state is None:
incremental_state = {}
prev_key, prev_value = prev_self_attn_state
saved_state = {"prev_key": prev_key, "prev_value": prev_value}
self.self_attn._set_input_buffer(incremental_state, saved_state)
x, attn = self.self_attn(
query=x,
key=x,
value=x,
key_padding_mask=self_attn_padding_mask,
incremental_state=incremental_state,
need_weights=False,
attn_mask=self_attn_mask,
)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
if self.encoder_attn is not None:
residual = x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
if prev_attn_state is not None:
if incremental_state is None:
incremental_state = {}
prev_key, prev_value = prev_attn_state
saved_state = {"prev_key": prev_key, "prev_value": prev_value}
self.encoder_attn._set_input_buffer(incremental_state, saved_state)
x, attn = self.encoder_attn(
query=x,
key=encoder_out,
value=encoder_out,
key_padding_mask=encoder_padding_mask,
incremental_state=incremental_state,
static_kv=True,
need_weights=(not self.training and self.need_attn),
)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)
residual = x
x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
x = self.activation_fn(self.fc1(x))
x = F.dropout(x, p=self.activation_dropout, training=self.training)
x = self.fc2(x)
x = F.dropout(x, p=self.dropout, training=self.training)
x = residual + x
x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
if self.onnx_trace and incremental_state is not None:
saved_state = self.self_attn._get_input_buffer(incremental_state)
self_attn_state = saved_state["prev_key"], saved_state["prev_value"]
return x, attn, self_attn_state
return x, attn
def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
assert before ^ after
if after ^ self.normalize_before:
return layer_norm(x)
else:
return x
def make_generation_fast_(self, need_attn=False, **kwargs):
self.need_attn = need_attn
def Linear(in_features, out_features, bias=True):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)
if bias:
nn.init.constant_(m.bias, 0.)
return m
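# Normalization-order note: with the default encoder_normalize_before=False
# (post-norm, as in the original paper), maybe_layer_norm(ln, x, before=True)
# returns x unchanged and maybe_layer_norm(ln, x, after=True) applies ln; with
# *_normalize_before=True the two calls swap, giving tensor2tensor-style
# pre-norm residual blocks.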
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
from collections.abc import Iterable
from itertools import repeat
import torch
import torch.nn as nn
def _pair(v):
if isinstance(v, Iterable):
assert len(v) == 2, "len(v) != 2"
return v
return tuple(repeat(v, 2))
def infer_conv_output_dim(conv_op, input_dim, sample_inchannel):
sample_seq_len = 200
sample_bsz = 10
x = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim)
# N x C x H x W
# N: sample_bsz, C: sample_inchannel, H: sample_seq_len, W: input_dim
x = conv_op(x)
# N x C x H x W
x = x.transpose(1, 2)
# N x H x C x W
bsz, seq = x.size()[:2]
per_channel_dim = x.size()[3]
# bsz: N, seq: H, CxW the rest
return x.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim
class VGGBlock(torch.nn.Module):
"""
    VGG-motivated CNN module https://arxiv.org/pdf/1409.1556.pdf
Args:
in_channels: (int) number of input channels (typically 1)
out_channels: (int) number of output channels
        conv_kernel_size: convolution kernel size
pooling_kernel_size: the size of the pooling window to take a max over
num_conv_layers: (int) number of convolution layers
input_dim: (int) input dimension
conv_stride: the stride of the convolving kernel.
Can be a single number or a tuple (sH, sW) Default: 1
padding: implicit paddings on both sides of the input.
Can be a single number or a tuple (padH, padW). Default: None
layer_norm: (bool) if layer norm is going to be applied. Default: False
Shape:
        Input: BxCxTxfeat, i.e. (batch_size, in_channels, timesteps, features)
        Output: BxCxTxfeat, i.e. (batch_size, out_channels, timesteps, features)
"""
def __init__(
self,
in_channels,
out_channels,
conv_kernel_size,
pooling_kernel_size,
num_conv_layers,
input_dim,
conv_stride=1,
padding=None,
layer_norm=False,
):
assert (
input_dim is not None
), "Need input_dim for LayerNorm and infer_conv_output_dim"
super(VGGBlock, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.conv_kernel_size = _pair(conv_kernel_size)
self.pooling_kernel_size = _pair(pooling_kernel_size)
self.num_conv_layers = num_conv_layers
self.padding = (
tuple(e // 2 for e in self.conv_kernel_size)
if padding is None
else _pair(padding)
)
self.conv_stride = _pair(conv_stride)
self.layers = nn.ModuleList()
for layer in range(num_conv_layers):
conv_op = nn.Conv2d(
in_channels if layer == 0 else out_channels,
out_channels,
self.conv_kernel_size,
stride=self.conv_stride,
padding=self.padding,
)
self.layers.append(conv_op)
if layer_norm:
conv_output_dim, per_channel_dim = infer_conv_output_dim(
conv_op, input_dim, in_channels if layer == 0 else out_channels
)
self.layers.append(nn.LayerNorm(per_channel_dim))
input_dim = per_channel_dim
self.layers.append(nn.ReLU())
pool_op = nn.MaxPool2d(kernel_size=pooling_kernel_size, ceil_mode=True)
self.layers.append(pool_op)
self.total_output_dim, self.output_dim = infer_conv_output_dim(
pool_op, input_dim, out_channels
)
def forward(self, x):
for i, _ in enumerate(self.layers):
x = self.layers[i](x)
return x
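# Shape example (illustrative): VGGBlock(1, 64, conv_kernel_size=3,
# pooling_kernel_size=2, num_conv_layers=2, input_dim=80) applied to a
# (B, 1, T, 80) fbank tensor keeps spatial size through the padded 3x3
# convolutions and halves both the time and feature axes in the max-pool,
# giving a (B, 64, ceil(T / 2), 40) output with self.output_dim == 40 and
# self.total_output_dim == 64 * 40.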