add fairseq0.10.2

7df61696 · Sugon_ldc · 7df61696 · 7df61696 · 7df61696 · 7df61696
Commit 7df61696 authored Jul 28, 2023 by Sugon_ldc
20 changed files
--- a/examples/speech_recognition/__init__.py
+++ b/examples/speech_recognition/__init__.py
+from . import criterions, models, tasks  # noqa
--- a/examples/speech_recognition/criterions/ASG_loss.py
+++ b/examples/speech_recognition/criterions/ASG_loss.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from examples.speech_recognition.data.replabels import pack_replabels
+from fairseq import utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+@register_criterion("asg_loss")
+class ASGCriterion(FairseqCriterion):
+    """Criterion that scores encoder emissions with wav2letter's ``ASGLoss``.
+    Before scoring, each target has a trailing EOS dropped or replaced by the
+    silence token and its repeated tokens packed into replabels. During the
+    first ``linseg_updates`` training steps the packed target is stretched
+    evenly over all frames (LinSeg initialization).
+    """
+    @staticmethod
+    def add_args(parser):
+        """Add the ASG loss command line options to ``parser``."""
+        group = parser.add_argument_group("ASG Loss")
+        group.add_argument(
+            "--asg-transitions-init",
+            help="initial diagonal value of transition matrix",
+            type=float,
+            default=0.0,
+        )
+        group.add_argument(
+            "--max-replabel", help="maximum # of replabels", type=int, default=2
+        )
+        group.add_argument(
+            "--linseg-updates",
+            help="# of training updates to use LinSeg initialization",
+            type=int,
+            default=0,
+        )
+        group.add_argument(
+            "--hide-linseg-messages",
+            help="hide messages about LinSeg initialization",
+            action="store_true",
+        )
+    def __init__(
+        self,
+        task,
+        silence_token,
+        asg_transitions_init,
+        max_replabel,
+        linseg_updates,
+        hide_linseg_messages,
+        sentence_avg=False,
+    ):
+        """Build the criterion.
+        Args:
+            task: task object; its ``target_dictionary`` supplies the labels.
+            silence_token: symbol that replaces a trailing EOS when it is
+                present in the target dictionary.
+            asg_transitions_init: value placed on the diagonal of the
+                initial transition matrix.
+            max_replabel: maximum repetition count passed to pack_replabels.
+            linseg_updates: number of training updates that use LinSeg.
+            hide_linseg_messages: if True, LinSeg progress is not printed.
+            sentence_avg: if True, forward() reports the number of sentences
+                as the sample size instead of the number of tokens.
+        """
+        # wav2letter is imported lazily so that this module can be imported
+        # without the package being installed.
+        from wav2letter.criterion import ASGLoss, CriterionScaleMode
+        super().__init__(task)
+        self.tgt_dict = task.target_dictionary
+        self.eos = self.tgt_dict.eos()
+        self.silence = (
+            self.tgt_dict.index(silence_token)
+            if silence_token in self.tgt_dict
+            else None
+        )
+        self.max_replabel = max_replabel
+        # Kept on the instance (as CrossEntropyWithAccCriterion does): nothing
+        # in this class assigns ``self.args``, which forward() used to read.
+        self.sentence_avg = sentence_avg
+        num_labels = len(self.tgt_dict)
+        self.asg = ASGLoss(num_labels, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT)
+        self.asg.trans = torch.nn.Parameter(
+            asg_transitions_init * torch.eye(num_labels), requires_grad=True
+        )
+        # Stored as a (non-trainable) parameter rather than a plain int.
+        self.linseg_progress = torch.nn.Parameter(
+            torch.tensor([0], dtype=torch.int), requires_grad=False
+        )
+        self.linseg_maximum = linseg_updates
+        # Message state machine: "start" -> "finish" -> "none".
+        self.linseg_message_state = "none" if hide_linseg_messages else "start"
+    @classmethod
+    def build_criterion(cls, args, task):
+        """Construct the criterion from parsed command line ``args``."""
+        return cls(
+            task,
+            args.silence_token,
+            args.asg_transitions_init,
+            args.max_replabel,
+            args.linseg_updates,
+            args.hide_linseg_messages,
+            # Tolerate argument namespaces that do not define the option.
+            getattr(args, "sentence_avg", False),
+        )
+    def linseg_step(self):
+        """Advance the LinSeg counter and report whether LinSeg applies.
+        Returns:
+            bool: True while in training mode and fewer than
+            ``linseg_maximum`` steps have been counted, False otherwise.
+        """
+        if not self.training:
+            return False
+        if self.linseg_progress.item() < self.linseg_maximum:
+            if self.linseg_message_state == "start":
+                print("| using LinSeg to initialize ASG")
+                self.linseg_message_state = "finish"
+            self.linseg_progress.add_(1)
+            return True
+        elif self.linseg_message_state == "finish":
+            print("| finished LinSeg initialization")
+            self.linseg_message_state = "none"
+        return False
+    def replace_eos_with_silence(self, tgt):
+        """Drop a trailing EOS from ``tgt`` (a non-empty list of indices).
+        The EOS is replaced by the silence index unless no silence index is
+        known or the token before the EOS is already silence.
+        """
+        if tgt[-1] != self.eos:
+            return tgt
+        elif self.silence is None or (len(tgt) > 1 and tgt[-2] == self.silence):
+            return tgt[:-1]
+        else:
+            return tgt[:-1] + [self.silence]
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss for the given sample.
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        Raises:
+            ValueError: if any target in the batch has length zero.
+        """
+        net_output = model(**sample["net_input"])
+        # After the transpose, dim 0 is used as the batch size and dim 1 as
+        # the number of frames.
+        emissions = net_output["encoder_out"].transpose(0, 1).contiguous()
+        B = emissions.size(0)
+        T = emissions.size(1)
+        device = emissions.device
+        # Allocated without initialization; only the first target_size[b]
+        # entries of each row are filled in below.
+        target = torch.IntTensor(B, T)
+        target_size = torch.IntTensor(B)
+        using_linseg = self.linseg_step()
+        for b in range(B):
+            initial_target_size = sample["target_lengths"][b].item()
+            if initial_target_size == 0:
+                raise ValueError("target size cannot be zero")
+            tgt = sample["target"][b, :initial_target_size].tolist()
+            tgt = self.replace_eos_with_silence(tgt)
+            tgt = pack_replabels(tgt, self.tgt_dict, self.max_replabel)
+            # A target cannot be longer than the number of frames.
+            tgt = tgt[:T]
+            if using_linseg:
+                # Stretch the target evenly over all T frames.
+                tgt = [tgt[t * len(tgt) // T] for t in range(T)]
+            target[b][: len(tgt)] = torch.IntTensor(tgt)
+            target_size[b] = len(tgt)
+        loss = self.asg.forward(emissions, target.to(device), target_size.to(device))
+        if reduce:
+            loss = torch.sum(loss)
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": utils.item(loss.data) if reduce else loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+        }
+        return loss, sample_size, logging_output
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs):
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+        agg_output = {
+            # Guard against an empty aggregation (no sentences logged).
+            "loss": loss_sum / nsentences if nsentences > 0 else 0.0,
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "sample_size": sample_size,
+        }
+        return agg_output
--- a/examples/speech_recognition/criterions/__init__.py
+++ b/examples/speech_recognition/criterions/__init__.py
+import importlib
+import os
+# Import every criterion module in this package so that each one runs at
+# import time (the visible criterion modules use @register_criterion).
+# ASG loss requires wav2letter
+files_to_skip = set()
+try:
+    # Imported only to probe availability; the name itself is not used here.
+    import wav2letter
+except ImportError:
+    files_to_skip.add("ASG_loss.py")
+for file in os.listdir(os.path.dirname(__file__)):
+    # Skip private modules (leading underscore, e.g. this __init__.py) and
+    # any module whose optional dependency is missing.
+    if file.endswith(".py") and not file.startswith("_") and file not in files_to_skip:
+        # Module name is the file name up to the first ".py".
+        criterion_name = file[: file.find(".py")]
+        importlib.import_module(
+            "examples.speech_recognition.criterions." + criterion_name
+        )
--- a/examples/speech_recognition/criterions/cross_entropy_acc.py
+++ b/examples/speech_recognition/criterions/cross_entropy_acc.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import absolute_import, division, print_function, unicode_literals
+import logging
+import math
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+@register_criterion("cross_entropy_acc")
+class CrossEntropyWithAccCriterion(FairseqCriterion):
+    """Cross entropy criterion that also logs token-level accuracy."""
+    def __init__(self, task, sentence_avg):
+        """Store ``sentence_avg``, which selects the sample size that is
+        reported: number of sentences if True, number of tokens otherwise."""
+        super().__init__(task)
+        self.sentence_avg = sentence_avg
+    def compute_loss(self, model, net_output, target, reduction, log_probs):
+        """Compute the NLL loss of ``net_output`` against ``target``.
+        Args:
+            model: model providing ``get_normalized_probs``.
+            net_output: output of the model's forward pass.
+            target: target indices; flattened to 1-D before use.
+            reduction: reduction mode forwarded to ``F.nll_loss``.
+            log_probs: forwarded to ``model.get_normalized_probs``.
+        Returns:
+            tuple: (flattened probabilities of shape (N * T, D), loss)
+        """
+        # N, T -> N * T
+        target = target.view(-1)
+        lprobs = model.get_normalized_probs(net_output, log_probs=log_probs)
+        if not hasattr(lprobs, "batch_first"):
+            logging.warning(
+                "ERROR: we need to know whether "
+                "batch first for the net output; "
+                "you need to set batch_first attribute for the return value of "
+                "model.get_normalized_probs. Now, we assume this is true, but "
+                "in the future, we will raise exception instead. "
+            )
+        batch_first = getattr(lprobs, "batch_first", True)
+        if not batch_first:
+            lprobs = lprobs.transpose(0, 1)
+        # N, T, D -> N * T, D
+        lprobs = lprobs.view(-1, lprobs.size(-1))
+        # self.padding_idx is not assigned in this class; presumably it is
+        # provided by FairseqCriterion -- TODO confirm.
+        loss = F.nll_loss(
+            lprobs, target, ignore_index=self.padding_idx, reduction=reduction
+        )
+        return lprobs, loss
+    def get_logging_output(self, sample, target, lprobs, loss):
+        """Build the logging dictionary, including accuracy counts.
+        Positions whose target equals ``self.padding_idx`` are excluded from
+        both the ``correct`` and the ``total`` counts.
+        Returns:
+            tuple: (sample_size, logging_output)
+        """
+        target = target.view(-1)
+        mask = target != self.padding_idx
+        correct = torch.sum(
+            lprobs.argmax(1).masked_select(mask) == target.masked_select(mask)
+        )
+        total = torch.sum(mask)
+        sample_size = (
+            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
+        )
+        logging_output = {
+            "loss": utils.item(loss.data),  # * sample['ntokens'],
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+            "correct": utils.item(correct.data),
+            "total": utils.item(total.data),
+            "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(),
+        }
+        return sample_size, logging_output
+    def forward(self, model, sample, reduction="sum", log_probs=True):
+        """Computes the cross entropy with accuracy metric for the given sample.
+        This is similar to CrossEntropyCriterion in fairseq, but also
+        computes accuracy metrics as part of logging
+        Args:
+            model: model that is called with ``sample["net_input"]`` and
+                that provides ``get_targets`` and ``get_normalized_probs``.
+            sample (dict): mini-batch containing ``net_input``, ``target``
+                and ``ntokens``.
+            reduction (str): reduction mode forwarded to ``F.nll_loss``.
+            log_probs (bool): forwarded to ``model.get_normalized_probs``.
+        Returns:
+        tuple: With three elements:
+            1) the loss
+            2) the sample size, which is used as the denominator for the gradient
+            3) logging outputs to display while training
+        TODO:
+            * Currently this Criterion will only work with LSTMEncoderModels or
+            FairseqModels which have decoder, or Models which return TorchTensor
+            as net_output.
+            We need to make a change to support all FairseqEncoder models.
+        """
+        net_output = model(**sample["net_input"])
+        target = model.get_targets(sample, net_output)
+        lprobs, loss = self.compute_loss(
+            model, net_output, target, reduction, log_probs
+        )
+        sample_size, logging_output = self.get_logging_output(
+            sample, target, lprobs, loss
+        )
+        return loss, sample_size, logging_output
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs):
+        """Aggregate logging outputs from data parallel training."""
+        correct_sum = sum(log.get("correct", 0) for log in logging_outputs)
+        total_sum = sum(log.get("total", 0) for log in logging_outputs)
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+        nframes = sum(log.get("nframes", 0) for log in logging_outputs)
+        agg_output = {
+            "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0,
+            # if args.sentence_avg, then sample_size is nsentences, then loss
+            # is per-sentence loss; else sample_size is ntokens, the loss
+            # becomes per-output token loss
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "nframes": nframes,
+            "sample_size": sample_size,
+            "acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0,
+            "correct": correct_sum,
+            "total": total_sum,
+            # total is the number of validate tokens
+        }
+        if sample_size != ntokens:
+            # loss above is normalized by sample_size; nll_loss is always
+            # normalized by the number of tokens. Guarded like "loss" so an
+            # aggregation without tokens does not divide by zero.
+            agg_output["nll_loss"] = (
+                loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0
+            )
+        return agg_output
--- a/examples/speech_recognition/data/__init__.py
+++ b/examples/speech_recognition/data/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from .asr_dataset import AsrDataset
+__all__ = [
+    "AsrDataset",
+]
--- a/examples/speech_recognition/data/asr_dataset.py
+++ b/examples/speech_recognition/data/asr_dataset.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+import numpy as np
+from fairseq.data import FairseqDataset
+from . import data_utils
+from .collaters import Seq2SeqCollater
+class AsrDataset(FairseqDataset):
+    """
+    A dataset representing speech and corresponding transcription.
+    Each item is loaded from its audio file on access, converted to filter
+    bank features with ``torchaudio.compliance.kaldi.fbank`` and normalized
+    with ``data_utils.apply_mv_norm``.
+    Args:
+        aud_paths: (List[str]): A list of str with paths to audio files.
+        aud_durations_ms (List[int]): A list of int containing the durations of
+            audio files.
+        tgt (List[torch.LongTensor]): A list of LongTensors containing the indices
+            of target transcriptions.
+        tgt_dict (~fairseq.data.Dictionary): target vocabulary.
+        ids (List[str]): A list of utterance IDs.
+        speakers (List[str]): A list of speakers corresponding to utterances.
+        num_mel_bins (int): Number of triangular mel-frequency bins (default: 80)
+        frame_length (float): Frame length in milliseconds (default: 25.0)
+        frame_shift (float): Frame shift in milliseconds (default: 10.0)
+    """
+    def __init__(
+        self,
+        aud_paths,
+        aud_durations_ms,
+        tgt,
+        tgt_dict,
+        ids,
+        speakers,
+        num_mel_bins=80,
+        frame_length=25.0,
+        frame_shift=10.0,
+    ):
+        # NOTE(review): validation relies on assert statements, which are
+        # removed when Python runs with -O.
+        assert frame_length > 0
+        assert frame_shift > 0
+        # Every utterance must be longer than one frame.
+        assert all(x > frame_length for x in aud_durations_ms)
+        # Number of frames per utterance, derived from its duration.
+        self.frame_sizes = [
+            int(1 + (d - frame_length) / frame_shift) for d in aud_durations_ms
+        ]
+        # All per-utterance lists must be non-empty and of equal length.
+        assert len(aud_paths) > 0
+        assert len(aud_paths) == len(aud_durations_ms)
+        assert len(aud_paths) == len(tgt)
+        assert len(aud_paths) == len(ids)
+        assert len(aud_paths) == len(speakers)
+        self.aud_paths = aud_paths
+        self.tgt_dict = tgt_dict
+        self.tgt = tgt
+        self.ids = ids
+        self.speakers = speakers
+        self.num_mel_bins = num_mel_bins
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        # Features are at index 0 and the target at index 1 of item["data"],
+        # matching the list built in __getitem__.
+        self.s2s_collater = Seq2SeqCollater(
+            0,
+            1,
+            pad_index=self.tgt_dict.pad(),
+            eos_index=self.tgt_dict.eos(),
+            move_eos_to_beginning=True,
+        )
+    def __getitem__(self, index):
+        """Load the utterance at ``index`` and return its features and target.
+        Returns:
+            dict: ``{"id": index, "data": [features, target]}``
+        Raises:
+            FileNotFoundError: if the audio file does not exist.
+        """
+        # torchaudio is imported lazily, only when an item is actually read.
+        import torchaudio
+        import torchaudio.compliance.kaldi as kaldi
+        # NOTE(review): the constructor asserts len(aud_paths) == len(tgt), so
+        # tgt cannot be None when the instance is built that way.
+        tgt_item = self.tgt[index] if self.tgt is not None else None
+        path = self.aud_paths[index]
+        if not os.path.exists(path):
+            raise FileNotFoundError("Audio file not found: {}".format(path))
+        sound, sample_rate = torchaudio.load_wav(path)
+        output = kaldi.fbank(
+            sound,
+            num_mel_bins=self.num_mel_bins,
+            frame_length=self.frame_length,
+            frame_shift=self.frame_shift,
+        )
+        output_cmvn = data_utils.apply_mv_norm(output)
+        return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
+    def __len__(self):
+        """Return the number of utterances."""
+        return len(self.aud_paths)
+    def collater(self, samples):
+        """Merge a list of samples to form a mini-batch.
+        Args:
+            samples (List[dict]): items as returned by ``__getitem__``; they
+                are passed unchanged to ``Seq2SeqCollater.collate``
+        Returns:
+            dict: a mini-batch suitable for forwarding with a Model
+        """
+        return self.s2s_collater.collate(samples)
+    def num_tokens(self, index):
+        """Return the number of frames of the utterance at ``index``."""
+        return self.frame_sizes[index]
+    def size(self, index):
+        """Return an example's size as a float or tuple. This value is used when
+        filtering a dataset with ``--max-positions``."""
+        # (number of frames, number of target tokens)
+        return (
+            self.frame_sizes[index],
+            len(self.tgt[index]) if self.tgt is not None else 0,
+        )
+    def ordered_indices(self):
+        """Return an ordered list of indices. Batches will be constructed based
+        on this order."""
+        # Items are kept in their original order (no sorting by length).
+        return np.arange(len(self))
--- a/examples/speech_recognition/data/collaters.py
+++ b/examples/speech_recognition/data/collaters.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+    This module contains collection of classes which implement
+    collate functionalities for various tasks.
+    Collaters should know what data to expect for each sample
+    and they should pack / collate them into batches
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+import numpy as np
+import torch
+from fairseq.data import data_utils as fairseq_data_utils
+class Seq2SeqCollater(object):
+    """
+    Implements collate function mainly for seq2seq tasks
+    This expects each sample to contain feature (src_tokens) and
+    targets.
+    This collator is also used for aligned training task.
+    """
+    def __init__(
+        self,
+        feature_index=0,
+        label_index=1,
+        pad_index=1,
+        eos_index=2,
+        move_eos_to_beginning=True,
+    ):
+        """
+        Args:
+            feature_index (int): position of the features in sample["data"]
+            label_index (int): position of the target in sample["data"]
+            pad_index (int): padding index passed to collate_tokens
+            eos_index (int): EOS index passed to collate_tokens
+            move_eos_to_beginning (bool): passed to collate_tokens when
+                building prev_output_tokens
+        """
+        self.feature_index = feature_index
+        self.label_index = label_index
+        self.pad_index = pad_index
+        self.eos_index = eos_index
+        self.move_eos_to_beginning = move_eos_to_beginning
+    def _collate_frames(self, frames):
+        """Convert a list of 2d frames into a padded 3d tensor
+        Args:
+            frames (list): list of 2d frames of size L[i]*f_dim. Where L[i] is
+                length of i-th frame and f_dim is static dimension of features
+        Returns:
+            3d tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
+        """
+        len_max = max(frame.size(0) for frame in frames)
+        f_dim = frames[0].size(1)
+        # Zero-filled buffer with the dtype/device of the first frame.
+        res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0)
+        for i, v in enumerate(frames):
+            res[i, : v.size(0)] = v
+        return res
+    def collate(self, samples):
+        """
+        utility function to collate samples into batch for speech recognition.
+        Args:
+            samples (list): dicts with keys "id" and "data"; "data" holds the
+                features at ``feature_index`` and the target at
+                ``label_index``. Samples whose features are None are skipped.
+        Returns:
+            dict: the batch, sorted by descending number of frames, or an
+            empty dict when there is no valid sample.
+        """
+        if len(samples) == 0:
+            return {}
+        # parse samples into torch tensors
+        parsed_samples = []
+        for s in samples:
+            # skip invalid samples
+            if s["data"][self.feature_index] is None:
+                continue
+            source = s["data"][self.feature_index]
+            if isinstance(source, (np.ndarray, np.generic)):
+                source = torch.from_numpy(source)
+            target = s["data"][self.label_index]
+            if isinstance(target, (np.ndarray, np.generic)):
+                target = torch.from_numpy(target).long()
+            elif isinstance(target, list):
+                target = torch.LongTensor(target)
+            parsed_sample = {"id": s["id"], "source": source, "target": target}
+            parsed_samples.append(parsed_sample)
+        samples = parsed_samples
+        # Every sample may have been skipped as invalid; treat that like an
+        # empty input instead of failing on max() of an empty sequence.
+        if len(samples) == 0:
+            return {}
+        ids = torch.LongTensor([s["id"] for s in samples])
+        frames = self._collate_frames([s["source"] for s in samples])
+        # sort samples by descending number of frames
+        frames_lengths = torch.LongTensor([s["source"].size(0) for s in samples])
+        frames_lengths, sort_order = frames_lengths.sort(descending=True)
+        ids = ids.index_select(0, sort_order)
+        frames = frames.index_select(0, sort_order)
+        target = None
+        target_lengths = None
+        prev_output_tokens = None
+        # Only the first sample is inspected to decide whether targets exist.
+        if samples[0].get("target", None) is not None:
+            ntokens = sum(len(s["target"]) for s in samples)
+            target = fairseq_data_utils.collate_tokens(
+                [s["target"] for s in samples],
+                self.pad_index,
+                self.eos_index,
+                left_pad=False,
+                move_eos_to_beginning=False,
+            )
+            target = target.index_select(0, sort_order)
+            target_lengths = torch.LongTensor(
+                [s["target"].size(0) for s in samples]
+            ).index_select(0, sort_order)
+            prev_output_tokens = fairseq_data_utils.collate_tokens(
+                [s["target"] for s in samples],
+                self.pad_index,
+                self.eos_index,
+                left_pad=False,
+                move_eos_to_beginning=self.move_eos_to_beginning,
+            )
+            prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
+        else:
+            # Without targets the token count is the total number of frames.
+            ntokens = sum(len(s["source"]) for s in samples)
+        batch = {
+            "id": ids,
+            "ntokens": ntokens,
+            "net_input": {"src_tokens": frames, "src_lengths": frames_lengths},
+            "target": target,
+            "target_lengths": target_lengths,
+            "nsentences": len(samples),
+        }
+        if prev_output_tokens is not None:
+            batch["net_input"]["prev_output_tokens"] = prev_output_tokens
+        return batch
--- a/examples/speech_recognition/data/data_utils.py
+++ b/examples/speech_recognition/data/data_utils.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+def calc_mean_invstddev(feature):
+    """Return the mean and inverse standard deviation along dim 0.
+    Args:
+        feature: a 2-D tensor
+    Returns:
+        tuple: (mean, 1 / stddev), both reduced over dim 0
+    Raises:
+        ValueError: if ``feature`` is not 2-D
+    """
+    if feature.dim() != 2:
+        raise ValueError("We expect the input feature to be 2-D tensor")
+    eps = 1e-8
+    mean, var = feature.mean(0), feature.var(0)
+    stddev = torch.sqrt(var)
+    # avoid division by ~zero: if any variance is tiny, eps is added to
+    # every standard deviation, not only to the tiny ones
+    if (var < eps).any():
+        stddev = stddev + eps
+    return mean, 1.0 / stddev
+def apply_mv_norm(features):
+    """Normalize ``features`` to zero mean and unit variance along dim 0.
+    With fewer than 2 rows the variance cannot be computed (is NaN), so the
+    input is returned unchanged.
+    """
+    if features.size(0) >= 2:
+        mean, invstddev = calc_mean_invstddev(features)
+        return (features - mean) * invstddev
+    return features
+def lengths_to_encoder_padding_mask(lengths, batch_first=False):
+    """
+    convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor
+    Args:
+        lengths: a (B, )-shaped tensor
+        batch_first: if True the mask is (B, max_length), otherwise it is
+        transposed to (max_length, B)
+    Return:
+        encoder_padding_mask: a binary mask that is False at position t of
+        sequence b for t < lengths[b] and True otherwise
+        max_length: maximum length of B sequences
+    TODO:
+        kernelize this function if benchmarking shows this function is slow
+    """
+    max_lengths = torch.max(lengths).item()
+    bsz = lengths.size(0)
+    # (B, T) grid of positions: every row is [0, ..., T-1], on lengths' device
+    positions = (
+        torch.arange(max_lengths)
+        .to(lengths.device)
+        .view(1, max_lengths)
+        .expand(bsz, -1)
+    )
+    # (B, T) grid in which row b repeats lengths[b]
+    limits = lengths.view(bsz, 1).expand(-1, max_lengths)
+    encoder_padding_mask = positions >= limits
+    if batch_first:
+        return encoder_padding_mask, max_lengths
+    return encoder_padding_mask.t(), max_lengths
+def encoder_padding_mask_to_lengths(
+    encoder_padding_mask, max_lengths, batch_size, device
+):
+    """
+    convert encoder_padding_mask (2-D binary tensor) to a 1-D tensor
+    Conventionally, encoder output contains a encoder_padding_mask, which is
+    a 2-D mask in a shape (T, B), whose (t, b) element indicate whether
+    encoder_out[t, b] is a valid output (=0) or not (=1). Occasionally, we
+    need to convert this mask tensor to a 1-D tensor in shape (B, ), where
+    [b] denotes the valid length of b-th sequence
+    Args:
+        encoder_padding_mask: a (T, B)-shaped binary tensor or None; if None,
+        indicating all are valid
+        max_lengths: maximum length of all sequence, if encoder_padding_mask is
+        not None, max_lengths must equal to encoder_padding_mask.size(0)
+        batch_size: batch size; if encoder_padding_mask is
+        not None, batch_size must equal to encoder_padding_mask.size(1)
+        device: which device to put the result on; only used when
+        encoder_padding_mask is None
+    Return:
+        seq_lengths: a (B,)-shaped tensor, where its (b, )-th element is the
+        number of valid elements of b-th sequence
+    """
+    if encoder_padding_mask is not None:
+        assert encoder_padding_mask.size(0) == max_lengths, "max_lengths does not match"
+        assert encoder_padding_mask.size(1) == batch_size, "batch_size does not match"
+        # padded positions are the non-zero entries, counted per sequence
+        num_padded = torch.sum(encoder_padding_mask, dim=0)
+        return max_lengths - num_padded
+    # no mask: every sequence has the full length
+    full_lengths = torch.Tensor([max_lengths] * batch_size)
+    return full_lengths.to(torch.int32).to(device)
--- a/examples/speech_recognition/data/replabels.py
+++ b/examples/speech_recognition/data/replabels.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Replabel transforms for use with wav2letter's ASG criterion.
+"""
+def replabel_symbol(i):
+    """
+    Return the replabel symbol for ``i``, which is the string form of ``i``.
+    Replabel symbols used in wav2letter, currently just "1", "2", ...
+    This prevents training with numeral tokens, so this might change in the future
+    """
+    return str(i)
+def pack_replabels(tokens, dictionary, max_reps):
+    """
+    Pack a token sequence so that repeated symbols are replaced by replabels
+    A run of equal tokens becomes the token followed by one replabel that
+    counts up to ``max_reps`` repetitions; longer runs start over with the
+    token itself. ``tokens`` is returned as is when it is empty or when
+    ``max_reps`` is not positive.
+    """
+    if len(tokens) == 0 or max_reps <= 0:
+        return tokens
+    # rep_index[k] is the dictionary index of the replabel for k repetitions;
+    # slot 0 is a placeholder that is never emitted.
+    rep_index = [0] + [
+        dictionary.index(replabel_symbol(count)) for count in range(1, max_reps + 1)
+    ]
+    packed = []
+    last = -1
+    pending = 0
+    for tok in tokens:
+        if tok == last and pending < max_reps:
+            pending += 1
+            continue
+        # flush the repetitions collected for the previous token
+        if pending > 0:
+            packed.append(rep_index[pending])
+            pending = 0
+        packed.append(tok)
+        last = tok
+    if pending > 0:
+        packed.append(rep_index[pending])
+    return packed
+def unpack_replabels(tokens, dictionary, max_reps):
+    """
+    Unpack a token sequence so that replabels are replaced by repeated symbols
+    ``tokens`` is returned as is when it is empty or when ``max_reps`` is not
+    positive.
+    """
+    if len(tokens) == 0 or max_reps <= 0:
+        return tokens
+    # maps the dictionary index of each replabel to its repetition count
+    value_of = {
+        dictionary.index(replabel_symbol(count)): count
+        for count in range(1, max_reps + 1)
+    }
+    unpacked = []
+    last = -1
+    for tok in tokens:
+        reps = value_of.get(tok)
+        if reps is None:
+            # ordinary token: copy it and remember it for a following replabel
+            unpacked.append(tok)
+            last = tok
+        else:
+            # replabel: repeat the remembered token, then forget it
+            unpacked.extend([last] * reps)
+            last = -1
+    return unpacked
--- a/examples/speech_recognition/datasets/asr_prep_json.py
+++ b/examples/speech_recognition/datasets/asr_prep_json.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import absolute_import, division, print_function, unicode_literals
+import argparse
+import concurrent.futures
+import json
+import multiprocessing
+import os
+from collections import namedtuple
+from itertools import chain
+import sentencepiece as spm
+from fairseq.data import Dictionary
+MILLISECONDS_TO_SECONDS = 0.001
+def process_sample(aud_path, lable, utt_id, sp, tgt_dict):
+    """Build the JSON entry for one utterance.
+    Args:
+        aud_path: path of the audio file
+        lable: transcription text of the utterance
+        utt_id: utterance ID, used as the key of the returned dict
+        sp: sentencepiece processor used to split the text into pieces
+        tgt_dict: dictionary used to map the pieces to indices
+    Returns:
+        dict: ``{utt_id: {"input": {...}, "output": {...}}}``
+    """
+    import torchaudio
+    signal_info, _ = torchaudio.info(aud_path)
+    # per-channel sample count divided by the rate, converted to milliseconds
+    length_ms = int(
+        signal_info.length
+        / signal_info.channels
+        / signal_info.rate
+        / MILLISECONDS_TO_SECONDS
+    )
+    token = " ".join(sp.EncodeAsPieces(lable))
+    ids = tgt_dict.encode_line(token, append_eos=False)
+    tokenid = ", ".join(str(t.tolist()) for t in ids)
+    input_info = {"length_ms": length_ms, "path": aud_path}
+    output_info = {"text": lable, "token": token, "tokenid": tokenid}
+    return {utt_id: {"input": input_info, "output": output_info}}
+def main():
+    """Create the JSON manifest for a set of audio files and their labels.
+    Reads ``<ID LABEL>`` lines from ``--labels``, collects the files below
+    ``--audio-dirs`` that end with ``--audio-format`` and whose base name is
+    a known ID, processes them with ``process_sample`` in a thread pool and
+    writes ``{"utts": {...}}`` to ``--output``.
+    Raises:
+        Exception: if the labels file contains no labels.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--audio-dirs",
+        nargs="+",
+        default=["-"],
+        required=True,
+        help="input directories with audio files",
+    )
+    parser.add_argument(
+        "--labels",
+        required=True,
+        help="aggregated input labels with format <ID LABEL> per line",
+        type=argparse.FileType("r", encoding="UTF-8"),
+    )
+    parser.add_argument(
+        "--spm-model",
+        required=True,
+        help="sentencepiece model to use for encoding",
+        type=argparse.FileType("r", encoding="UTF-8"),
+    )
+    parser.add_argument(
+        "--dictionary",
+        required=True,
+        help="file to load fairseq dictionary from",
+        type=argparse.FileType("r", encoding="UTF-8"),
+    )
+    parser.add_argument("--audio-format", choices=["flac", "wav"], default="wav")
+    parser.add_argument(
+        "--output",
+        required=True,
+        type=argparse.FileType("w"),
+        help="path to save json output",
+    )
+    args = parser.parse_args()
+    sp = spm.SentencePieceProcessor()
+    sp.Load(args.spm_model.name)
+    tgt_dict = Dictionary.load(args.dictionary)
+    labels = {}
+    for line in args.labels:
+        (utt_id, label) = line.split(" ", 1)
+        # Drop the line terminator so it does not end up in the label text.
+        labels[utt_id] = label.rstrip("\r\n")
+    if len(labels) == 0:
+        # The option is --labels; the parsed namespace has no "labels_path".
+        raise Exception("No labels found in ", args.labels.name)
+    Sample = namedtuple("Sample", "aud_path utt_id")
+    samples = []
+    for path, _, files in chain.from_iterable(
+        os.walk(path) for path in args.audio_dirs
+    ):
+        for f in files:
+            if not f.endswith(args.audio_format):
+                continue
+            # os.path.splitext always returns a pair, so the former check of
+            # its length could never fail and has been dropped.
+            utt_id = os.path.splitext(f)[0]
+            if utt_id not in labels:
+                continue
+            samples.append(Sample(os.path.join(path, f), utt_id))
+    utts = {}
+    num_cpu = multiprocessing.cpu_count()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpu) as executor:
+        future_to_sample = {
+            executor.submit(
+                process_sample, s.aud_path, labels[s.utt_id], s.utt_id, sp, tgt_dict
+            ): s
+            for s in samples
+        }
+        for future in concurrent.futures.as_completed(future_to_sample):
+            # A failing sample is reported and left out of the manifest; the
+            # remaining samples are still written.
+            try:
+                data = future.result()
+            except Exception as exc:
+                print("generated an exception: ", exc)
+            else:
+                utts.update(data)
+    json.dump({"utts": utts}, args.output, indent=4)
+if __name__ == "__main__":
+    main()
--- a/examples/speech_recognition/datasets/prepare-librispeech.sh
+++ b/examples/speech_recognition/datasets/prepare-librispeech.sh
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# Prepare librispeech dataset
+base_url=www.openslr.org/resources/12
+train_dir=train_960
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <download_dir> <out_dir>"
+  echo "e.g.: $0 /tmp/librispeech_raw/ ~/data/librispeech_final"
+  exit 1
+fi
+download_dir=${1%/}
+out_dir=${2%/}
+fairseq_root=~/fairseq-py/
+mkdir -p ${out_dir}
+cd ${out_dir} || exit
+nbpe=5000
+bpemode=unigram
+if [ ! -d "$fairseq_root" ]; then
+    echo "$0: Please set correct fairseq_root"
+    exit 1
+fi
+echo "Data Download"
+for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
+    url=$base_url/$part.tar.gz
+    if ! wget -P $download_dir $url; then
+        echo "$0: wget failed for $url"
+        exit 1
+    fi
+    if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then
+        echo "$0: error un-tarring archive $download_dir/$part.tar.gz"
+        exit 1
+    fi
+done
+echo "Merge all train packs into one"
+mkdir -p ${download_dir}/LibriSpeech/${train_dir}/
+for part in train-clean-100 train-clean-360 train-other-500; do
+    mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/
+done
+echo "Merge train text"
+find ${download_dir}/LibriSpeech/${train_dir}/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/${train_dir}/text
+# Use combined dev-clean and dev-other as validation set
+find ${download_dir}/LibriSpeech/dev-clean/ ${download_dir}/LibriSpeech/dev-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/valid_text
+find ${download_dir}/LibriSpeech/test-clean/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-clean/text
+find ${download_dir}/LibriSpeech/test-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-other/text
+dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_units.txt
+encoded=data/lang_char/${train_dir}_${bpemode}${nbpe}_encoded.txt
+fairseq_dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_fairseq_dict.txt
+bpemodel=data/lang_char/${train_dir}_${bpemode}${nbpe}
+echo "dictionary: ${dict}"
+echo "Dictionary preparation"
+mkdir -p data/lang_char/
+echo "<unk> 3" > ${dict}
+echo "</s> 2" >> ${dict}
+echo "<pad> 1" >> ${dict}
+cut -f 2- -d" " ${download_dir}/LibriSpeech/${train_dir}/text > data/lang_char/input.txt
+spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --unk_id=3 --eos_id=2 --pad_id=1 --bos_id=-1 --character_coverage=1
+spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt > ${encoded}
+cat ${encoded} | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+3}' >> ${dict}
+cat ${encoded} | tr ' ' '\n' | sort | uniq -c | awk '{print $2 " " $1}' > ${fairseq_dict}
+wc -l ${dict}
+echo "Prepare train and test jsons"
+for part in train_960 test-other test-clean; do
+    python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/${part} --labels ${download_dir}/LibriSpeech/${part}/text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output ${part}.json
+done
+# fairseq expects to find train.json and valid.json during training
+mv train_960.json train.json
+echo "Prepare valid json"
+python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/dev-clean ${download_dir}/LibriSpeech/dev-other --labels ${download_dir}/LibriSpeech/valid_text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output valid.json
+cp ${fairseq_dict} ./dict.txt
+cp ${bpemodel}.model ./spm.model
--- a/examples/speech_recognition/infer.py
+++ b/examples/speech_recognition/infer.py
+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Run inference for pre-processed data with a trained model.
+"""
+import logging
+import math
+import os
+import sys
+import editdistance
+import numpy as np
+import torch
+from fairseq import checkpoint_utils, options, progress_bar, tasks, utils
+from fairseq.data.data_utils import post_process
+from fairseq.logging.meters import StopwatchMeter, TimeMeter
+logging.basicConfig()
+logging.root.setLevel(logging.INFO)
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def add_asr_eval_argument(parser):
+    parser.add_argument("--kspmodel", default=None, help="sentence piece model")
+    parser.add_argument(
+        "--wfstlm", default=None, help="wfstlm on dictonary output units"
+    )
+    parser.add_argument(
+        "--rnnt_decoding_type",
+        default="greedy",
+        help="wfstlm on dictonary\
+output units",
+    )
+    try:
+        parser.add_argument(
+            "--lm-weight",
+            "--lm_weight",
+            type=float,
+            default=0.2,
+            help="weight for lm while interpolating with neural score",
+        )
+    except:
+        pass
+    parser.add_argument(
+        "--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level"
+    )
+    parser.add_argument(
+        "--w2l-decoder",
+        choices=["viterbi", "kenlm", "fairseqlm"],
+        help="use a w2l decoder",
+    )
+    parser.add_argument("--lexicon", help="lexicon for w2l decoder")
+    parser.add_argument("--unit-lm", action="store_true", help="if using a unit lm")
+    parser.add_argument("--kenlm-model", "--lm-model", help="lm model for w2l decoder")
+    parser.add_argument("--beam-threshold", type=float, default=25.0)
+    parser.add_argument("--beam-size-token", type=float, default=100)
+    parser.add_argument("--word-score", type=float, default=1.0)
+    parser.add_argument("--unk-weight", type=float, default=-math.inf)
+    parser.add_argument("--sil-weight", type=float, default=0.0)
+    parser.add_argument(
+        "--dump-emissions",
+        type=str,
+        default=None,
+        help="if present, dumps emissions into this file and exits",
+    )
+    parser.add_argument(
+        "--dump-features",
+        type=str,
+        default=None,
+        help="if present, dumps features into this file and exits",
+    )
+    parser.add_argument(
+        "--load-emissions",
+        type=str,
+        default=None,
+        help="if present, loads emissions from this file",
+    )
+    return parser
+def check_args(args):
+    # assert args.path is not None, "--path required for generation!"
+    # assert args.results_path is not None, "--results_path required for generation!"
+    assert (
+        not args.sampling or args.nbest == args.beam
+    ), "--sampling requires --nbest to be equal to --beam"
+    assert (
+        args.replace_unk is None or args.raw_text
+    ), "--replace-unk requires a raw text dataset (--raw-text)"
+def get_dataset_itr(args, task, models):
+    return task.get_batch_iterator(
+        dataset=task.dataset(args.gen_subset),
+        max_tokens=args.max_tokens,
+        max_sentences=args.batch_size,
+        max_positions=(sys.maxsize, sys.maxsize),
+        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
+        required_batch_size_multiple=args.required_batch_size_multiple,
+        num_shards=args.num_shards,
+        shard_id=args.shard_id,
+        num_workers=args.num_workers,
+        data_buffer_size=args.data_buffer_size,
+    ).next_epoch_itr(shuffle=False)
+def process_predictions(
+    args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id
+):
+    for hypo in hypos[: min(len(hypos), args.nbest)]:
+        hyp_pieces = tgt_dict.string(hypo["tokens"].int().cpu())
+        if "words" in hypo:
+            hyp_words = " ".join(hypo["words"])
+        else:
+            hyp_words = post_process(hyp_pieces, args.remove_bpe)
+        if res_files is not None:
+            print(
+                "{} ({}-{})".format(hyp_pieces, speaker, id),
+                file=res_files["hypo.units"],
+            )
+            print(
+                "{} ({}-{})".format(hyp_words, speaker, id),
+                file=res_files["hypo.words"],
+            )
+        tgt_pieces = tgt_dict.string(target_tokens)
+        tgt_words = post_process(tgt_pieces, args.remove_bpe)
+        if res_files is not None:
+            print(
+                "{} ({}-{})".format(tgt_pieces, speaker, id),
+                file=res_files["ref.units"],
+            )
+            print(
+                "{} ({}-{})".format(tgt_words, speaker, id), file=res_files["ref.words"]
+            )
+            # only score top hypothesis
+            if not args.quiet:
+                logger.debug("HYPO:" + hyp_words)
+                logger.debug("TARGET:" + tgt_words)
+                logger.debug("___________________")
+        hyp_words = hyp_words.split()
+        tgt_words = tgt_words.split()
+        return editdistance.eval(hyp_words, tgt_words), len(tgt_words)
+def prepare_result_files(args):
+    def get_res_file(file_prefix):
+        if args.num_shards > 1:
+            file_prefix = f"{args.shard_id}_{file_prefix}"
+        path = os.path.join(
+            args.results_path,
+            "{}-{}-{}.txt".format(
+                file_prefix, os.path.basename(args.path), args.gen_subset
+            ),
+        )
+        return open(path, "w", buffering=1)
+    if not args.results_path:
+        return None
+    return {
+        "hypo.words": get_res_file("hypo.word"),
+        "hypo.units": get_res_file("hypo.units"),
+        "ref.words": get_res_file("ref.word"),
+        "ref.units": get_res_file("ref.units"),
+    }
+def load_models_and_criterions(
+    filenames, data_path, arg_overrides=None, task=None, model_state=None
+):
+    models = []
+    criterions = []
+    if arg_overrides is None:
+        arg_overrides = {}
+    arg_overrides["wer_args"] = None
+    arg_overrides["data"] = data_path
+    if filenames is None:
+        assert model_state is not None
+        filenames = [0]
+    else:
+        filenames = filenames.split(":")
+    for filename in filenames:
+        if model_state is None:
+            if not os.path.exists(filename):
+                raise IOError("Model file not found: {}".format(filename))
+            state = checkpoint_utils.load_checkpoint_to_cpu(filename, arg_overrides)
+        else:
+            state = model_state
+        args = state["args"]
+        if task is None:
+            task = tasks.setup_task(args)
+        model = task.build_model(args)
+        model.load_state_dict(state["model"], strict=True)
+        models.append(model)
+        criterion = task.build_criterion(args)
+        if "criterion" in state:
+            criterion.load_state_dict(state["criterion"], strict=True)
+        criterions.append(criterion)
+    return models, criterions, args
+def optimize_models(args, use_cuda, models):
+    """Optimize ensemble for generation"""
+    for model in models:
+        model.make_generation_fast_(
+            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
+            need_attn=args.print_alignment,
+        )
+        if args.fp16:
+            model.half()
+        if use_cuda:
+            model.cuda()
+class ExistingEmissionsDecoder(object):
+    def __init__(self, decoder, emissions):
+        self.decoder = decoder
+        self.emissions = emissions
+    def generate(self, models, sample, **unused):
+        ids = sample["id"].cpu().numpy()
+        try:
+            emissions = np.stack(self.emissions[ids])
+        except:
+            print([x.shape for x in self.emissions[ids]])
+            raise Exception("invalid sizes")
+        emissions = torch.from_numpy(emissions)
+        return self.decoder.decode(emissions)
+def main(args, task=None, model_state=None):
+    check_args(args)
+    if args.max_tokens is None and args.batch_size is None:
+        args.max_tokens = 4000000
+    logger.info(args)
+    use_cuda = torch.cuda.is_available() and not args.cpu
+    if task is None:
+        # Load dataset splits
+        task = tasks.setup_task(args)
+        task.load_dataset(args.gen_subset)
+        logger.info(
+            "| {} {} {} examples".format(
+                args.data, args.gen_subset, len(task.dataset(args.gen_subset))
+            )
+        )
+    # Set dictionary
+    tgt_dict = task.target_dictionary
+    logger.info("| decoding with criterion {}".format(args.criterion))
+    # Load ensemble
+    if args.load_emissions:
+        models, criterions = [], []
+    else:
+        logger.info("| loading model(s) from {}".format(args.path))
+        models, criterions, _ = load_models_and_criterions(
+            args.path,
+            data_path=args.data,
+            arg_overrides=eval(args.model_overrides),  # noqa
+            task=task,
+            model_state=model_state,
+        )
+        optimize_models(args, use_cuda, models)
+    # hack to pass transitions to W2lDecoder
+    if args.criterion == "asg_loss":
+        trans = criterions[0].asg.trans.data
+        args.asg_transitions = torch.flatten(trans).tolist()
+    # Load dataset (possibly sharded)
+    itr = get_dataset_itr(args, task, models)
+    # Initialize generator
+    gen_timer = StopwatchMeter()
+    def build_generator(args):
+        w2l_decoder = getattr(args, "w2l_decoder", None)
+        if w2l_decoder == "viterbi":
+            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
+            return W2lViterbiDecoder(args, task.target_dictionary)
+        elif w2l_decoder == "kenlm":
+            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder
+            return W2lKenLMDecoder(args, task.target_dictionary)
+        elif w2l_decoder == "fairseqlm":
+            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder
+            return W2lFairseqLMDecoder(args, task.target_dictionary)
+        else:
+            print(
+                "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment"
+            )
+    # please do not touch this unless you test both generate.py and infer.py with audio_pretraining task
+    generator = build_generator(args)
+    if args.load_emissions:
+        generator = ExistingEmissionsDecoder(
+            generator, np.load(args.load_emissions, allow_pickle=True)
+        )
+        logger.info("loaded emissions from " + args.load_emissions)
+    num_sentences = 0
+    if args.results_path is not None and not os.path.exists(args.results_path):
+        os.makedirs(args.results_path)
+    max_source_pos = (
+        utils.resolve_max_positions(
+            task.max_positions(), *[model.max_positions() for model in models]
+        ),
+    )
+    if max_source_pos is not None:
+        max_source_pos = max_source_pos[0]
+        if max_source_pos is not None:
+            max_source_pos = max_source_pos[0] - 1
+    if args.dump_emissions:
+        emissions = {}
+    if args.dump_features:
+        features = {}
+        models[0].bert.proj = None
+    else:
+        res_files = prepare_result_files(args)
+    errs_t = 0
+    lengths_t = 0
+    with progress_bar.build_progress_bar(args, itr) as t:
+        wps_meter = TimeMeter()
+        for sample in t:
+            sample = utils.move_to_cuda(sample) if use_cuda else sample
+            if "net_input" not in sample:
+                continue
+            prefix_tokens = None
+            if args.prefix_size > 0:
+                prefix_tokens = sample["target"][:, : args.prefix_size]
+            gen_timer.start()
+            if args.dump_emissions:
+                with torch.no_grad():
+                    encoder_out = models[0](**sample["net_input"])
+                    emm = models[0].get_normalized_probs(encoder_out, log_probs=True)
+                    emm = emm.transpose(0, 1).cpu().numpy()
+                    for i, id in enumerate(sample["id"]):
+                        emissions[id.item()] = emm[i]
+                    continue
+            elif args.dump_features:
+                with torch.no_grad():
+                    encoder_out = models[0](**sample["net_input"])
+                    feat = encoder_out["encoder_out"].transpose(0, 1).cpu().numpy()
+                    for i, id in enumerate(sample["id"]):
+                        padding = (
+                            encoder_out["encoder_padding_mask"][i].cpu().numpy()
+                            if encoder_out["encoder_padding_mask"] is not None
+                            else None
+                        )
+                        features[id.item()] = (feat[i], padding)
+                    continue
+            hypos = task.inference_step(generator, models, sample, prefix_tokens)
+            num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
+            gen_timer.stop(num_generated_tokens)
+            for i, sample_id in enumerate(sample["id"].tolist()):
+                speaker = None
+                # id = task.dataset(args.gen_subset).ids[int(sample_id)]
+                id = sample_id
+                toks = (
+                    sample["target"][i, :]
+                    if "target_label" not in sample
+                    else sample["target_label"][i, :]
+                )
+                target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu()
+                # Process top predictions
+                errs, length = process_predictions(
+                    args,
+                    hypos[i],
+                    None,
+                    tgt_dict,
+                    target_tokens,
+                    res_files,
+                    speaker,
+                    id,
+                )
+                errs_t += errs
+                lengths_t += length
+            wps_meter.update(num_generated_tokens)
+            t.log({"wps": round(wps_meter.avg)})
+            num_sentences += (
+                sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
+            )
+    wer = None
+    if args.dump_emissions:
+        emm_arr = []
+        for i in range(len(emissions)):
+            emm_arr.append(emissions[i])
+        np.save(args.dump_emissions, emm_arr)
+        logger.info(f"saved {len(emissions)} emissions to {args.dump_emissions}")
+    elif args.dump_features:
+        feat_arr = []
+        for i in range(len(features)):
+            feat_arr.append(features[i])
+        np.save(args.dump_features, feat_arr)
+        logger.info(f"saved {len(features)} emissions to {args.dump_features}")
+    else:
+        if lengths_t > 0:
+            wer = errs_t * 100.0 / lengths_t
+            logger.info(f"WER: {wer}")
+        logger.info(
+            "| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}"
+            "sentences/s, {:.2f} tokens/s)".format(
+                num_sentences,
+                gen_timer.n,
+                gen_timer.sum,
+                num_sentences / gen_timer.sum,
+                1.0 / gen_timer.avg,
+            )
+        )
+        logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam))
+    return task, wer
+def make_parser():
+    parser = options.get_generation_parser()
+    parser = add_asr_eval_argument(parser)
+    return parser
+def cli_main():
+    parser = make_parser()
+    args = options.parse_args_and_arch(parser)
+    main(args)
+# Run inference only when this file is executed as a script.
+if __name__ == "__main__":
+    cli_main()
--- a/examples/speech_recognition/models/__init__.py
+++ b/examples/speech_recognition/models/__init__.py
+import importlib
+import os
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith(".py") and not file.startswith("_"):
+        model_name = file[: file.find(".py")]
+        importlib.import_module("examples.speech_recognition.models." + model_name)
--- a/examples/speech_recognition/models/vggtransformer.py
+++ b/examples/speech_recognition/models/vggtransformer.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+import math
+from collections.abc import Iterable
+import torch
+import torch.nn as nn
+from examples.speech_recognition.data.data_utils import lengths_to_encoder_padding_mask
+from fairseq import utils
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderDecoderModel,
+    FairseqEncoderModel,
+    FairseqIncrementalDecoder,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules import (
+    LinearizedConvolution,
+    TransformerDecoderLayer,
+    TransformerEncoderLayer,
+    VGGBlock,
+)
+@register_model("asr_vggtransformer")
+class VGGTransformerModel(FairseqEncoderDecoderModel):
+    """
+    Transformers with convolutional context for ASR
+    https://arxiv.org/abs/1904.11660
+    """
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument(
+            "--input-feat-per-channel",
+            type=int,
+            metavar="N",
+            help="encoder input dimension per input channel",
+        )
+        parser.add_argument(
+            "--vggblock-enc-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    an array of tuples each containing the configuration of one vggblock:
+    [(out_channels,
+      conv_kernel_size,
+      pooling_kernel_size,
+      num_conv_layers,
+      use_layer_norm), ...])
+            """,
+        )
+        parser.add_argument(
+            "--transformer-enc-config",
+            type=str,
+            metavar="EXPR",
+            help=""""
+    a tuple containing the configuration of the encoder transformer layers
+    configurations:
+    [(input_dim,
+      num_heads,
+      ffn_dim,
+      normalize_before,
+      dropout,
+      attention_dropout,
+      relu_dropout), ...]')
+            """,
+        )
+        parser.add_argument(
+            "--enc-output-dim",
+            type=int,
+            metavar="N",
+            help="""
+    encoder output dimension, can be None. If specified, projecting the
+    transformer output to the specified dimension""",
+        )
+        parser.add_argument(
+            "--in-channels",
+            type=int,
+            metavar="N",
+            help="number of encoder input channels",
+        )
+        parser.add_argument(
+            "--tgt-embed-dim",
+            type=int,
+            metavar="N",
+            help="embedding dimension of the decoder target tokens",
+        )
+        parser.add_argument(
+            "--transformer-dec-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    a tuple containing the configuration of the decoder transformer layers
+    configurations:
+    [(input_dim,
+      num_heads,
+      ffn_dim,
+      normalize_before,
+      dropout,
+      attention_dropout,
+      relu_dropout), ...]
+            """,
+        )
+        parser.add_argument(
+            "--conv-dec-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    an array of tuples for the decoder 1-D convolution config
+        [(out_channels, conv_kernel_size, use_layer_norm), ...]""",
+        )
+    @classmethod
+    def build_encoder(cls, args, task):
+        return VGGTransformerEncoder(
+            input_feat_per_channel=args.input_feat_per_channel,
+            vggblock_config=eval(args.vggblock_enc_config),
+            transformer_config=eval(args.transformer_enc_config),
+            encoder_output_dim=args.enc_output_dim,
+            in_channels=args.in_channels,
+        )
+    @classmethod
+    def build_decoder(cls, args, task):
+        return TransformerDecoder(
+            dictionary=task.target_dictionary,
+            embed_dim=args.tgt_embed_dim,
+            transformer_config=eval(args.transformer_dec_config),
+            conv_config=eval(args.conv_dec_config),
+            encoder_output_dim=args.enc_output_dim,
+        )
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # make sure that all args are properly defaulted
+        # (in case there are any new ones)
+        base_architecture(args)
+        encoder = cls.build_encoder(args, task)
+        decoder = cls.build_decoder(args, task)
+        return cls(encoder, decoder)
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        # net_output['encoder_out'] is a (B, T, D) tensor
+        lprobs = super().get_normalized_probs(net_output, log_probs, sample)
+        lprobs.batch_first = True
+        return lprobs
+# Each VGG block tuple is (out_channels, conv_kernel_size, pooling_kernel_size,
+# num_conv_layers, layer_norm).
+DEFAULT_ENC_VGGBLOCK_CONFIG = ((32, 3, 2, 2, False),) * 2
+DEFAULT_ENC_TRANSFORMER_CONFIG = ((256, 4, 1024, True, 0.2, 0.2, 0.2),) * 2
+# Fields of each transformer layer tuple, in order:
+# 256: embedding dimension
+# 4: number of heads
+# 1024: FFN dimension
+# True: apply LayerNorm before (dropout + residual) instead of after
+# 0.2 (dropout): dropout after MultiheadAttention and second FC
+# 0.2 (attention_dropout): dropout in MultiheadAttention
+# 0.2 (relu_dropout): dropout after ReLU
+DEFAULT_DEC_TRANSFORMER_CONFIG = ((256, 2, 1024, True, 0.2, 0.2, 0.2),) * 2
+# Each decoder conv tuple is (out_channels, conv_kernel_size, use_layer_norm).
+DEFAULT_DEC_CONV_CONFIG = ((256, 3, True),) * 2
+# TODO: replace transformer encoder config from one liner
+# to explicit args to get rid of this transformation
+def prepare_transformer_encoder_params(
+    input_dim,
+    num_heads,
+    ffn_dim,
+    normalize_before,
+    dropout,
+    attention_dropout,
+    relu_dropout,
+):
+    args = argparse.Namespace()
+    args.encoder_embed_dim = input_dim
+    args.encoder_attention_heads = num_heads
+    args.attention_dropout = attention_dropout
+    args.dropout = dropout
+    args.activation_dropout = relu_dropout
+    args.encoder_normalize_before = normalize_before
+    args.encoder_ffn_embed_dim = ffn_dim
+    return args
+def prepare_transformer_decoder_params(
+    input_dim,
+    num_heads,
+    ffn_dim,
+    normalize_before,
+    dropout,
+    attention_dropout,
+    relu_dropout,
+):
+    args = argparse.Namespace()
+    args.decoder_embed_dim = input_dim
+    args.decoder_attention_heads = num_heads
+    args.attention_dropout = attention_dropout
+    args.dropout = dropout
+    args.activation_dropout = relu_dropout
+    args.decoder_normalize_before = normalize_before
+    args.decoder_ffn_embed_dim = ffn_dim
+    return args
+class VGGTransformerEncoder(FairseqEncoder):
+    """VGG + Transformer encoder"""
+    def __init__(
+        self,
+        input_feat_per_channel,
+        vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG,
+        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
+        encoder_output_dim=512,
+        in_channels=1,
+        transformer_context=None,
+        transformer_sampling=None,
+    ):
+        """constructor for VGGTransformerEncoder
+        Args:
+            - input_feat_per_channel: feature dim (not including stacked,
+              just base feature)
+            - in_channel: # input channels (e.g., if stack 8 feature vector
+                together, this is 8)
+            - vggblock_config: configuration of vggblock, see comments on
+                DEFAULT_ENC_VGGBLOCK_CONFIG
+            - transformer_config: configuration of transformer layer, see comments
+                on DEFAULT_ENC_TRANSFORMER_CONFIG
+            - encoder_output_dim: final transformer output embedding dimension
+            - transformer_context: (left, right) if set, self-attention will be focused
+              on (t-left, t+right)
+            - transformer_sampling: an iterable of int, must match with
+              len(transformer_config), transformer_sampling[i] indicates sampling
+              factor for i-th transformer layer, after multihead att and feedfoward
+              part
+        """
+        super().__init__(None)
+        self.num_vggblocks = 0
+        if vggblock_config is not None:
+            if not isinstance(vggblock_config, Iterable):
+                raise ValueError("vggblock_config is not iterable")
+            self.num_vggblocks = len(vggblock_config)
+        self.conv_layers = nn.ModuleList()
+        self.in_channels = in_channels
+        self.input_dim = input_feat_per_channel
+        self.pooling_kernel_sizes = []
+        if vggblock_config is not None:
+            for _, config in enumerate(vggblock_config):
+                (
+                    out_channels,
+                    conv_kernel_size,
+                    pooling_kernel_size,
+                    num_conv_layers,
+                    layer_norm,
+                ) = config
+                self.conv_layers.append(
+                    VGGBlock(
+                        in_channels,
+                        out_channels,
+                        conv_kernel_size,
+                        pooling_kernel_size,
+                        num_conv_layers,
+                        input_dim=input_feat_per_channel,
+                        layer_norm=layer_norm,
+                    )
+                )
+                self.pooling_kernel_sizes.append(pooling_kernel_size)
+                in_channels = out_channels
+                input_feat_per_channel = self.conv_layers[-1].output_dim
+        transformer_input_dim = self.infer_conv_output_dim(
+            self.in_channels, self.input_dim
+        )
+        # transformer_input_dim is the output dimension of VGG part
+        self.validate_transformer_config(transformer_config)
+        self.transformer_context = self.parse_transformer_context(transformer_context)
+        self.transformer_sampling = self.parse_transformer_sampling(
+            transformer_sampling, len(transformer_config)
+        )
+        self.transformer_layers = nn.ModuleList()
+        if transformer_input_dim != transformer_config[0][0]:
+            self.transformer_layers.append(
+                Linear(transformer_input_dim, transformer_config[0][0])
+            )
+        self.transformer_layers.append(
+            TransformerEncoderLayer(
+                prepare_transformer_encoder_params(*transformer_config[0])
+            )
+        )
+        for i in range(1, len(transformer_config)):
+            if transformer_config[i - 1][0] != transformer_config[i][0]:
+                self.transformer_layers.append(
+                    Linear(transformer_config[i - 1][0], transformer_config[i][0])
+                )
+            self.transformer_layers.append(
+                TransformerEncoderLayer(
+                    prepare_transformer_encoder_params(*transformer_config[i])
+                )
+            )
+        self.encoder_output_dim = encoder_output_dim
+        self.transformer_layers.extend(
+            [
+                Linear(transformer_config[-1][0], encoder_output_dim),
+                LayerNorm(encoder_output_dim),
+            ]
+        )
+    def forward(self, src_tokens, src_lengths, **kwargs):
+        """
+        Run the conv front-end followed by the transformer stack.
+        src_tokens: padded tensor (B, T, C * feat)
+        src_lengths: tensor of original lengths of input utterances (B,)
+        Returns a dict with "encoder_out" (T, B, C) and "encoder_padding_mask"
+        ((T, B), or None when no position is padded).
+        """
+        batch_size, max_seq_len, _ = src_tokens.size()
+        # (B, T, C * feat) -> (B, T, C, feat) -> (B, C, T, feat)
+        x = src_tokens.view(batch_size, max_seq_len, self.in_channels, self.input_dim)
+        x = x.transpose(1, 2).contiguous()
+        for conv in self.conv_layers:
+            x = conv(x)
+        batch_size, _, output_seq_len, _ = x.size()
+        # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) -> (T, B, C * feat)
+        x = x.transpose(1, 2).transpose(0, 1).contiguous()
+        x = x.view(output_seq_len, batch_size, -1)
+        # shrink the lengths by a ceil-division for every pooling kernel size
+        input_lengths = src_lengths.clone()
+        for kernel in self.pooling_kernel_sizes:
+            input_lengths = (input_lengths.float() / kernel).ceil().long()
+        encoder_padding_mask, _ = lengths_to_encoder_padding_mask(
+            input_lengths, batch_first=True
+        )
+        # drop the mask entirely when no position is padded
+        if not encoder_padding_mask.any():
+            encoder_padding_mask = None
+        # rounded ratio between input and output time steps
+        subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)
+        attn_mask = self.lengths_to_attn_mask(input_lengths, subsampling_factor)
+        # transformer_sampling is indexed by transformer layers only, so the
+        # interleaved adapter layers must not advance this counter
+        num_transformer_seen = 0
+        for layer in self.transformer_layers:
+            if not isinstance(layer, TransformerEncoderLayer):
+                x = layer(x)
+                continue
+            x = layer(x, encoder_padding_mask, attn_mask)
+            sampling_factor = self.transformer_sampling[num_transformer_seen]
+            if sampling_factor != 1:
+                x, encoder_padding_mask, attn_mask = self.slice(
+                    x, encoder_padding_mask, attn_mask, sampling_factor
+                )
+            num_transformer_seen += 1
+        # encoder_padding_mask is returned as a (T x B) tensor, its [t, b]
+        # elements indicate whether encoder_output[t, b] is valid or not
+        # (valid=0, invalid=1)
+        if encoder_padding_mask is None:
+            padding_mask_tb = None
+        else:
+            padding_mask_tb = encoder_padding_mask.t()  # (B, T) --> (T, B)
+        return {
+            "encoder_out": x,  # (T, B, C)
+            "encoder_padding_mask": padding_mask_tb,
+        }
+    def infer_conv_output_dim(self, in_channels, input_dim):
+        """
+        Return the flattened per-frame feature size produced by the conv
+        layers, measured by pushing a random (10, in_channels, 200, input_dim)
+        probe batch through ``self.conv_layers``.
+        """
+        probe = torch.randn(10, in_channels, 200, input_dim)
+        for conv in self.conv_layers:
+            probe = conv(probe)
+        # swap dims 1 and 2, then merge everything after the first two dims
+        probe = probe.transpose(1, 2).contiguous()
+        flattened = probe.view(probe.size(0), probe.size(1), -1)
+        return flattened.size(-1)
+    def validate_transformer_config(self, transformer_config):
+        """
+        Check that, for every layer config, the first entry (input dimension)
+        is divisible by the second entry (number of heads).
+        Raises:
+            ValueError: for the first config that violates the constraint
+        """
+        for layer_config in transformer_config:
+            embed_dim, heads = layer_config[:2]
+            if embed_dim % heads == 0:
+                continue
+            raise ValueError(
+                "ERROR in transformer config {}: input dimension {} "
+                "not dividable by number of heads {}".format(
+                    layer_config, embed_dim, heads
+                )
+            )
+    def parse_transformer_context(self, transformer_context):
+        """
+        Normalize the left/right attention context specification.
+        transformer_context can be the following:
+        -   None; indicates no context is used, i.e.,
+            transformer can access full context
+        -   a tuple/list of two int; indicates left and right context,
+            any number <0 indicates infinite context
+                * e.g., (5, 6) indicates that for query at x_t, transformer can
+                access [t-5, t+6] (inclusive)
+                * e.g., (-1, 6) indicates that for query at x_t, transformer can
+                access [0, t+6] (inclusive)
+        Returns None when both sides are unlimited, otherwise a
+        (left_context, right_context) tuple in which an unlimited side is None.
+        Raises ValueError for a non-iterable value or a length other than 2.
+        """
+        if transformer_context is None:
+            return None
+        if not isinstance(transformer_context, Iterable):
+            raise ValueError("transformer context must be Iterable if it is not None")
+        if len(transformer_context) != 2:
+            raise ValueError("transformer context must have length 2")
+        # a negative bound means "unlimited" and is represented as None
+        left = transformer_context[0]
+        left = None if left < 0 else left
+        right = transformer_context[1]
+        right = None if right < 0 else right
+        if left is None and right is None:
+            return None
+        return (left, right)
+    def parse_transformer_sampling(self, transformer_sampling, num_layers):
+        """
+        parsing transformer sampling configuration
+        Args:
+            - transformer_sampling, accepted input:
+                * None, indicating no sampling
+                * an Iterable with int (>0) as element
+            - num_layers, expected number of transformer layers, must match with
+              the length of transformer_sampling if it is not None
+        Returns:
+            - (1,) * num_layers when transformer_sampling is None, otherwise
+              the validated transformer_sampling object itself (unchanged)
+        Raises:
+            - ValueError if transformer_sampling is not iterable, has the wrong
+              length, or contains an element that is not an int >= 1
+        """
+        if transformer_sampling is None:
+            return (1,) * num_layers
+        if not isinstance(transformer_sampling, Iterable):
+            raise ValueError(
+                "transformer_sampling must be an iterable if it is not None"
+            )
+        if len(transformer_sampling) != num_layers:
+            raise ValueError(
+                "transformer_sampling {} does not match with the number "
+                "of layers {}".format(transformer_sampling, num_layers)
+            )
+        for layer, value in enumerate(transformer_sampling):
+            if not isinstance(value, int):
+                # name the offending entry; the message used to stop at the colon
+                raise ValueError(
+                    "Invalid value in transformer_sampling: "
+                    "{!r} (layer {})".format(value, layer)
+                )
+            if value < 1:
+                raise ValueError(
+                    "{} layer's subsampling is {}.".format(layer, value)
+                    + " This is not allowed! "
+                )
+        return transformer_sampling
+    def slice(self, embedding, padding_mask, attn_mask, sampling_factor):
+        """
+        Keep every ``sampling_factor``-th time step of the inputs.
+        embedding is a (T, B, D) tensor
+        padding_mask is a (B, T) tensor or None
+        attn_mask is a (T, T) tensor or None
+        Returns the strided (embedding, padding_mask, attn_mask); masks that
+        are None stay None.
+        """
+        strided_embedding = embedding[::sampling_factor, :, :]
+        strided_padding = (
+            None if padding_mask is None else padding_mask[:, ::sampling_factor]
+        )
+        strided_attn = (
+            None
+            if attn_mask is None
+            else attn_mask[::sampling_factor, ::sampling_factor]
+        )
+        return strided_embedding, strided_padding, strided_attn
+    def lengths_to_attn_mask(self, input_lengths, subsampling_factor=1):
+        """
+        create attention mask according to sequence lengths and transformer
+        context
+        Args:
+            - input_lengths: (B, )-shape Int/Long tensor; input_lengths[b] is
+              the length of b-th sequence
+            - subsampling_factor: int
+                * Note that the left_context and right_context is specified in
+                  the input frame-level while input to transformer may already
+                  go through subsampling (e.g., the use of striding in vggblock)
+                  we use subsampling_factor to scale the left/right context
+        Return:
+            - a (T, T) 0/1 tensor (default float dtype, on the device of
+              input_lengths) or None, where T is max(input_lengths)
+                * if self.transformer_context is None, None
+                * if left_context is None,
+                    * attn_mask[t, t + right_context + 1:] = 1
+                    * others = 0
+                * if right_context is None,
+                    * attn_mask[t, 0:t - left_context] = 1
+                    * others = 0
+                * elsif
+                    * attn_mask[t, t - left_context: t + right_context + 1] = 0
+                    * others = 1
+        Note: the start of the right-hand masked range used to be clamped to
+        T - 1, which always masked the last column for the final rows (so the
+        last frame could not attend to itself). The range is now left
+        unclamped, matching the rules listed above.
+        """
+        if self.transformer_context is None:
+            return None
+        max_len = int(torch.max(input_lengths).item())
+        left_context = self.transformer_context[0]
+        right_context = self.transformer_context[1]
+        positions = torch.arange(max_len, device=input_lengths.device)
+        # offset[t, s] = s - t, i.e. how far key s lies to the right of query t
+        offset = positions.unsqueeze(0) - positions.unsqueeze(1)
+        masked = torch.zeros(
+            max_len, max_len, dtype=torch.bool, device=input_lengths.device
+        )
+        if left_context is not None:
+            # contexts are given in input frames; rescale to subsampled frames
+            left_context = math.ceil(left_context / subsampling_factor)
+            masked |= offset < -left_context
+        if right_context is not None:
+            right_context = math.ceil(right_context / subsampling_factor)
+            masked |= offset > right_context
+        return masked.to(torch.get_default_dtype())
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """
+        Reorder the encoder output dict in place along dim 1 (the batch
+        dimension of the (T, B, ...) tensors) according to ``new_order`` and
+        return the same dict.
+        """
+        reordered = encoder_out["encoder_out"].index_select(1, new_order)
+        encoder_out["encoder_out"] = reordered
+        padding_mask = encoder_out["encoder_padding_mask"]
+        if padding_mask is not None:
+            encoder_out["encoder_padding_mask"] = padding_mask.index_select(
+                1, new_order
+            )
+        return encoder_out
+class TransformerDecoder(FairseqIncrementalDecoder):
+    """
+    Decoder made of a token embedding, a stack of :func:`LinearizedConv1d`
+    layers (each optionally followed by LayerNorm, always by ReLU), a stack of
+    :class:`TransformerDecoderLayer` layers and a final vocabulary projection.
+    Linear adapters are inserted wherever consecutive stages differ in width.
+    Args:
+        dictionary (~fairseq.data.Dictionary): decoding dictionary
+        embed_dim (int, optional): target embedding dimension. Default: 512
+        transformer_config (sequence of tuples, optional): one tuple per
+            transformer layer; each tuple is unpacked into
+            ``prepare_transformer_decoder_params`` and its first element is
+            used as that layer's width
+        conv_config (sequence of tuples, optional): one
+            ``(out_channels, kernel_size, layer_norm)`` tuple per conv layer
+        encoder_output_dim (int, optional): accepted but not used by this
+            class. Default: 512
+    """
+    def __init__(
+        self,
+        dictionary,
+        embed_dim=512,
+        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
+        conv_config=DEFAULT_DEC_CONV_CONFIG,
+        encoder_output_dim=512,
+    ):
+        super().__init__(dictionary)
+        vocab_size = len(dictionary)
+        self.padding_idx = dictionary.pad()
+        self.embed_tokens = Embedding(vocab_size, embed_dim, self.padding_idx)
+        self.conv_layers = nn.ModuleList()
+        for i, (out_channels, kernel_size, layer_norm) in enumerate(conv_config):
+            # the first conv consumes the embeddings, later ones the previous conv
+            in_channels = embed_dim if i == 0 else conv_config[i - 1][0]
+            self.conv_layers.append(
+                LinearizedConv1d(
+                    in_channels, out_channels, kernel_size, padding=kernel_size - 1
+                )
+            )
+            if layer_norm:
+                self.conv_layers.append(nn.LayerNorm(out_channels))
+            self.conv_layers.append(nn.ReLU())
+        self.layers = nn.ModuleList()
+        # adapter between the conv stack and the first transformer layer
+        if conv_config[-1][0] != transformer_config[0][0]:
+            self.layers.append(Linear(conv_config[-1][0], transformer_config[0][0]))
+        self.layers.append(
+            TransformerDecoderLayer(
+                prepare_transformer_decoder_params(*transformer_config[0])
+            )
+        )
+        for i in range(1, len(transformer_config)):
+            # adapter between transformer layers of different width
+            if transformer_config[i - 1][0] != transformer_config[i][0]:
+                self.layers.append(
+                    Linear(transformer_config[i - 1][0], transformer_config[i][0])
+                )
+            self.layers.append(
+                TransformerDecoderLayer(
+                    prepare_transformer_decoder_params(*transformer_config[i])
+                )
+            )
+        self.fc_out = Linear(transformer_config[-1][0], vocab_size)
+    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for input feeding/teacher forcing
+            encoder_out (dict, optional): output from the encoder with keys
+                "encoder_out" and "encoder_padding_mask", used for
+                encoder-side attention; may be None
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+        Returns:
+            tuple:
+                - the output projection of shape `(batch, tgt_len, vocab)`
+                - None (attention weights are not returned)
+        """
+        incremental = incremental_state is not None
+        target_padding_mask = None
+        if not incremental:
+            target_padding_mask = prev_output_tokens == self.padding_idx
+        else:
+            # only the newest token is fed during incremental decoding
+            prev_output_tokens = prev_output_tokens[:, -1:]
+        # embed tokens
+        x = self.embed_tokens(prev_output_tokens)
+        # B x T x C -> T x B x C (full-sequence mode only)
+        x = self._transpose_if_training(x, incremental_state)
+        for layer in self.conv_layers:
+            if isinstance(layer, LinearizedConvolution):
+                x = layer(x, incremental_state)
+            else:
+                x = layer(x)
+        # B x T x C -> T x B x C (incremental mode only)
+        x = self._transpose_if_inference(x, incremental_state)
+        # resolve the encoder inputs once; both stay None without an encoder
+        # output (the padding mask used to be indexed even when encoder_out
+        # was None)
+        encoder_states = None
+        encoder_padding_mask = None
+        if encoder_out is not None:
+            encoder_states = encoder_out["encoder_out"]
+            if encoder_out["encoder_padding_mask"] is not None:
+                encoder_padding_mask = encoder_out["encoder_padding_mask"].t()
+        # decoder layers
+        for layer in self.layers:
+            if isinstance(layer, TransformerDecoderLayer):
+                x, *_ = layer(
+                    x,
+                    encoder_states,
+                    encoder_padding_mask,
+                    incremental_state,
+                    self_attn_mask=(
+                        None if incremental else self.buffered_future_mask(x)
+                    ),
+                    self_attn_padding_mask=(
+                        None if incremental else target_padding_mask
+                    ),
+                )
+            else:
+                x = layer(x)
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+        x = self.fc_out(x)
+        return x, None
+    def buffered_future_mask(self, tensor):
+        """
+        Return a (dim, dim) strictly-upper-triangular -inf mask, where dim is
+        ``tensor.size(0)``; the mask is cached and rebuilt when the device
+        changes or a larger size is requested.
+        """
+        dim = tensor.size(0)
+        if (
+            not hasattr(self, "_future_mask")
+            or self._future_mask is None
+            or self._future_mask.device != tensor.device
+        ):
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
+            )
+        if self._future_mask.size(0) < dim:
+            self._future_mask = torch.triu(
+                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1
+            )
+        return self._future_mask[:dim, :dim]
+    def _transpose_if_training(self, x, incremental_state):
+        """Swap dims 0 and 1 when no incremental state is given."""
+        if incremental_state is None:
+            x = x.transpose(0, 1)
+        return x
+    def _transpose_if_inference(self, x, incremental_state):
+        """
+        Swap dims 0 and 1 when an incremental state is given. The check is
+        ``is not None`` (the exact complement of _transpose_if_training) so
+        that a still-empty state dict is also treated as incremental mode.
+        """
+        if incremental_state is not None:
+            x = x.transpose(0, 1)
+        return x
+@register_model("asr_vggtransformer_encoder")
+class VGGTransformerEncoderModel(FairseqEncoderModel):
+    """
+    Encoder-only model registered as "asr_vggtransformer_encoder"; it wraps a
+    :class:`VGGTransformerEncoderOnly` built from command-line arguments.
+    """
+    def __init__(self, encoder):
+        super().__init__(encoder)
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument(
+            "--input-feat-per-channel",
+            type=int,
+            metavar="N",
+            help="encoder input dimension per input channel",
+        )
+        parser.add_argument(
+            "--vggblock-enc-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    an array of tuples each containing the configuration of one vggblock
+    [(out_channels, conv_kernel_size, pooling_kernel_size,num_conv_layers), ...]
+    """,
+        )
+        parser.add_argument(
+            "--transformer-enc-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    a tuple containing the configuration of the Transformer layers
+    configurations:
+    [(input_dim,
+      num_heads,
+      ffn_dim,
+      normalize_before,
+      dropout,
+      attention_dropout,
+      relu_dropout), ]""",
+        )
+        parser.add_argument(
+            "--enc-output-dim",
+            type=int,
+            metavar="N",
+            help="encoder output dimension, projecting the transformer output",
+        )
+        parser.add_argument(
+            "--in-channels",
+            type=int,
+            metavar="N",
+            help="number of encoder input channels",
+        )
+        parser.add_argument(
+            "--transformer-context",
+            type=str,
+            metavar="EXPR",
+            help="""
+    either None or a tuple of two ints, indicating left/right context a
+    transformer can have access to""",
+        )
+        parser.add_argument(
+            "--transformer-sampling",
+            type=str,
+            metavar="EXPR",
+            help="""
+    either None or a tuple of ints, indicating sampling factor in each layer""",
+        )
+    @classmethod
+    def build_model(cls, args, task):
+        """
+        Build a new model instance.
+        Missing options are first filled in by ``base_architecture_enconly``;
+        the EXPR-style options are strings that get evaluated as Python
+        expressions.
+        """
+        base_architecture_enconly(args)
+        # SECURITY NOTE: the EXPR options are passed to eval() because the
+        # defaults are real expressions (e.g. "((...),) * 2"), which a literal
+        # parser cannot handle. Only use trusted command lines / checkpoints.
+        encoder = VGGTransformerEncoderOnly(
+            vocab_size=len(task.target_dictionary),
+            input_feat_per_channel=args.input_feat_per_channel,
+            vggblock_config=eval(args.vggblock_enc_config),
+            transformer_config=eval(args.transformer_enc_config),
+            encoder_output_dim=args.enc_output_dim,
+            in_channels=args.in_channels,
+            transformer_context=eval(args.transformer_context),
+            transformer_sampling=eval(args.transformer_sampling),
+        )
+        return cls(encoder)
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """
+        Return the parent's normalized probabilities transposed to batch-first
+        layout, tagged with ``batch_first = True``.
+        """
+        # net_output['encoder_out'] is a (T, B, D) tensor
+        lprobs = super().get_normalized_probs(net_output, log_probs, sample)
+        # lprobs is a (T, B, D) tensor
+        # we need to transpose to get a (B, T, D) tensor
+        lprobs = lprobs.transpose(0, 1).contiguous()
+        lprobs.batch_first = True
+        return lprobs
+class VGGTransformerEncoderOnly(VGGTransformerEncoder):
+    """
+    VGGTransformerEncoder with an extra linear layer that projects the
+    encoder output to ``vocab_size`` scores per frame.
+    """
+    def __init__(
+        self,
+        vocab_size,
+        input_feat_per_channel,
+        vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG,
+        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
+        encoder_output_dim=512,
+        in_channels=1,
+        transformer_context=None,
+        transformer_sampling=None,
+    ):
+        # everything except vocab_size is forwarded to the base encoder
+        encoder_kwargs = dict(
+            input_feat_per_channel=input_feat_per_channel,
+            vggblock_config=vggblock_config,
+            transformer_config=transformer_config,
+            encoder_output_dim=encoder_output_dim,
+            in_channels=in_channels,
+            transformer_context=transformer_context,
+            transformer_sampling=transformer_sampling,
+        )
+        super().__init__(**encoder_kwargs)
+        self.fc_out = Linear(self.encoder_output_dim, vocab_size)
+    def forward(self, src_tokens, src_lengths, **kwargs):
+        """
+        src_tokens: padded tensor (B, T, C * feat)
+        src_lengths: tensor of original lengths of input utterances (B,)
+        Returns the base encoder's output dict with "encoder_out" replaced by
+        its projection through ``fc_out``.
+        """
+        encoded = super().forward(src_tokens, src_lengths)
+        # no log_softmax here: model.get_normalized_probs applies it later
+        scores = self.fc_out(encoded["encoder_out"])
+        return {
+            "encoder_out": scores,  # (T, B, C)
+            "encoder_padding_mask": encoded["encoder_padding_mask"],  # (T, B)
+        }
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        arbitrary_large_number = 1e6
+        return (arbitrary_large_number, arbitrary_large_number)
+def Embedding(num_embeddings, embedding_dim, padding_idx):
+    """Build an ``nn.Embedding`` with the given padding index (default init)."""
+    return nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
+def Linear(in_features, out_features, bias=True, dropout=0):
+    """
+    Linear layer (input: N x T x C)
+    Thin wrapper around ``nn.Linear`` with its default initialization; the
+    ``dropout`` argument is accepted but not used.
+    """
+    return nn.Linear(in_features, out_features, bias=bias)
+def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
+    """
+    Weight-normalized Conv1d layer optimized for decoding
+    The weight is drawn from N(0, std^2) with
+    std = sqrt(4 * (1 - dropout) / (kernel_size * in_channels)), the bias is
+    zeroed, and weight normalization is applied along dim 2.
+    """
+    conv = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
+    fan = conv.kernel_size[0] * in_channels
+    init_std = math.sqrt((4 * (1.0 - dropout)) / fan)
+    nn.init.normal_(conv.weight, mean=0, std=init_std)
+    nn.init.constant_(conv.bias, 0)
+    return nn.utils.weight_norm(conv, dim=2)
+def LayerNorm(embedding_dim):
+    """Build an ``nn.LayerNorm`` over the last ``embedding_dim`` features."""
+    return nn.LayerNorm(embedding_dim)
+# seq2seq models
+def base_architecture(args):
+    """
+    Fill in default values for every seq2seq model option that is not already
+    set on ``args``; attributes that are present are left untouched.
+    """
+    defaults = (
+        ("input_feat_per_channel", 40),
+        ("vggblock_enc_config", DEFAULT_ENC_VGGBLOCK_CONFIG),
+        ("transformer_enc_config", DEFAULT_ENC_TRANSFORMER_CONFIG),
+        ("enc_output_dim", 512),
+        ("in_channels", 1),
+        ("tgt_embed_dim", 128),
+        ("transformer_dec_config", DEFAULT_ENC_TRANSFORMER_CONFIG),
+        ("conv_dec_config", DEFAULT_DEC_CONV_CONFIG),
+        ("transformer_context", "None"),
+    )
+    for option, fallback in defaults:
+        setattr(args, option, getattr(args, option, fallback))
+@register_model_architecture("asr_vggtransformer", "vggtransformer_1")
+def vggtransformer_1(args):
+    """
+    "vggtransformer_1" preset: 80 input features, two VGG blocks, 14 encoder
+    and 4 decoder transformer layers of width 1024. Only options missing
+    from ``args`` are set.
+    """
+    defaults = {
+        "input_feat_per_channel": 80,
+        "vggblock_enc_config": "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]",
+        "transformer_enc_config": "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 14",
+        "enc_output_dim": 1024,
+        "tgt_embed_dim": 128,
+        "conv_dec_config": "((256, 3, True),) * 4",
+        "transformer_dec_config": "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 4",
+    }
+    for option, fallback in defaults.items():
+        setattr(args, option, getattr(args, option, fallback))
+@register_model_architecture("asr_vggtransformer", "vggtransformer_2")
+def vggtransformer_2(args):
+    """
+    "vggtransformer_2" preset: 80 input features, two VGG blocks, 16 encoder
+    and 6 decoder transformer layers of width 1024, 512-dim target
+    embeddings. Only options missing from ``args`` are set.
+    """
+    def fill(option, fallback):
+        # keep a value that is already present on args
+        setattr(args, option, getattr(args, option, fallback))
+    wide_layer = "((1024, 16, 4096, True, 0.15, 0.15, 0.15),)"
+    fill("input_feat_per_channel", 80)
+    fill("vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]")
+    fill("transformer_enc_config", wide_layer + " * 16")
+    fill("enc_output_dim", 1024)
+    fill("tgt_embed_dim", 512)
+    fill("conv_dec_config", "((256, 3, True),) * 4")
+    fill("transformer_dec_config", wide_layer + " * 6")
+@register_model_architecture("asr_vggtransformer", "vggtransformer_base")
+def vggtransformer_base(args):
+    """
+    "vggtransformer_base" preset: 80 input features, two VGG blocks, 12
+    encoder and 6 decoder transformer layers of width 512. Only options
+    missing from ``args`` are set.
+    """
+    defaults = (
+        ("input_feat_per_channel", 80),
+        ("vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"),
+        ("transformer_enc_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 12"),
+        ("enc_output_dim", 512),
+        ("tgt_embed_dim", 512),
+        ("conv_dec_config", "((256, 3, True),) * 4"),
+        ("transformer_dec_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 6"),
+    )
+    for option, fallback in defaults:
+        setattr(args, option, getattr(args, option, fallback))
+    # Size estimations (weights only):
+    # Encoder:
+    #   - vggblock param: 64*1*3*3 + 64*64*3*3 + 128*64*3*3 + 128*128*3*3 = 258K
+    #   Transformer:
+    #   - input dimension adapter: 2560 x 512 -> 1.31M
+    #   - transformer_layers (x12) --> 37.74M
+    #       * MultiheadAttention: 512*512*3 (in_proj) + 512*512 (out_proj) = 1.048M
+    #       * FFN weight: 512*2048*2 = 2.097M
+    #   - output dimension adapter: 512 x 512 -> 0.26 M
+    # Decoder:
+    #   - LinearizedConv1d: 512 * 256 * 3 + 256 * 256 * 3 * 3 = 0.98M
+    #   - transformer_layer: (x6) --> 25.16M
+    #        * MultiheadAttention (self-attention): 512*512*3 + 512*512 = 1.048M
+    #        * MultiheadAttention (encoder-attention): 512*512*3 + 512*512 = 1.048M
+    #        * FFN: 512*2048*2 = 2.097M
+    # Final FC:
+    #   - FC: 512*5000 = 2.56M (assuming vocab size 5K)
+    # In total:
+    #       ~68 M for the items listed above
+# CTC models
+def base_architecture_enconly(args):
+    """
+    Fill in default values for every encoder-only (CTC) model option that is
+    not already set on ``args``; attributes that are present are left
+    untouched. The *_config / context / sampling defaults are strings.
+    """
+    defaults = (
+        ("input_feat_per_channel", 40),
+        ("vggblock_enc_config", "[(32, 3, 2, 2, True)] * 2"),
+        ("transformer_enc_config", "((256, 4, 1024, True, 0.2, 0.2, 0.2),) * 2"),
+        ("enc_output_dim", 512),
+        ("in_channels", 1),
+        ("transformer_context", "None"),
+        ("transformer_sampling", "None"),
+    )
+    for option, fallback in defaults:
+        setattr(args, option, getattr(args, option, fallback))
+@register_model_architecture("asr_vggtransformer_encoder", "vggtransformer_enc_1")
+def vggtransformer_enc_1(args):
+    """
+    "vggtransformer_enc_1" encoder-only preset: 80 input features, two VGG
+    blocks and 16 transformer layers of width 1024. Only options missing
+    from ``args`` are set.
+    """
+    # vggtransformer_1 is the same as vggtransformer_enc_big, except the number
+    # of layers is increased to 16
+    # keep it here for backward compatibility purpose
+    defaults = {
+        "input_feat_per_channel": 80,
+        "vggblock_enc_config": "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]",
+        "transformer_enc_config": "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16",
+        "enc_output_dim": 1024,
+    }
+    for option, fallback in defaults.items():
+        setattr(args, option, getattr(args, option, fallback))
--- a/examples/speech_recognition/models/w2l_conv_glu_enc.py
+++ b/examples/speech_recognition/models/w2l_conv_glu_enc.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq.models import (
+    FairseqEncoder,
+    FairseqEncoderModel,
+    register_model,
+    register_model_architecture,
+)
+from fairseq.modules.fairseq_dropout import FairseqDropout
+# Default conv stack, kept as a string because build_model passes the
+# configuration through eval(). Each tuple is
+# (out_channels, kernel_size, padding, dropout), in the order the encoder
+# unpacks it; out_channels must be even because the GLU after each conv
+# halves the channel count.
+default_conv_enc_config = """[
+    (400, 13, 170, 0.2),
+    (440, 14, 0, 0.214),
+    (484, 15, 0, 0.22898),
+    (532, 16, 0, 0.2450086),
+    (584, 17, 0, 0.262159202),
+    (642, 18, 0, 0.28051034614),
+    (706, 19, 0, 0.30014607037),
+    (776, 20, 0, 0.321156295296),
+    (852, 21, 0, 0.343637235966),
+    (936, 22, 0, 0.367691842484),
+    (1028, 23, 0, 0.393430271458),
+    (1130, 24, 0, 0.42097039046),
+    (1242, 25, 0, 0.450438317792),
+    (1366, 26, 0, 0.481969000038),
+    (1502, 27, 0, 0.51570683004),
+    (1652, 28, 0, 0.551806308143),
+    (1816, 29, 0, 0.590432749713),
+]"""
+@register_model("asr_w2l_conv_glu_encoder")
+class W2lConvGluEncoderModel(FairseqEncoderModel):
+    """
+    Encoder-only model registered as "asr_w2l_conv_glu_encoder"; it wraps a
+    :class:`W2lConvGluEncoder` built from command-line arguments.
+    """
+    def __init__(self, encoder):
+        super().__init__(encoder)
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        int_options = (
+            ("--input-feat-per-channel", "encoder input dimension per input channel"),
+            ("--in-channels", "number of encoder input channels"),
+        )
+        for flag, description in int_options:
+            parser.add_argument(flag, type=int, metavar="N", help=description)
+        parser.add_argument(
+            "--conv-enc-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    an array of tuples each containing the configuration of one conv layer
+    [(out_channels, kernel_size, padding, dropout), ...]
+            """,
+        )
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # the config is a string holding a Python expression; fall back to the
+        # module default when the option is absent from args
+        config_expr = getattr(args, "conv_enc_config", default_conv_enc_config)
+        return cls(
+            W2lConvGluEncoder(
+                vocab_size=len(task.target_dictionary),
+                input_feat_per_channel=args.input_feat_per_channel,
+                in_channels=args.in_channels,
+                conv_enc_config=eval(config_expr),
+            )
+        )
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """
+        Return the parent's normalized probabilities, tagged with
+        ``batch_first = False``.
+        """
+        probs = super().get_normalized_probs(net_output, log_probs, sample)
+        probs.batch_first = False
+        return probs
+class W2lConvGluEncoder(FairseqEncoder):
+    """
+    Stack of weight-normalized Conv1d + GLU + dropout layers followed by two
+    weight-normalized linear layers (the first with GLU) that map every frame
+    to ``vocab_size`` scores.
+    Args:
+        vocab_size: size of the output layer
+        input_feat_per_channel: number of input features per frame
+        in_channels: number of input channels; only 1 is supported
+        conv_enc_config: iterable of
+            (out_channels, kernel_size, padding, dropout) tuples, one per
+            conv layer; out_channels must be even
+    Raises:
+        ValueError: if in_channels != 1 or an out_channels value is odd
+    """
+    def __init__(
+        self, vocab_size, input_feat_per_channel, in_channels, conv_enc_config
+    ):
+        super().__init__(None)
+        self.input_dim = input_feat_per_channel
+        if in_channels != 1:
+            raise ValueError("only 1 input channel is currently supported")
+        self.conv_layers = nn.ModuleList()
+        self.linear_layers = nn.ModuleList()
+        # A ModuleList (instead of a plain list) registers the dropout modules
+        # as submodules, so train()/eval() reaches them. They own no
+        # parameters or buffers, so the state_dict is unaffected.
+        self.dropouts = nn.ModuleList()
+        cur_channels = input_feat_per_channel
+        for out_channels, kernel_size, padding, dropout in conv_enc_config:
+            layer = nn.Conv1d(cur_channels, out_channels, kernel_size, padding=padding)
+            layer.weight.data.mul_(math.sqrt(3))  # match wav2letter init
+            self.conv_layers.append(nn.utils.weight_norm(layer))
+            self.dropouts.append(
+                FairseqDropout(dropout, module_name=self.__class__.__name__)
+            )
+            if out_channels % 2 != 0:
+                raise ValueError("odd # of out_channels is incompatible with GLU")
+            cur_channels = out_channels // 2  # halved by GLU
+        # first linear doubles the width (undone by its GLU), second one
+        # projects to the vocabulary
+        for out_channels in [2 * cur_channels, vocab_size]:
+            layer = nn.Linear(cur_channels, out_channels)
+            layer.weight.data.mul_(math.sqrt(3))
+            self.linear_layers.append(nn.utils.weight_norm(layer))
+            cur_channels = out_channels // 2
+    def forward(self, src_tokens, src_lengths, **kwargs):
+        """
+        src_tokens: padded tensor (B, T, C * feat)
+        src_lengths: tensor of original lengths of input utterances (B,)
+        Returns a dict with "encoder_out" (T, B, vocab_size) and
+        "encoder_padding_mask" (T, B), True at padded positions.
+        """
+        B, T, _ = src_tokens.size()
+        x = src_tokens.transpose(1, 2).contiguous()  # (B, feat, T) assuming C == 1
+        for conv, dropout in zip(self.conv_layers, self.dropouts):
+            x = conv(x)
+            x = F.glu(x, dim=1)
+            x = dropout(x)
+        x = x.transpose(1, 2).contiguous()  # (B, T, last out_channels // 2)
+        x = self.linear_layers[0](x)
+        x = F.glu(x, dim=2)
+        # the dropout of the last conv layer is reused after the first linear
+        x = self.dropouts[-1](x)
+        x = self.linear_layers[1](x)
+        # the conv stack is expected to preserve batch size and length
+        assert x.size(0) == B
+        assert x.size(1) == T
+        encoder_out = x.transpose(0, 1)  # (T, B, vocab_size)
+        # position index >= utterance length marks padding; the index vector
+        # is created directly on the output device
+        positions = torch.arange(T, device=x.device).view(1, T)
+        encoder_padding_mask = (
+            positions >= src_lengths.view(B, 1)
+        ).t()  # (B x T) -> (T x B)
+        return {
+            "encoder_out": encoder_out,  # (T, B, vocab_size)
+            "encoder_padding_mask": encoder_padding_mask,  # (T, B)
+        }
+    def reorder_encoder_out(self, encoder_out, new_order):
+        """Reorder both entries of ``encoder_out`` in place along dim 1 (batch)."""
+        encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
+            1, new_order
+        )
+        encoder_out["encoder_padding_mask"] = encoder_out[
+            "encoder_padding_mask"
+        ].index_select(1, new_order)
+        return encoder_out
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return (1e6, 1e6)  # an arbitrary large number
+@register_model_architecture("asr_w2l_conv_glu_encoder", "w2l_conv_glu_enc")
+def w2l_conv_glu_enc(args):
+    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
+    args.in_channels = getattr(args, "in_channels", 1)
+    args.conv_enc_config = getattr(args, "conv_enc_config", default_conv_enc_config)
--- a/examples/speech_recognition/tasks/__init__.py
+++ b/examples/speech_recognition/tasks/__init__.py
+import importlib
+import os
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith(".py") and not file.startswith("_"):
+        task_name = file[: file.find(".py")]
+        importlib.import_module("examples.speech_recognition.tasks." + task_name)
--- a/examples/speech_recognition/tasks/speech_recognition.py
+++ b/examples/speech_recognition/tasks/speech_recognition.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import json
+import os
+import re
+import sys
+import torch
+from examples.speech_recognition.data import AsrDataset
+from examples.speech_recognition.data.replabels import replabel_symbol
+from fairseq.data import Dictionary
+from fairseq.tasks import LegacyFairseqTask, register_task
+def get_asr_dataset_from_json(data_json_path, tgt_dict):
+    """
+    Parse data json and create dataset.
+    See scripts/asr_prep_json.py which pack json from raw files
+    Json example:
+    {
+    "utts": {
+        "4771-29403-0025": {
+            "input": {
+                "length_ms": 170,
+                "path": "/tmp/file1.flac"
+            },
+            "output": {
+                "text": "HELLO \n",
+                "token": "HE LLO",
+                "tokenid": "4815, 861"
+            }
+        },
+        "1564-142299-0096": {
+            ...
+        }
+    }
+    """
+    if not os.path.isfile(data_json_path):
+        raise FileNotFoundError("Dataset not found: {}".format(data_json_path))
+    with open(data_json_path, "rb") as f:
+        data_samples = json.load(f)["utts"]
+        assert len(data_samples) != 0
+        sorted_samples = sorted(
+            data_samples.items(),
+            key=lambda sample: int(sample[1]["input"]["length_ms"]),
+            reverse=True,
+        )
+        aud_paths = [s[1]["input"]["path"] for s in sorted_samples]
+        ids = [s[0] for s in sorted_samples]
+        speakers = []
+        for s in sorted_samples:
+            m = re.search("(.+?)-(.+?)-(.+?)", s[0])
+            speakers.append(m.group(1) + "_" + m.group(2))
+        frame_sizes = [s[1]["input"]["length_ms"] for s in sorted_samples]
+        tgt = [
+            [int(i) for i in s[1]["output"]["tokenid"].split(", ")]
+            for s in sorted_samples
+        ]
+        # append eos
+        tgt = [[*t, tgt_dict.eos()] for t in tgt]
+        return AsrDataset(aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers)
+@register_task("speech_recognition")
+class SpeechRecognitionTask(LegacyFairseqTask):
+    """
+    Task for training speech recognition model.
+    """
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument("data", help="path to data directory")
+        parser.add_argument(
+            "--silence-token", default="\u2581", help="token for silence (used by w2l)"
+        )
+        parser.add_argument(
+            "--max-source-positions",
+            default=sys.maxsize,
+            type=int,
+            metavar="N",
+            help="max number of frames in the source sequence",
+        )
+        parser.add_argument(
+            "--max-target-positions",
+            default=1024,
+            type=int,
+            metavar="N",
+            help="max number of tokens in the target sequence",
+        )
+    def __init__(self, args, tgt_dict):
+        super().__init__(args)
+        self.tgt_dict = tgt_dict
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task (e.g., load dictionaries)."""
+        dict_path = os.path.join(args.data, "dict.txt")
+        if not os.path.isfile(dict_path):
+            raise FileNotFoundError("Dict not found: {}".format(dict_path))
+        tgt_dict = Dictionary.load(dict_path)
+        if args.criterion == "ctc_loss":
+            tgt_dict.add_symbol("<ctc_blank>")
+        elif args.criterion == "asg_loss":
+            for i in range(1, args.max_replabel + 1):
+                tgt_dict.add_symbol(replabel_symbol(i))
+        print("| dictionary: {} types".format(len(tgt_dict)))
+        return cls(args, tgt_dict)
+    def load_dataset(self, split, combine=False, **kwargs):
+        """Load a given dataset split.
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+        """
+        data_json_path = os.path.join(self.args.data, "{}.json".format(split))
+        self.datasets[split] = get_asr_dataset_from_json(data_json_path, self.tgt_dict)
+    def build_generator(self, models, args, **unused):
+        w2l_decoder = getattr(args, "w2l_decoder", None)
+        if w2l_decoder == "viterbi":
+            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
+            return W2lViterbiDecoder(args, self.target_dictionary)
+        elif w2l_decoder == "kenlm":
+            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder
+            return W2lKenLMDecoder(args, self.target_dictionary)
+        elif w2l_decoder == "fairseqlm":
+            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder
+            return W2lFairseqLMDecoder(args, self.target_dictionary)
+        else:
+            return super().build_generator(models, args)
+    @property
+    def target_dictionary(self):
+        """Return the :class:`~fairseq.data.Dictionary` for the language
+        model."""
+        return self.tgt_dict
+    @property
+    def source_dictionary(self):
+        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
+        for this task)."""
+        return None
+    def max_positions(self):
+        """Return the max speech and sentence length allowed by the task."""
+        return (self.args.max_source_positions, self.args.max_target_positions)
--- a/examples/speech_recognition/utils/wer_utils.py
+++ b/examples/speech_recognition/utils/wer_utils.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import absolute_import, division, print_function, unicode_literals
+import re
+from collections import deque
+from enum import Enum
+import numpy as np
+"""
+    Utility modules for computation of Word Error Rate,
+    Alignments, as well as more granular metrics like
+    deletion, insertion and substitutions.
+"""
+class Code(Enum):
+    """Edit operation assigned to one step of a reference/hypothesis alignment."""
+    match = 1  # reference and hypothesis labels are equal
+    substitution = 2  # both tokens consumed, labels differ
+    insertion = 3  # hypothesis token consumed without a reference token
+    deletion = 4  # reference token consumed without a hypothesis token
+class Token(object):
+    def __init__(self, lbl="", st=np.nan, en=np.nan):
+        if np.isnan(st):
+            self.label, self.start, self.end = "", 0.0, 0.0
+        else:
+            self.label, self.start, self.end = lbl, st, en
+class AlignmentResult(object):
+    def __init__(self, refs, hyps, codes, score):
+        self.refs = refs  # std::deque<int>
+        self.hyps = hyps  # std::deque<int>
+        self.codes = codes  # std::deque<Code>
+        self.score = score  # float
+def coordinate_to_offset(row, col, ncols):
+    return int(row * ncols + col)
+def offset_to_row(offset, ncols):
+    return int(offset / ncols)
+def offset_to_col(offset, ncols):
+    return int(offset % ncols)
+def trimWhitespace(str):
+    return re.sub(" +", " ", re.sub(" *$", "", re.sub("^ *", "", str)))
+def str2toks(str):
+    pieces = trimWhitespace(str).split(" ")
+    toks = []
+    for p in pieces:
+        toks.append(Token(p, 0.0, 0.0))
+    return toks
+class EditDistance(object):
+    """
+    Dynamic-programming alignment of a reference token sequence against a
+    hypothesis token sequence.
+    ``align`` fills a (len(refs) + 1) x (len(hyps) + 1) score matrix plus a
+    matrix of back-pointers, and ``get_result`` walks the back-pointers to
+    recover the per-step edit codes. Substitution pairs seen during that walk
+    are tallied in ``confusion_pairs_``, which is never reset here, so the
+    counts accumulate across calls on the same instance.
+    """
+    def __init__(self, time_mediated):
+        """
+        Args:
+            time_mediated: if truthy, ``cost`` scores edits from the tokens'
+                start/end times instead of fixed per-operation costs.
+        """
+        self.time_mediated_ = time_mediated
+        # Both matrices are NaN placeholders until align() allocates arrays.
+        self.scores_ = np.nan  # Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>
+        self.backtraces_ = (
+            np.nan
+        )  # Eigen::Matrix<size_t, Eigen::Dynamic, Eigen::Dynamic> backtraces_;
+        # Maps "<ref label> -> <hyp label>" to the number of times it was seen.
+        self.confusion_pairs_ = {}
+    def cost(self, ref, hyp, code):
+        """
+        Return the cost of applying edit ``code`` to the pair (ref, hyp).
+        ``align`` passes ``ref=None`` for insertions and ``hyp=None`` for
+        deletions; only the token relevant to the operation is read.
+        Any code other than match/insertion/deletion is treated as a
+        substitution.
+        """
+        if self.time_mediated_:
+            if code == Code.match:
+                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end)
+            elif code == Code.insertion:
+                return hyp.end - hyp.start
+            elif code == Code.deletion:
+                return ref.end - ref.start
+            else:  # substitution
+                # Same timing distance as a match plus a fixed 0.1 penalty.
+                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end) + 0.1
+        else:
+            if code == Code.match:
+                return 0
+            elif code == Code.insertion or code == Code.deletion:
+                return 3
+            else:  # substitution
+                return 4
+    def get_result(self, refs, hyps):
+        """
+        Trace the back-pointers from the bottom-right cell to cell (0, 0).
+        Requires ``scores_`` and ``backtraces_`` to have been filled by
+        ``align``. Returns an AlignmentResult whose ``refs``, ``hyps`` and
+        ``codes`` deques hold one entry per alignment step, in sequence order,
+        and whose ``score`` is the bottom-right entry of ``scores_``.
+        Side effect: every substitution increments ``confusion_pairs_``.
+        """
+        res = AlignmentResult(refs=deque(), hyps=deque(), codes=deque(), score=np.nan)
+        num_rows, num_cols = self.scores_.shape
+        res.score = self.scores_[num_rows - 1, num_cols - 1]
+        curr_offset = coordinate_to_offset(num_rows - 1, num_cols - 1, num_cols)
+        # Offset 0 is cell (0, 0), where both sequences are fully consumed.
+        while curr_offset != 0:
+            curr_row = offset_to_row(curr_offset, num_cols)
+            curr_col = offset_to_col(curr_offset, num_cols)
+            prev_offset = self.backtraces_[curr_row, curr_col]
+            prev_row = offset_to_row(prev_offset, num_cols)
+            prev_col = offset_to_col(prev_offset, num_cols)
+            # Matrix indices are 1-based relative to the token lists, hence -1.
+            # Both indices are recorded on every step, including insertions
+            # and deletions.
+            res.refs.appendleft(curr_row - 1)  # Note: this was .push_front() in C++
+            res.hyps.appendleft(curr_col - 1)
+            if curr_row - 1 == prev_row and curr_col == prev_col:
+                # Moved up only: a reference token with no hypothesis token.
+                res.codes.appendleft(Code.deletion)
+            elif curr_row == prev_row and curr_col - 1 == prev_col:
+                # Moved left only: a hypothesis token with no reference token.
+                res.codes.appendleft(Code.insertion)
+            else:
+                # Diagonal move: match or substitution depending on the labels.
+                # assert(curr_row - 1 == prev_row and curr_col - 1 == prev_col)
+                ref_str = refs[res.refs[0]].label
+                hyp_str = hyps[res.hyps[0]].label
+                if ref_str == hyp_str:
+                    res.codes.appendleft(Code.match)
+                else:
+                    res.codes.appendleft(Code.substitution)
+                    confusion_pair = "%s -> %s" % (ref_str, hyp_str)
+                    if confusion_pair not in self.confusion_pairs_:
+                        self.confusion_pairs_[confusion_pair] = 1
+                    else:
+                        self.confusion_pairs_[confusion_pair] += 1
+            curr_offset = prev_offset
+        return res
+    def align(self, refs, hyps):
+        """
+        Align ``refs`` against ``hyps`` and return the AlignmentResult.
+        Returns ``np.nan`` (not an AlignmentResult) when both sequences are
+        empty. On equal cost the diagonal move (match/substitution) is
+        preferred, then insertion, then deletion, because the later
+        candidates only replace the best score when strictly lower.
+        """
+        if len(refs) == 0 and len(hyps) == 0:
+            return np.nan
+        # NOTE: fresh zero-filled matrices are allocated on every call, and
+        # the loop below overwrites every entry. backtraces_ uses NumPy's
+        # default float dtype; the offsets read back from it are converted
+        # with int() by the offset_to_row/offset_to_col helpers.
+        self.scores_ = np.zeros((len(refs) + 1, len(hyps) + 1))
+        self.backtraces_ = np.zeros((len(refs) + 1, len(hyps) + 1))
+        num_rows, num_cols = self.scores_.shape
+        for i in range(num_rows):
+            for j in range(num_cols):
+                if i == 0 and j == 0:
+                    self.scores_[i, j] = 0.0
+                    self.backtraces_[i, j] = 0
+                    continue
+                if i == 0:
+                    # First row: only insertions are possible.
+                    self.scores_[i, j] = self.scores_[i, j - 1] + self.cost(
+                        None, hyps[j - 1], Code.insertion
+                    )
+                    self.backtraces_[i, j] = coordinate_to_offset(i, j - 1, num_cols)
+                    continue
+                if j == 0:
+                    # First column: only deletions are possible.
+                    self.scores_[i, j] = self.scores_[i - 1, j] + self.cost(
+                        refs[i - 1], None, Code.deletion
+                    )
+                    self.backtraces_[i, j] = coordinate_to_offset(i - 1, j, num_cols)
+                    continue
+                # Below here both i and j are greater than 0
+                ref = refs[i - 1]
+                hyp = hyps[j - 1]
+                # Start from the diagonal candidate (match or substitution).
+                best_score = self.scores_[i - 1, j - 1] + (
+                    self.cost(ref, hyp, Code.match)
+                    if (ref.label == hyp.label)
+                    else self.cost(ref, hyp, Code.substitution)
+                )
+                prev_row = i - 1
+                prev_col = j - 1
+                ins = self.scores_[i, j - 1] + self.cost(None, hyp, Code.insertion)
+                if ins < best_score:
+                    best_score = ins
+                    prev_row = i
+                    prev_col = j - 1
+                delt = self.scores_[i - 1, j] + self.cost(ref, None, Code.deletion)
+                if delt < best_score:
+                    best_score = delt
+                    prev_row = i - 1
+                    prev_col = j
+                self.scores_[i, j] = best_score
+                self.backtraces_[i, j] = coordinate_to_offset(
+                    prev_row, prev_col, num_cols
+                )
+        return self.get_result(refs, hyps)
+class WERTransformer(object):
+    def __init__(self, hyp_str, ref_str, verbose=True):
+        self.ed_ = EditDistance(False)
+        self.id2oracle_errs_ = {}
+        self.utts_ = 0
+        self.words_ = 0
+        self.insertions_ = 0
+        self.deletions_ = 0
+        self.substitutions_ = 0
+        self.process(["dummy_str", hyp_str, ref_str])
+        if verbose:
+            print("'%s' vs '%s'" % (hyp_str, ref_str))
+            self.report_result()
+    def process(self, input):  # std::vector<std::string>&& input
+        if len(input) < 3:
+            print(
+                "Input must be of the form <id> ... <hypo> <ref> , got ",
+                len(input),
+                " inputs:",
+            )
+            return None
+        # Align
+        # std::vector<Token> hyps;
+        # std::vector<Token> refs;
+        hyps = str2toks(input[-2])
+        refs = str2toks(input[-1])
+        alignment = self.ed_.align(refs, hyps)
+        if alignment is None:
+            print("Alignment is null")
+            return np.nan
+        # Tally errors
+        ins = 0
+        dels = 0
+        subs = 0
+        for code in alignment.codes:
+            if code == Code.substitution:
+                subs += 1
+            elif code == Code.insertion:
+                ins += 1
+            elif code == Code.deletion:
+                dels += 1
+        # Output
+        row = input
+        row.append(str(len(refs)))
+        row.append(str(ins))
+        row.append(str(dels))
+        row.append(str(subs))
+        # print(row)
+        # Accumulate
+        kIdIndex = 0
+        kNBestSep = "/"
+        pieces = input[kIdIndex].split(kNBestSep)
+        if len(pieces) == 0:
+            print(
+                "Error splitting ",
+                input[kIdIndex],
+                " on '",
+                kNBestSep,
+                "', got empty list",
+            )
+            return np.nan
+        id = pieces[0]
+        if id not in self.id2oracle_errs_:
+            self.utts_ += 1
+            self.words_ += len(refs)
+            self.insertions_ += ins
+            self.deletions_ += dels
+            self.substitutions_ += subs
+            self.id2oracle_errs_[id] = [ins, dels, subs]
+        else:
+            curr_err = ins + dels + subs
+            prev_err = np.sum(self.id2oracle_errs_[id])
+            if curr_err < prev_err:
+                self.id2oracle_errs_[id] = [ins, dels, subs]
+        return 0
+    def report_result(self):
+        # print("----------  Summary ---------------")
+        if self.words_ == 0:
+            print("No words counted")
+            return
+        # 1-best
+        best_wer = (
+            100.0
+            * (self.insertions_ + self.deletions_ + self.substitutions_)
+            / self.words_
+        )
+        print(
+            "\tWER = %0.2f%% (%i utts, %i words, %0.2f%% ins, "
+            "%0.2f%% dels, %0.2f%% subs)"
+            % (
+                best_wer,
+                self.utts_,
+                self.words_,
+                100.0 * self.insertions_ / self.words_,
+                100.0 * self.deletions_ / self.words_,
+                100.0 * self.substitutions_ / self.words_,
+            )
+        )
+    def wer(self):
+        if self.words_ == 0:
+            wer = np.nan
+        else:
+            wer = (
+                100.0
+                * (self.insertions_ + self.deletions_ + self.substitutions_)
+                / self.words_
+            )
+        return wer
+    def stats(self):
+        if self.words_ == 0:
+            stats = {}
+        else:
+            wer = (
+                100.0
+                * (self.insertions_ + self.deletions_ + self.substitutions_)
+                / self.words_
+            )
+            stats = dict(
+                {
+                    "wer": wer,
+                    "utts": self.utts_,
+                    "numwords": self.words_,
+                    "ins": self.insertions_,
+                    "dels": self.deletions_,
+                    "subs": self.substitutions_,
+                    "confusion_pairs": self.ed_.confusion_pairs_,
+                }
+            )
+        return stats
+def calc_wer(hyp_str, ref_str):
+    t = WERTransformer(hyp_str, ref_str, verbose=0)
+    return t.wer()
+def calc_wer_stats(hyp_str, ref_str):
+    t = WERTransformer(hyp_str, ref_str, verbose=0)
+    return t.stats()
+def get_wer_alignment_codes(hyp_str, ref_str):
+    """
+    INPUT: hypothesis string, reference string
+    OUTPUT: List of alignment codes (intermediate results from WER computation)
+    """
+    t = WERTransformer(hyp_str, ref_str, verbose=0)
+    return t.ed_.align(str2toks(ref_str), str2toks(hyp_str)).codes
+def merge_counts(x, y):
+    # Merge two hashes which have 'counts' as their values
+    # This can be used for example to merge confusion pair counts
+    #   conf_pairs = merge_counts(conf_pairs, stats['confusion_pairs'])
+    for k, v in y.items():
+        if k not in x:
+            x[k] = 0
+        x[k] += v
+    return x
--- a/examples/speech_recognition/w2l_decoder.py
+++ b/examples/speech_recognition/w2l_decoder.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Wav2letter decoders.
+"""
+import gc
+import itertools as it
+import os.path as osp
+import warnings
+from collections import deque, namedtuple
+import numpy as np
+import torch
+from examples.speech_recognition.data.replabels import unpack_replabels
+from fairseq import tasks
+from fairseq.utils import apply_to_sample
+try:
+    from wav2letter.common import create_word_dict, load_words
+    from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes
+    from wav2letter.decoder import (
+        CriterionType,
+        DecoderOptions,
+        KenLM,
+        LM,
+        LMState,
+        SmearingMode,
+        Trie,
+        LexiconDecoder,
+        LexiconFreeDecoder,
+    )
+except:
+    warnings.warn(
+        "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings"
+    )
+    LM = object
+    LMState = object
+class W2lDecoder(object):
+    def __init__(self, args, tgt_dict):
+        self.tgt_dict = tgt_dict
+        self.vocab_size = len(tgt_dict)
+        self.nbest = args.nbest
+        # criterion-specific init
+        if args.criterion == "ctc":
+            self.criterion_type = CriterionType.CTC
+            self.blank = (
+                tgt_dict.index("<ctc_blank>")
+                if "<ctc_blank>" in tgt_dict.indices
+                else tgt_dict.bos()
+            )
+            self.asg_transitions = None
+        elif args.criterion == "asg_loss":
+            self.criterion_type = CriterionType.ASG
+            self.blank = -1
+            self.asg_transitions = args.asg_transitions
+            self.max_replabel = args.max_replabel
+            assert len(self.asg_transitions) == self.vocab_size ** 2
+        else:
+            raise RuntimeError(f"unknown criterion: {args.criterion}")
+    def generate(self, models, sample, **unused):
+        """Generate a batch of inferences."""
+        # model.forward normally channels prev_output_tokens into the decoder
+        # separately, but SequenceGenerator directly calls model.encoder
+        encoder_input = {
+            k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
+        }
+        emissions = self.get_emissions(models, encoder_input)
+        return self.decode(emissions)
+    def get_emissions(self, models, encoder_input):
+        """Run encoder and normalize emissions"""
+        # encoder_out = models[0].encoder(**encoder_input)
+        encoder_out = models[0](**encoder_input)
+        if self.criterion_type == CriterionType.CTC:
+            emissions = models[0].get_normalized_probs(encoder_out, log_probs=True)
+        elif self.criterion_type == CriterionType.ASG:
+            emissions = encoder_out["encoder_out"]
+        return emissions.transpose(0, 1).float().cpu().contiguous()
+    def get_tokens(self, idxs):
+        """Normalize tokens by handling CTC blank, ASG replabels, etc."""
+        idxs = (g[0] for g in it.groupby(idxs))
+        if self.criterion_type == CriterionType.CTC:
+            idxs = filter(lambda x: x != self.blank, idxs)
+        elif self.criterion_type == CriterionType.ASG:
+            idxs = filter(lambda x: x >= 0, idxs)
+            idxs = unpack_replabels(list(idxs), self.tgt_dict, self.max_replabel)
+        return torch.LongTensor(list(idxs))
+class W2lViterbiDecoder(W2lDecoder):
+    def __init__(self, args, tgt_dict):
+        super().__init__(args, tgt_dict)
+    def decode(self, emissions):
+        B, T, N = emissions.size()
+        hypos = []
+        if self.asg_transitions is None:
+            transitions = torch.FloatTensor(N, N).zero_()
+        else:
+            transitions = torch.FloatTensor(self.asg_transitions).view(N, N)
+        viterbi_path = torch.IntTensor(B, T)
+        workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N))
+        CpuViterbiPath.compute(
+            B,
+            T,
+            N,
+            get_data_ptr_as_bytes(emissions),
+            get_data_ptr_as_bytes(transitions),
+            get_data_ptr_as_bytes(viterbi_path),
+            get_data_ptr_as_bytes(workspace),
+        )
+        return [
+            [{"tokens": self.get_tokens(viterbi_path[b].tolist()), "score": 0}]
+            for b in range(B)
+        ]
+class W2lKenLMDecoder(W2lDecoder):
+    def __init__(self, args, tgt_dict):
+        super().__init__(args, tgt_dict)
+        self.silence = (
+            tgt_dict.index("<ctc_blank>")
+            if "<ctc_blank>" in tgt_dict.indices
+            else tgt_dict.bos()
+        )
+        self.lexicon = load_words(args.lexicon)
+        self.word_dict = create_word_dict(self.lexicon)
+        self.unk_word = self.word_dict.get_index("<unk>")
+        self.lm = KenLM(args.kenlm_model, self.word_dict)
+        self.trie = Trie(self.vocab_size, self.silence)
+        start_state = self.lm.start(False)
+        for i, (word, spellings) in enumerate(self.lexicon.items()):
+            word_idx = self.word_dict.get_index(word)
+            _, score = self.lm.score(start_state, word_idx)
+            for spelling in spellings:
+                spelling_idxs = [tgt_dict.index(token) for token in spelling]
+                assert (
+                    tgt_dict.unk() not in spelling_idxs
+                ), f"{spelling} {spelling_idxs}"
+                self.trie.insert(spelling_idxs, word_idx, score)
+        self.trie.smear(SmearingMode.MAX)
+        self.decoder_opts = DecoderOptions(
+            args.beam,
+            int(getattr(args, "beam_size_token", len(tgt_dict))),
+            args.beam_threshold,
+            args.lm_weight,
+            args.word_score,
+            args.unk_weight,
+            args.sil_weight,
+            0,
+            False,
+            self.criterion_type,
+        )
+        if self.asg_transitions is None:
+            N = 768
+            # self.asg_transitions = torch.FloatTensor(N, N).zero_()
+            self.asg_transitions = []
+        self.decoder = LexiconDecoder(
+            self.decoder_opts,
+            self.trie,
+            self.lm,
+            self.silence,
+            self.blank,
+            self.unk_word,
+            self.asg_transitions,
+            False,
+        )
+    def decode(self, emissions):
+        B, T, N = emissions.size()
+        hypos = []
+        for b in range(B):
+            emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
+            results = self.decoder.decode(emissions_ptr, T, N)
+            nbest_results = results[: self.nbest]
+            hypos.append(
+                [
+                    {
+                        "tokens": self.get_tokens(result.tokens),
+                        "score": result.score,
+                        "words": [
+                            self.word_dict.get_entry(x) for x in result.words if x >= 0
+                        ],
+                    }
+                    for result in nbest_results
+                ]
+            )
+        return hypos
+# Per-LMState record cached by FairseqLM:
+#   prefix: array of token indices scored so far.
+#   incremental_state: the model's incremental state, or None.
+#   probs: log-probabilities at the last prefix position, or None once
+#       FairseqLM.score has trimmed this entry from its cache.
+FairseqLMState = namedtuple("FairseqLMState", ["prefix", "incremental_state", "probs"])
+class FairseqLM(LM):
+    """
+    Adapter that lets a fairseq language model be used through the wav2letter
+    ``LM`` interface (``start`` / ``score`` / ``finish``).
+    Every LMState handed out is mapped in ``self.states`` to a FairseqLMState
+    holding the token prefix and, while cached, the log-probabilities for the
+    next token. ``self.stateq`` records the order in which probabilities were
+    computed so that the oldest ones can be dropped first.
+    """
+    def __init__(self, dictionary, model):
+        """
+        Args:
+            dictionary: dictionary of the language model; supplies the unk and
+                eos indices.
+            model: fairseq language model. It is moved to CUDA, switched to
+                eval mode and passed through ``make_generation_fast_()``.
+        """
+        LM.__init__(self)
+        self.dictionary = dictionary
+        self.model = model
+        self.unk = self.dictionary.unk()
+        self.save_incremental = False  # this currently does not work properly
+        # Maximum number of states whose probabilities are kept cached.
+        self.max_cache = 20_000
+        model.cuda()
+        model.eval()
+        model.make_generation_fast_()
+        self.states = {}
+        self.stateq = deque()
+    def start(self, start_with_nothing):
+        """
+        Create the initial LM state by scoring a prefix holding only EOS.
+        ``start_with_nothing`` is accepted but not used.
+        """
+        state = LMState()
+        prefix = torch.LongTensor([[self.dictionary.eos()]])
+        incremental_state = {} if self.save_incremental else None
+        with torch.no_grad():
+            res = self.model(prefix.cuda(), incremental_state=incremental_state)
+            probs = self.model.get_normalized_probs(res, log_probs=True, sample=None)
+        if incremental_state is not None:
+            # Keep cached incremental state on the CPU between calls.
+            incremental_state = apply_to_sample(lambda x: x.cpu(), incremental_state)
+        self.states[state] = FairseqLMState(
+            prefix.numpy(), incremental_state, probs[0, -1].cpu().numpy()
+        )
+        self.stateq.append(state)
+        return state
+    def score(self, state: LMState, token_index: int, no_cache: bool = False):
+        """
+        Evaluate language model based on the current lm state and new word
+        Parameters:
+        -----------
+        state: current lm state
+        token_index: index of the word
+                     (can be lexicon index then you should store inside LM the
+                      mapping between indices of lexicon and lm, or lm index of a word)
+        no_cache: if true, neither store recomputed probabilities for ``state``
+                  nor register the child state in the cache
+        Returns:
+        --------
+        (LMState, float): pair of (new state, score for the current word)
+        """
+        curr_state = self.states[state]
+        def trim_cache(targ_size):
+            # Drop the probabilities and incremental state of the oldest
+            # queued states. Their prefix is kept, so the probabilities can
+            # be recomputed below if the state is scored again.
+            while len(self.stateq) > targ_size:
+                rem_k = self.stateq.popleft()
+                rem_st = self.states[rem_k]
+                rem_st = FairseqLMState(rem_st.prefix, None, None)
+                self.states[rem_k] = rem_st
+        if curr_state.probs is None:
+            # Probabilities were never computed for this state (or were
+            # trimmed): re-run the model on the stored prefix.
+            new_incremental_state = (
+                curr_state.incremental_state.copy()
+                if curr_state.incremental_state is not None
+                else None
+            )
+            with torch.no_grad():
+                if new_incremental_state is not None:
+                    new_incremental_state = apply_to_sample(
+                        lambda x: x.cuda(), new_incremental_state
+                    )
+                elif self.save_incremental:
+                    new_incremental_state = {}
+                res = self.model(
+                    torch.from_numpy(curr_state.prefix).cuda(),
+                    incremental_state=new_incremental_state,
+                )
+                probs = self.model.get_normalized_probs(
+                    res, log_probs=True, sample=None
+                )
+                if new_incremental_state is not None:
+                    new_incremental_state = apply_to_sample(
+                        lambda x: x.cpu(), new_incremental_state
+                    )
+                curr_state = FairseqLMState(
+                    curr_state.prefix, new_incremental_state, probs[0, -1].cpu().numpy()
+                )
+            if not no_cache:
+                self.states[state] = curr_state
+                self.stateq.append(state)
+        score = curr_state.probs[token_index].item()
+        trim_cache(self.max_cache)
+        outstate = state.child(token_index)
+        if outstate not in self.states and not no_cache:
+            # Register the child lazily: only its prefix is stored, and its
+            # probabilities are computed the first time it is scored.
+            prefix = np.concatenate(
+                [curr_state.prefix, torch.LongTensor([[token_index]])], -1
+            )
+            incr_state = curr_state.incremental_state
+            self.states[outstate] = FairseqLMState(prefix, incr_state, None)
+        # The unknown token always gets a score of -inf.
+        if token_index == self.unk:
+            score = float("-inf")
+        return outstate, score
+    def finish(self, state: LMState):
+        """
+        Evaluate eos for language model based on the current lm state
+        Returns:
+        --------
+        (LMState, float): pair of (new state, score for the current word)
+        """
+        return self.score(state, self.dictionary.eos())
+    def empty_cache(self):
+        """Forget all cached states and trigger a garbage collection."""
+        self.states = {}
+        self.stateq = deque()
+        gc.collect()
+class W2lFairseqLMDecoder(W2lDecoder):
+    def __init__(self, args, tgt_dict):
+        super().__init__(args, tgt_dict)
+        self.silence = tgt_dict.bos()
+        self.unit_lm = getattr(args, "unit_lm", False)
+        self.lexicon = load_words(args.lexicon) if args.lexicon else None
+        self.idx_to_wrd = {}
+        checkpoint = torch.load(args.kenlm_model, map_location="cpu")
+        lm_args = checkpoint["args"]
+        lm_args.data = osp.dirname(args.kenlm_model)
+        print(lm_args)
+        task = tasks.setup_task(lm_args)
+        model = task.build_model(lm_args)
+        model.load_state_dict(checkpoint["model"], strict=False)
+        self.trie = Trie(self.vocab_size, self.silence)
+        self.word_dict = task.dictionary
+        self.unk_word = self.word_dict.unk()
+        self.lm = FairseqLM(self.word_dict, model)
+        self.decoder_opts = DecoderOptions(
+            args.beam,
+            int(getattr(args, "beam_size_token", len(tgt_dict))),
+            args.beam_threshold,
+            args.lm_weight,
+            args.word_score,
+            args.unk_weight,
+            args.sil_weight,
+            0,
+            False,
+            self.criterion_type,
+        )
+        if self.lexicon:
+            start_state = self.lm.start(False)
+            for i, (word, spellings) in enumerate(self.lexicon.items()):
+                if self.unit_lm:
+                    word_idx = i
+                    self.idx_to_wrd[i] = word
+                    score = 0
+                else:
+                    word_idx = self.word_dict.index(word)
+                    _, score = self.lm.score(start_state, word_idx, no_cache=True)
+                for spelling in spellings:
+                    spelling_idxs = [tgt_dict.index(token) for token in spelling]
+                    assert (
+                        tgt_dict.unk() not in spelling_idxs
+                    ), f"{spelling} {spelling_idxs}"
+                    self.trie.insert(spelling_idxs, word_idx, score)
+            self.trie.smear(SmearingMode.MAX)
+            self.decoder = LexiconDecoder(
+                self.decoder_opts,
+                self.trie,
+                self.lm,
+                self.silence,
+                self.blank,
+                self.unk_word,
+                [],
+                self.unit_lm,
+            )
+        else:
+            self.decoder = LexiconFreeDecoder(
+                self.decoder_opts, self.lm, self.silence, self.blank, []
+            )
+    def decode(self, emissions):
+        B, T, N = emissions.size()
+        hypos = []
+        def idx_to_word(idx):
+            if self.unit_lm:
+                return self.idx_to_wrd[idx]
+            else:
+                return self.word_dict[idx]
+        def make_hypo(result):
+            hypo = {"tokens": self.get_tokens(result.tokens), "score": result.score}
+            if self.lexicon:
+                hypo["words"] = [idx_to_word(x) for x in result.words if x >= 0]
+            return hypo
+        for b in range(B):
+            emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
+            results = self.decoder.decode(emissions_ptr, T, N)
+            nbest_results = results[: self.nbest]
+            hypos.append([make_hypo(result) for result in nbest_results])
+            self.lm.empty_cache()
+        return hypos
--- a/examples/speech_to_text/README.md
+++ b/examples/speech_to_text/README.md
+# Speech-to-Text (S2T) Modeling
+## Data Preparation
+S2T modeling data consists of source speech features, target text and other optional information
+(source text, speaker id, etc.). Fairseq S2T uses per-dataset-split TSV manifest files
+to store this information. Each data field is represented by a column in the TSV file.
+Unlike text token embeddings, speech features (e.g. log mel-filter banks) are usually fixed
+during model training and can be pre-computed. The manifest file contains the path to
+either the feature file in NumPy format or the WAV/FLAC audio file. For the latter,
+features will be extracted on-the-fly by fairseq S2T. Optionally, feature/audio files can be packed
+into uncompressed ZIP files (then accessed via byte offset and length) to improve I/O performance.
+Fairseq S2T also employs a YAML file for data related configurations: tokenizer type and dictionary path
+for the target text, feature transforms such as CMVN (cepstral mean and variance normalization) and SpecAugment,
+temperature-based resampling, etc.
+## Model Training & Evaluation
+Fairseq S2T uses the unified `fairseq-train`/`fairseq-generate` interface for model training and evaluation.
+It requires arguments `--task speech_to_text` and `--arch <arch in fairseq.models.speech_to_text.*>`.
+## Example 1: Speech Recognition (ASR) on LibriSpeech
+#### Data preparation
+Download and preprocess LibriSpeech data with
+```bash
+python examples/speech_to_text/prep_librispeech_data.py \
+    --output-root ${LS_ROOT} --vocab-type unigram --vocab-size 10000
+```
+where `LS_ROOT` is the root path for downloaded data as well as generated manifest and feature files.
+#### Training
+```bash
+fairseq-train ${LS_ROOT} --train-subset train --valid-subset dev --save-dir ${SAVE_DIR} --num-workers 4 \
+    --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy --max-update 300000 \
+    --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
+    --clip-norm 10.0 --seed 1 --update-freq 8
+```
+where `SAVE_DIR` is the checkpoint root path. Here we use `--arch s2t_transformer_s` (31M parameters) as an example.
+You may switch to `s2t_transformer_m` (71M) or `s2t_transformer_l` (268M) for better performance. We set
+`--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU.
+#### Inference & Evaluation
+Average the last 10 checkpoints and evaluate on the 4 splits
+(`dev-clean`, `dev-other`, `test-clean` and `test-other`):
+```bash
+CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
+python scripts/average_checkpoints.py \
+    --inputs ${SAVE_DIR} --num-epoch-checkpoints 10 --output "${SAVE_DIR}/${CHECKPOINT_FILENAME}"
+for SUBSET in dev-clean dev-other test-clean test-other; do
+    fairseq-generate ${LS_ROOT} --gen-subset ${SUBSET} --task speech_to_text \
+        --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring wer
+done
+```
+#### Result
+| --arch | Params | dev-clean | dev-other | test-clean | test-other |
+|---|---|---|---|---|---|
+| s2t_transformer_s | 30M | 4.1 | 9.3 | 4.4 | 9.2 |
+| s2t_transformer_sp | 35M | 3.9 | 9.3 | 4.3 | 8.8 |
+| s2t_transformer_m | 71M | 3.5 | 8.1 | 3.7 | 8.1 |
+| s2t_transformer_mp | 84M | 3.3 | 7.8 | 3.7 | 8.2 |
+| s2t_transformer_l | 268M | 3.3 | 7.7 | 3.5 | 7.8 |
+| s2t_transformer_lp | 318M | 3.1 | 7.5 | 3.4 | 7.6 |
+## Example 2: Speech Translation (ST) on MuST-C
+#### Data Preparation
+[Download](https://ict.fbk.eu/must-c) and unpack MuST-C data to a path `MUSTC_ROOT`, then preprocess it with
+```bash
+python examples/speech_to_text/prep_mustc_data.py --data-root ${MUSTC_ROOT} \
+    --asr-vocab-type unigram --asr-vocab-size 5000 \
+    --st-vocab-type unigram --st-vocab-size 8000
+```
+The generated manifest and feature files will be available under `MUSTC_ROOT`.
+#### ASR
+###### Training
+```bash
+fairseq-train ${MUSTC_ROOT} --train-subset train_asr --valid-subset dev_asr --save-dir ${ASR_SAVE_DIR} \
+    --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
+    --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 1e-3 \
+    --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8
+```
+where `ASR_SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
+You may want to update it accordingly when using more than 1 GPU.
+###### Inference & Evaluation
+```bash
+CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
+python scripts/average_checkpoints.py \
+    --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}"
+fairseq-generate ${MUSTC_ROOT} --gen-subset tst-COMMON_asr --task speech_to_text \
+    --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \
+    --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct
+```
+###### Result
+| --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru |
+|---|---|---|---|---|---|---|---|---|---|
+| s2t_transformer_s | 31M | 18.2 | 17.6 | 17.7 | 17.2 | 17.9 | 19.1 | 18.1 | 17.7 |
+#### ST
+###### Training
+```bash
+fairseq-train ${MUSTC_ROOT} --train-subset train_st --valid-subset dev_st --save-dir ${ST_SAVE_DIR} \
+    --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
+    --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 2e-3 \
+    --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
+    --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}
+```
+where `ST_SAVE_DIR` is the checkpoint root path. The ST encoder is pre-trained by ASR for faster training and better
+performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
+You may want to update it accordingly when using more than 1 GPU.
+###### Inference & Evaluation
+Average the last 10 checkpoints and evaluate on the `tst-COMMON` split:
+```bash
+CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
+python scripts/average_checkpoints.py \
+    --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}"
+fairseq-generate ${MUSTC_ROOT} --gen-subset tst-COMMON_st --task speech_to_text \
+    --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring sacrebleu
+```
+###### Result
+| --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru |
+|---|---|---|---|---|---|---|---|---|---|
+| s2t_transformer_s | 31M | 22.7 | 27.3 | 27.2 | 32.9 | 22.7 | 28.1 | 21.9 | 15.3 |
+## Example 3: ST on CoVoST
+#### Data Preparation
+Download and preprocess CoVoST data with
+```bash
+# En ASR
+python examples/speech_to_text/prep_covost_data.py --data-root ${COVOST_ROOT} \
+    --vocab-type char --src-lang en
+# ST
+python examples/speech_to_text/prep_covost_data.py --data-root ${COVOST_ROOT} \
+    --vocab-type char --src-lang fr --tgt-lang en
+```
+where `COVOST_ROOT` is the root path for downloaded data as well as generated manifest and feature files.
+#### ASR
+###### Training
+```bash
+fairseq-train ${COVOST_ROOT} --train-subset train_asr --valid-subset dev_asr --save-dir ${ASR_SAVE_DIR} \
+    --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
+    --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 1e-3 \
+    --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8
+```
+where `ASR_SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
+You may want to update it accordingly when using more than 1 GPU.
+###### Inference & Evaluation
+```bash
+CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
+python scripts/average_checkpoints.py \
+    --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}"
+fairseq-generate ${COVOST_ROOT} --gen-subset test_asr_en --task speech_to_text \
+    --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \
+    --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct
+```
+###### Result
+| --arch | Params | En |
+|---|---|---|
+| s2t_transformer_s | 31M | 25.6 |
+#### ST
+###### Training
+```bash
+fairseq-train ${COVOST_ROOT} --train-subset train_st_fr_en --valid-subset dev_st_fr_en --save-dir ${ST_SAVE_DIR} \
+    --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
+    --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 2e-3 \
+    --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
+    --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}
+```
+where `ST_SAVE_DIR` is the checkpoint root path. The ST encoder is pre-trained by En ASR for faster training and better
+performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
+You may want to update it accordingly when using more than 1 GPU.
+###### Inference & Evaluation
+Average the last 10 checkpoints and evaluate on the test split:
+```bash
+CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
+python scripts/average_checkpoints.py \
+    --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}"
+fairseq-generate ${COVOST_ROOT} --gen-subset test_st_fr_en --task speech_to_text \
+    --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring sacrebleu
+```
+###### Result
+| --arch | Params | Fr-En | De-En | Es-En | Ca-En | En-De | En-Ca | En-Fa | En-Et |
+|---|---|---|---|---|---|---|---|---|---|
+| s2t_transformer_s | 31M | 26.3 | 17.1 | 23.0 | 18.8 | 16.3 | 21.8 | 13.1 | 13.2 |
+## Citation
+Please cite as:
+```
+@inproceedings{wang2020fairseqs2t,
+  title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq},
+  author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino},
+  booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations},
+  year = {2020},
+}
+@inproceedings{ott2019fairseq,
+  title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
+  author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli},
+  booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations},
+  year = {2019},
+}
+```