Add ctc loss to ASR task (#1233)

Summary: Adds CTC loss and corresponding transformer ctc based models. Tested with `CUDA_VISIBLE_DEVICES=0 python train.py $DATA_PATH --save-dir $SAVE_DIR --max-epoch 30 --task speech_recognition --arch vggtransformer_enc_1 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0 --max-tokens 10000 --log-format json --log-interval 1 --criterion ctc_loss --user-dir examples/speech_recognition/ --validate-interval=10` Pull Request resolved: https://github.com/pytorch/fairseq/pull/1233 Reviewed By: jcai1 Differential Revision: D17856824 Pulled By: okhonko fbshipit-source-id: f3eac64d3fdd0c37cf8c539dd360cfb610d8a6ef

Add ctc loss to ASR task (#1233)
Summary: Adds CTC loss and corresponding transformer ctc based models. Tested with `CUDA_VISIBLE_DEVICES=0 python train.py $DATA_PATH --save-dir $SAVE_DIR --max-epoch 30 --task speech_recognition --arch vggtransformer_enc_1 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0 --max-tokens 10000 --log-format json --log-interval 1 --criterion ctc_loss --user-dir examples/speech_recognition/ --validate-interval=10` Pull Request resolved: https://github.com/pytorch/fairseq/pull/1233 Reviewed By: jcai1 Differential Revision: D17856824 Pulled By: okhonko fbshipit-source-id: f3eac64d3fdd0c37cf8c539dd360cfb610d8a6ef
c4893ca6 · Dmytro Okhonko · Facebook Github Bot · 33646ac9 · c4893ca6 · c4893ca6
Commit c4893ca6 authored Oct 10, 2019 by Dmytro Okhonko Committed by Facebook Github Bot Oct 10, 2019
4 changed files
--- a/examples/speech_recognition/criterions/CTC_loss.py
+++ b/examples/speech_recognition/criterions/CTC_loss.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import math
+from itertools import groupby
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+from examples.speech_recognition.data.data_utils import encoder_padding_mask_to_lengths
+from examples.speech_recognition.utils.wer_utils import Code, EditDistance, Token
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+def arr_to_toks(arr):
+    toks = []
+    for a in arr:
+        toks.append(Token(str(a), 0.0, 0.0))
+    return toks
+def compute_ctc_uer(logprobs, targets, input_lengths, target_lengths, blank_idx):
+    """
+        Computes utterance error rate for CTC outputs
+        Args:
+            logprobs: (Torch.tensor)  N, T1, D tensor of log probabilities out
+                of the encoder
+            targets: (Torch.tensor) N, T2 tensor of targets
+            input_lengths: (Torch.tensor) lengths of inputs for each sample
+            target_lengths: (Torch.tensor) lengths of targets for each sample
+            blank_idx: (integer) id of blank symbol in target dictionary
+        Returns:
+            batch_errors: (float) errors in the batch
+            batch_total: (float)  total number of valid samples in batch
+    """
+    batch_errors = 0.0
+    batch_total = 0.0
+    for b in range(logprobs.shape[0]):
+        predicted = logprobs[b][: input_lengths[b]].argmax(1).tolist()
+        target = targets[b][: target_lengths[b]].tolist()
+        # dedup predictions
+        predicted = [p[0] for p in groupby(predicted)]
+        # remove blanks
+        nonblanks = []
+        for p in predicted:
+            if p != blank_idx:
+                nonblanks.append(p)
+        predicted = nonblanks
+        # compute the alignment based on EditDistance
+        alignment = EditDistance(False).align(
+            arr_to_toks(predicted), arr_to_toks(target)
+        )
+        # compute the number of errors
+        # note that alignment.codes can also be used for computing
+        # deletion, insersion and substitution error breakdowns in future
+        for a in alignment.codes:
+            if a != Code.match:
+                batch_errors += 1
+        batch_total += len(target)
+    return batch_errors, batch_total
+@register_criterion("ctc_loss")
+class CTCCriterion(FairseqCriterion):
+    def __init__(self, args, task):
+        super().__init__(args, task)
+        self.blank_idx = task.target_dictionary.index("<ctc_blank>")
+        self.pad_idx = task.target_dictionary.pad()
+        self.task = task
+    @staticmethod
+    def add_args(parser):
+        parser.add_argument(
+            "--use-source-side-sample-size",
+            action="store_true",
+            default=False,
+            help=(
+                "when compute average loss, using number of source tokens "
+                + "as denominator. "
+                + "This argument will be no-op if sentence-avg is used."
+            ),
+        )
+    def forward(self, model, sample, reduce=True, log_probs=True):
+        """Compute the loss for the given sample.
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        net_output = model(**sample["net_input"])
+        lprobs = model.get_normalized_probs(net_output, log_probs=log_probs)
+        if not hasattr(lprobs, "batch_first"):
+            logging.warning(
+                "ERROR: we need to know whether "
+                "batch first for the encoder output; "
+                "you need to set batch_first attribute for the return value of "
+                "model.get_normalized_probs. Now, we assume this is true, but "
+                "in the future, we will raise exception instead. "
+            )
+        batch_first = getattr(lprobs, "batch_first", True)
+        if not batch_first:
+            max_seq_len = lprobs.size(0)
+            bsz = lprobs.size(1)
+        else:
+            max_seq_len = lprobs.size(1)
+            bsz = lprobs.size(0)
+        device = net_output["encoder_out"].device
+        input_lengths = encoder_padding_mask_to_lengths(
+            net_output["encoder_padding_mask"], max_seq_len, bsz, device
+        )
+        target_lengths = sample["target_lengths"]
+        targets = sample["target"]
+        if batch_first:
+            # N T D -> T N D (F.ctc_loss expects this)
+            lprobs = lprobs.transpose(0, 1)
+        pad_mask = sample["target"] != self.pad_idx
+        targets_flat = targets.masked_select(pad_mask)
+        loss = F.ctc_loss(
+            lprobs,
+            targets_flat,
+            input_lengths,
+            target_lengths,
+            blank=self.blank_idx,
+            reduction="sum",
+            zero_infinity=True,
+        )
+        lprobs = lprobs.transpose(0, 1)  # T N D -> N T D
+        errors, total = compute_ctc_uer(
+            lprobs, targets, input_lengths, target_lengths, self.blank_idx
+        )
+        if self.args.sentence_avg:
+            sample_size = sample["target"].size(0)
+        else:
+            if self.args.use_source_side_sample_size:
+                sample_size = torch.sum(input_lengths).item()
+            else:
+                sample_size = sample["ntokens"]
+        logging_output = {
+            "loss": utils.item(loss.data) if reduce else loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["target"].size(0),
+            "sample_size": sample_size,
+            "errors": errors,
+            "total": total,
+            "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(),
+        }
+        return loss, sample_size, logging_output
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs):
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+        errors = sum(log.get("errors", 0) for log in logging_outputs)
+        total = sum(log.get("total", 0) for log in logging_outputs)
+        nframes = sum(log.get("nframes", 0) for log in logging_outputs)
+        agg_output = {
+            "loss": loss_sum / sample_size / math.log(2),
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "nframes": nframes,
+            "sample_size": sample_size,
+            "acc": 100.0 - min(errors * 100.0 / total, 100.0),
+        }
+        if sample_size != ntokens:
+            agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
+        return agg_output
--- a/examples/speech_recognition/data/data_utils.py
+++ b/examples/speech_recognition/data/data_utils.py
@@ -58,3 +58,39 @@ def lengths_to_encoder_padding_mask(lengths, batch_first=False):
        return encoder_padding_mask.t(), max_lengths
    else:
        return encoder_padding_mask, max_lengths
+def encoder_padding_mask_to_lengths(
+    encoder_padding_mask, max_lengths, batch_size, device
+):
+    """
+    convert encoder_padding_mask (2-D binary tensor) to a 1-D tensor
+    Conventionally, encoder output contains a encoder_padding_mask, which is
+    a 2-D mask in a shape (T, B), whose (t, b) element indicate whether
+    encoder_out[t, b] is a valid output (=0) or not (=1). Occasionally, we
+    need to convert this mask tensor to a 1-D tensor in shape (B, ), where
+    [b] denotes the valid length of b-th sequence
+    Args:
+        encoder_padding_mask: a (T, B)-shaped binary tensor or None; if None,
+        indicating all are valid
+    Return:
+        seq_lengths: a (B,)-shaped tensor, where its (b, )-th element is the
+        number of valid elements of b-th sequence
+        max_lengths: maximum length of all sequence, if encoder_padding_mask is
+        not None, max_lengths must equal to encoder_padding_mask.size(0)
+        batch_size: batch size; if encoder_padding_mask is
+        not None, max_lengths must equal to encoder_padding_mask.size(1)
+        device: which device to put the result on
+    """
+    if encoder_padding_mask is None:
+        return torch.Tensor([max_lengths] * batch_size).to(torch.int32).to(device)
+    assert encoder_padding_mask.size(0) == max_lengths, "max_lengths does not match"
+    assert encoder_padding_mask.size(1) == batch_size, "batch_size does not match"
+    return max_lengths - torch.sum(encoder_padding_mask, dim=0)
--- a/examples/speech_recognition/models/vggtransformer.py
+++ b/examples/speech_recognition/models/vggtransformer.py
@@ -12,6 +12,7 @@ import torch.nn as nn
 from fairseq import utils
 from fairseq.models import (
    FairseqEncoder,
+    FairseqEncoderModel,
    FairseqIncrementalDecoder,
    FairseqEncoderDecoderModel,
    register_model,
@@ -709,6 +710,141 @@ class TransformerDecoder(FairseqIncrementalDecoder):
            x = x.transpose(0, 1)
        return x
+@register_model("asr_vggtransformer_encoder")
+class VGGTransformerEncoderModel(FairseqEncoderModel):
+    def __init__(self, encoder):
+        super().__init__(encoder)
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument(
+            "--input-feat-per-channel",
+            type=int,
+            metavar="N",
+            help="encoder input dimension per input channel",
+        )
+        parser.add_argument(
+            "--vggblock-enc-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    an array of tuples each containing the configuration of one vggblock
+    [(out_channels, conv_kernel_size, pooling_kernel_size,num_conv_layers), ...]
+    """,
+        )
+        parser.add_argument(
+            "--transformer-enc-config",
+            type=str,
+            metavar="EXPR",
+            help="""
+    a tuple containing the configuration of the Transformer layers
+    configurations:
+    [(input_dim,
+      num_heads,
+      ffn_dim,
+      normalize_before,
+      dropout,
+      attention_dropout,
+      relu_dropout), ]""",
+        )
+        parser.add_argument(
+            "--enc-output-dim",
+            type=int,
+            metavar="N",
+            help="encoder output dimension, projecting the LSTM output",
+        )
+        parser.add_argument(
+            "--in-channels",
+            type=int,
+            metavar="N",
+            help="number of encoder input channels",
+        )
+        parser.add_argument(
+            "--transformer-context",
+            type=str,
+            metavar="EXPR",
+            help="""
+    either None or a tuple of two ints, indicating left/right context a
+    transformer can have access to""",
+        )
+        parser.add_argument(
+            "--transformer-sampling",
+            type=str,
+            metavar="EXPR",
+            help="""
+    either None or a tuple of ints, indicating sampling factor in each layer""",
+        )
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        base_architecture_enconly(args)
+        encoder = VGGTransformerEncoderOnly(
+            vocab_size=len(task.target_dictionary),
+            input_feat_per_channel=args.input_feat_per_channel,
+            vggblock_config=eval(args.vggblock_enc_config),
+            transformer_config=eval(args.transformer_enc_config),
+            encoder_output_dim=args.enc_output_dim,
+            in_channels=args.in_channels,
+            transformer_context=eval(args.transformer_context),
+            transformer_sampling=eval(args.transformer_sampling),
+        )
+        return cls(encoder)
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        # net_output['encoder_out'] is a (T, B, D) tensor
+        lprobs = super().get_normalized_probs(net_output, log_probs, sample)
+        # lprobs is a (T, B, D) tensor
+        # we need to transoose to get (B, T, D) tensor
+        lprobs = lprobs.transpose(0, 1).contiguous()
+        lprobs.batch_first = True
+        return lprobs
+class VGGTransformerEncoderOnly(VGGTransformerEncoder):
+    def __init__(
+        self,
+        vocab_size,
+        input_feat_per_channel,
+        vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG,
+        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
+        encoder_output_dim=512,
+        in_channels=1,
+        transformer_context=None,
+        transformer_sampling=None,
+    ):
+        super().__init__(
+            input_feat_per_channel=input_feat_per_channel,
+            vggblock_config=vggblock_config,
+            transformer_config=transformer_config,
+            encoder_output_dim=encoder_output_dim,
+            in_channels=in_channels,
+            transformer_context=transformer_context,
+            transformer_sampling=transformer_sampling,
+        )
+        self.fc_out = Linear(self.encoder_output_dim, vocab_size)
+    def forward(self, src_tokens, src_lengths, **kwargs):
+        """
+        src_tokens: padded tensor (B, T, C * feat)
+        src_lengths: tensor of original lengths of input utterances (B,)
+        """
+        enc_out = super().forward(src_tokens, src_lengths)
+        x = self.fc_out(enc_out["encoder_out"])
+        # x = F.log_softmax(x, dim=-1)
+        # Note: no need this line, because model.get_normalized_prob will call
+        # log_softmax
+        return {
+            "encoder_out": x,  # (T, B, C)
+            "encoder_padding_mask": enc_out["encoder_padding_mask"],  # (T, B)
+        }
+    def max_positions(self):
+        """Maximum input length supported by the encoder."""
+        return (1e6, 1e6)  # an arbitrary large number
 def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
@@ -836,3 +972,35 @@ def vggtransformer_base(args):
    #   - FC: 512*5000 = 256K (assuming vocab size 5K)
    # In total:
    #       ~65 M
+# CTC models
+def base_architecture_enconly(args):
+    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 40)
+    args.vggblock_enc_config = getattr(
+        args, "vggblock_enc_config", "[(32, 3, 2, 2, True)] * 2"
+    )
+    args.transformer_enc_config = getattr(
+        args, "transformer_enc_config", "((256, 4, 1024, True, 0.2, 0.2, 0.2),) * 2"
+    )
+    args.enc_output_dim = getattr(args, "enc_output_dim", 512)
+    args.in_channels = getattr(args, "in_channels", 1)
+    args.transformer_context = getattr(args, "transformer_context", "None")
+    args.transformer_sampling = getattr(args, "transformer_sampling", "None")
+@register_model_architecture("asr_vggtransformer_encoder", "vggtransformer_enc_1")
+def vggtransformer_enc_1(args):
+    # vggtransformer_1 is the same as vggtransformer_enc_big, except the number
+    # of layers is increased to 16
+    # keep it here for backward compatiablity purpose
+    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
+    args.vggblock_enc_config = getattr(
+        args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
+    )
+    args.transformer_enc_config = getattr(
+        args,
+        "transformer_enc_config",
+        "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16",
+    )
+    args.enc_output_dim = getattr(args, "enc_output_dim", 1024)
--- a/examples/speech_recognition/utils/wer_utils.py
+++ b/examples/speech_recognition/utils/wer_utils.py
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import absolute_import, division, print_function, unicode_literals
+import re
+from collections import deque
+from enum import Enum
+import numpy as np
+"""
+    Utility modules for computation of Word Error Rate,
+    Alignments, as well as more granular metrics like
+    deletion, insersion and substitutions.
+"""
+class Code(Enum):
+    match = 1
+    substitution = 2
+    insertion = 3
+    deletion = 4
+class Token(object):
+    def __init__(self, lbl="", st=np.nan, en=np.nan):
+        if np.isnan(st):
+            self.label, self.start, self.end = "", 0.0, 0.0
+        else:
+            self.label, self.start, self.end = lbl, st, en
+class AlignmentResult(object):
+    def __init__(self, refs, hyps, codes, score):
+        self.refs = refs  # std::deque<int>
+        self.hyps = hyps  # std::deque<int>
+        self.codes = codes  # std::deque<Code>
+        self.score = score  # float
+def coordinate_to_offset(row, col, ncols):
+    return int(row * ncols + col)
+def offset_to_row(offset, ncols):
+    return int(offset / ncols)
+def offset_to_col(offset, ncols):
+    return int(offset % ncols)
+def trimWhitespace(str):
+    return re.sub(" +", " ", re.sub(" *$", "", re.sub("^ *", "", str)))
+def str2toks(str):
+    pieces = trimWhitespace(str).split(" ")
+    toks = []
+    for p in pieces:
+        toks.append(Token(p, 0.0, 0.0))
+    return toks
+class EditDistance(object):
+    def __init__(self, time_mediated):
+        self.time_mediated_ = time_mediated
+        self.scores_ = np.nan  # Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>
+        self.backtraces_ = (
+            np.nan
+        )  # Eigen::Matrix<size_t, Eigen::Dynamic, Eigen::Dynamic> backtraces_;
+        self.confusion_pairs_ = {}
+    def cost(self, ref, hyp, code):
+        if self.time_mediated_:
+            if code == Code.match:
+                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end)
+            elif code == Code.insertion:
+                return hyp.end - hyp.start
+            elif code == Code.deletion:
+                return ref.end - ref.start
+            else:  # substitution
+                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end) + 0.1
+        else:
+            if code == Code.match:
+                return 0
+            elif code == Code.insertion or code == Code.deletion:
+                return 3
+            else:  # substitution
+                return 4
+    def get_result(self, refs, hyps):
+        res = AlignmentResult(refs=deque(), hyps=deque(), codes=deque(), score=np.nan)
+        num_rows, num_cols = self.scores_.shape
+        res.score = self.scores_[num_rows - 1, num_cols - 1]
+        curr_offset = coordinate_to_offset(num_rows - 1, num_cols - 1, num_cols)
+        while curr_offset != 0:
+            curr_row = offset_to_row(curr_offset, num_cols)
+            curr_col = offset_to_col(curr_offset, num_cols)
+            prev_offset = self.backtraces_[curr_row, curr_col]
+            prev_row = offset_to_row(prev_offset, num_cols)
+            prev_col = offset_to_col(prev_offset, num_cols)
+            res.refs.appendleft(curr_row - 1)  # Note: this was .push_front() in C++
+            res.hyps.appendleft(curr_col - 1)
+            if curr_row - 1 == prev_row and curr_col == prev_col:
+                res.codes.appendleft(Code.deletion)
+            elif curr_row == prev_row and curr_col - 1 == prev_col:
+                res.codes.appendleft(Code.insertion)
+            else:
+                # assert(curr_row - 1 == prev_row and curr_col - 1 == prev_col)
+                ref_str = refs[res.refs[0]].label
+                hyp_str = hyps[res.hyps[0]].label
+                if ref_str == hyp_str:
+                    res.codes.appendleft(Code.match)
+                else:
+                    res.codes.appendleft(Code.substitution)
+                    confusion_pair = "%s -> %s" % (ref_str, hyp_str)
+                    if confusion_pair not in self.confusion_pairs_:
+                        self.confusion_pairs_[confusion_pair] = 1
+                    else:
+                        self.confusion_pairs_[confusion_pair] += 1
+            curr_offset = prev_offset
+        return res
+    def align(self, refs, hyps):
+        if len(refs) == 0 and len(hyps) == 0:
+            return np.nan
+        # NOTE: we're not resetting the values in these matrices because every value
+        # will be overridden in the loop below. If this assumption doesn't hold,
+        # be sure to set all entries in self.scores_ and self.backtraces_ to 0.
+        self.scores_ = np.zeros((len(refs) + 1, len(hyps) + 1))
+        self.backtraces_ = np.zeros((len(refs) + 1, len(hyps) + 1))
+        num_rows, num_cols = self.scores_.shape
+        for i in range(num_rows):
+            for j in range(num_cols):
+                if i == 0 and j == 0:
+                    self.scores_[i, j] = 0.0
+                    self.backtraces_[i, j] = 0
+                    continue
+                if i == 0:
+                    self.scores_[i, j] = self.scores_[i, j - 1] + self.cost(
+                        None, hyps[j - 1], Code.insertion
+                    )
+                    self.backtraces_[i, j] = coordinate_to_offset(i, j - 1, num_cols)
+                    continue
+                if j == 0:
+                    self.scores_[i, j] = self.scores_[i - 1, j] + self.cost(
+                        refs[i - 1], None, Code.deletion
+                    )
+                    self.backtraces_[i, j] = coordinate_to_offset(i - 1, j, num_cols)
+                    continue
+                # Below here both i and j are greater than 0
+                ref = refs[i - 1]
+                hyp = hyps[j - 1]
+                best_score = self.scores_[i - 1, j - 1] + (
+                    self.cost(ref, hyp, Code.match)
+                    if (ref.label == hyp.label)
+                    else self.cost(ref, hyp, Code.substitution)
+                )
+                prev_row = i - 1
+                prev_col = j - 1
+                ins = self.scores_[i, j - 1] + self.cost(None, hyp, Code.insertion)
+                if ins < best_score:
+                    best_score = ins
+                    prev_row = i
+                    prev_col = j - 1
+                delt = self.scores_[i - 1, j] + self.cost(ref, None, Code.deletion)
+                if delt < best_score:
+                    best_score = delt
+                    prev_row = i - 1
+                    prev_col = j
+                self.scores_[i, j] = best_score
+                self.backtraces_[i, j] = coordinate_to_offset(
+                    prev_row, prev_col, num_cols
+                )
+        return self.get_result(refs, hyps)
+class WERTransformer(object):
+    def __init__(self, hyp_str, ref_str, verbose=True):
+        self.ed_ = EditDistance(False)
+        self.id2oracle_errs_ = {}
+        self.utts_ = 0
+        self.words_ = 0
+        self.insertions_ = 0
+        self.deletions_ = 0
+        self.substitutions_ = 0
+        self.process(["dummy_str", hyp_str, ref_str])
+        if verbose:
+            print("'%s' vs '%s'" % (hyp_str, ref_str))
+            self.report_result()
+    def process(self, input):  # std::vector<std::string>&& input
+        if len(input) < 3:
+            print(
+                "Input must be of the form <id> ... <hypo> <ref> , got ",
+                len(input),
+                " inputs:",
+            )
+            return None
+        # Align
+        # std::vector<Token> hyps;
+        # std::vector<Token> refs;
+        hyps = str2toks(input[-2])
+        refs = str2toks(input[-1])
+        alignment = self.ed_.align(refs, hyps)
+        if alignment is None:
+            print("Alignment is null")
+            return np.nan
+        # Tally errors
+        ins = 0
+        dels = 0
+        subs = 0
+        for code in alignment.codes:
+            if code == Code.substitution:
+                subs += 1
+            elif code == Code.insertion:
+                ins += 1
+            elif code == Code.deletion:
+                dels += 1
+        # Output
+        row = input
+        row.append(str(len(refs)))
+        row.append(str(ins))
+        row.append(str(dels))
+        row.append(str(subs))
+        # print(row)
+        # Accumulate
+        kIdIndex = 0
+        kNBestSep = "/"
+        pieces = input[kIdIndex].split(kNBestSep)
+        if len(pieces) == 0:
+            print(
+                "Error splitting ",
+                input[kIdIndex],
+                " on '",
+                kNBestSep,
+                "', got empty list",
+            )
+            return np.nan
+        id = pieces[0]
+        if id not in self.id2oracle_errs_:
+            self.utts_ += 1
+            self.words_ += len(refs)
+            self.insertions_ += ins
+            self.deletions_ += dels
+            self.substitutions_ += subs
+            self.id2oracle_errs_[id] = [ins, dels, subs]
+        else:
+            curr_err = ins + dels + subs
+            prev_err = np.sum(self.id2oracle_errs_[id])
+            if curr_err < prev_err:
+                self.id2oracle_errs_[id] = [ins, dels, subs]
+        return 0
+    def report_result(self):
+        # print("----------  Summary ---------------")
+        if self.words_ == 0:
+            print("No words counted")
+            return
+        # 1-best
+        best_wer = (
+            100.0
+            * (self.insertions_ + self.deletions_ + self.substitutions_)
+            / self.words_
+        )
+        print(
+            "\tWER = %0.2f%% (%i utts, %i words, %0.2f%% ins, "
+            "%0.2f%% dels, %0.2f%% subs)"
+            % (
+                best_wer,
+                self.utts_,
+                self.words_,
+                100.0 * self.insertions_ / self.words_,
+                100.0 * self.deletions_ / self.words_,
+                100.0 * self.substitutions_ / self.words_,
+            )
+        )
+    def wer(self):
+        if self.words_ == 0:
+            wer = np.nan
+        else:
+            wer = (
+                100.0
+                * (self.insertions_ + self.deletions_ + self.substitutions_)
+                / self.words_
+            )
+        return wer
+    def stats(self):
+        if self.words_ == 0:
+            stats = {}
+        else:
+            wer = (
+                100.0
+                * (self.insertions_ + self.deletions_ + self.substitutions_)
+                / self.words_
+            )
+            stats = dict(
+                {
+                    "wer": wer,
+                    "utts": self.utts_,
+                    "numwords": self.words_,
+                    "ins": self.insertions_,
+                    "dels": self.deletions_,
+                    "subs": self.substitutions_,
+                    "confusion_pairs": self.ed_.confusion_pairs_,
+                }
+            )
+        return stats
+def calc_wer(hyp_str, ref_str):
+    t = WERTransformer(hyp_str, ref_str, verbose=0)
+    return t.wer()
+def calc_wer_stats(hyp_str, ref_str):
+    t = WERTransformer(hyp_str, ref_str, verbose=0)
+    return t.stats()
+def get_wer_alignment_codes(hyp_str, ref_str):
+    """
+    INPUT: hypothesis string, reference string
+    OUTPUT: List of alignment codes (intermediate results from WER computation)
+    """
+    t = WERTransformer(hyp_str, ref_str, verbose=0)
+    return t.ed_.align(str2toks(ref_str), str2toks(hyp_str)).codes
+def merge_counts(x, y):
+    # Merge two hashes which have 'counts' as their values
+    # This can be used for example to merge confusion pair counts
+    #   conf_pairs = merge_counts(conf_pairs, stats['confusion_pairs'])
+    for k, v in y.items():
+        if k not in x:
+            x[k] = 0
+        x[k] += v
+    return x