"tests/python/pytorch/dataloading/test_dataloader.py" did not exist on "72781efbcf2532254957090c889ec5b7b9563245"
Commit c4893ca6 authored by Dmytro Okhonko's avatar Dmytro Okhonko Committed by Facebook Github Bot
Browse files

Add ctc loss to ASR task (#1233)

Summary:
Adds a CTC loss criterion and the corresponding Transformer-based CTC (encoder-only) models.

Tested with
`CUDA_VISIBLE_DEVICES=0 python train.py $DATA_PATH --save-dir $SAVE_DIR --max-epoch 30 --task speech_recognition --arch vggtransformer_enc_1 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0  --max-tokens 10000 --log-format json --log-interval 1 --criterion ctc_loss --user-dir examples/speech_recognition/ --validate-interval=10`
Pull Request resolved: https://github.com/pytorch/fairseq/pull/1233

Reviewed By: jcai1

Differential Revision: D17856824

Pulled By: okhonko

fbshipit-source-id: f3eac64d3fdd0c37cf8c539dd360cfb610d8a6ef
parent 33646ac9
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import math
from itertools import groupby
import torch
import torch.nn.functional as F
from fairseq import utils
from fairseq.criterions import FairseqCriterion, register_criterion
from examples.speech_recognition.data.data_utils import encoder_padding_mask_to_lengths
from examples.speech_recognition.utils.wer_utils import Code, EditDistance, Token
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def arr_to_toks(arr):
    """Wrap every element of ``arr`` in a ``Token``.

    Each token is labelled with ``str(element)`` and given start and end
    values of 0.0.
    """
    return [Token(str(item), 0.0, 0.0) for item in arr]
def compute_ctc_uer(logprobs, targets, input_lengths, target_lengths, blank_idx):
    """
    Computes unit (token-level) error counts for CTC outputs

    Each sample is decoded greedily: the arg-max symbol is taken per frame,
    consecutive repeats are collapsed and blank symbols are removed. The
    decoded sequence is then aligned against the target with EditDistance.

    Args:
        logprobs: (Torch.tensor) N, T1, D tensor of log probabilities out
            of the encoder
        targets: (Torch.tensor) N, T2 tensor of targets
        input_lengths: (Torch.tensor) lengths of inputs for each sample
        target_lengths: (Torch.tensor) lengths of targets for each sample
        blank_idx: (integer) id of blank symbol in target dictionary

    Returns:
        batch_errors: (float) number of non-match alignment steps in the batch
        batch_total: (float) total number of target tokens in the batch
    """
    batch_errors = 0.0
    batch_total = 0.0

    for b in range(logprobs.shape[0]):
        best_path = logprobs[b][: input_lengths[b]].argmax(1).tolist()
        target = targets[b][: target_lengths[b]].tolist()

        # dedup predictions, then remove blanks
        predicted = [sym for sym, _ in groupby(best_path) if sym != blank_idx]

        batch_total += len(target)

        if not predicted and not target:
            # Nothing to align and nothing to get wrong. Skip the aligner:
            # it is not guaranteed to hand back an alignment object for two
            # empty sequences, which would break the `.codes` access below.
            continue

        # compute the alignment based on EditDistance
        alignment = EditDistance(False).align(
            arr_to_toks(predicted), arr_to_toks(target)
        )

        # compute the number of errors
        # note that alignment.codes can also be used for computing
        # deletion, insertion and substitution error breakdowns in future
        batch_errors += sum(1 for code in alignment.codes if code != Code.match)

    return batch_errors, batch_total
@register_criterion("ctc_loss")
class CTCCriterion(FairseqCriterion):
    """CTC criterion: sum-reduced ``F.ctc_loss`` plus greedy-decoding error counts."""

    def __init__(self, args, task):
        super().__init__(args, task)
        self.blank_idx = task.target_dictionary.index("<ctc_blank>")
        self.pad_idx = task.target_dictionary.pad()
        self.task = task

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        parser.add_argument(
            "--use-source-side-sample-size",
            action="store_true",
            default=False,
            help=(
                "when compute average loss, using number of source tokens "
                + "as denominator. "
                + "This argument will be no-op if sentence-avg is used."
            ),
        )

    def forward(self, model, sample, reduce=True, log_probs=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample["net_input"])
        lprobs = model.get_normalized_probs(net_output, log_probs=log_probs)

        if not hasattr(lprobs, "batch_first"):
            # Use the module logger (not the root logger) so the message is
            # attributed to this module like the rest of its logging.
            logger.warning(
                "ERROR: we need to know whether "
                "batch first for the encoder output; "
                "you need to set batch_first attribute for the return value of "
                "model.get_normalized_probs. Now, we assume this is true, but "
                "in the future, we will raise exception instead. "
            )
        batch_first = getattr(lprobs, "batch_first", True)

        if not batch_first:
            max_seq_len = lprobs.size(0)
            bsz = lprobs.size(1)
        else:
            max_seq_len = lprobs.size(1)
            bsz = lprobs.size(0)

        device = net_output["encoder_out"].device
        input_lengths = encoder_padding_mask_to_lengths(
            net_output["encoder_padding_mask"], max_seq_len, bsz, device
        )
        target_lengths = sample["target_lengths"]
        targets = sample["target"]

        if batch_first:
            # N T D -> T N D (F.ctc_loss expects this)
            lprobs = lprobs.transpose(0, 1)

        # F.ctc_loss takes the targets as one flat tensor without padding
        pad_mask = sample["target"] != self.pad_idx
        targets_flat = targets.masked_select(pad_mask)

        loss = F.ctc_loss(
            lprobs,
            targets_flat,
            input_lengths,
            target_lengths,
            blank=self.blank_idx,
            reduction="sum",
            zero_infinity=True,
        )

        lprobs = lprobs.transpose(0, 1)  # T N D -> N T D
        errors, total = compute_ctc_uer(
            lprobs, targets, input_lengths, target_lengths, self.blank_idx
        )

        if self.args.sentence_avg:
            sample_size = sample["target"].size(0)
        else:
            if self.args.use_source_side_sample_size:
                sample_size = torch.sum(input_lengths).item()
            else:
                sample_size = sample["ntokens"]

        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
            "errors": errors,
            "total": total,
            "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(),
        }

        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training.

        Sums every counter over ``logging_outputs`` and reports the loss in
        base 2. Ratios whose denominator is zero (e.g. an empty list of
        outputs) are reported as 0.0 instead of raising ZeroDivisionError.
        """
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
        errors = sum(log.get("errors", 0) for log in logging_outputs)
        total = sum(log.get("total", 0) for log in logging_outputs)
        nframes = sum(log.get("nframes", 0) for log in logging_outputs)

        agg_output = {
            "loss": (
                loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0
            ),
            "ntokens": ntokens,
            "nsentences": nsentences,
            "nframes": nframes,
            "sample_size": sample_size,
            "acc": (
                100.0 - min(errors * 100.0 / total, 100.0) if total > 0 else 0.0
            ),
        }
        if sample_size != ntokens:
            agg_output["nll_loss"] = (
                loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0
            )
        return agg_output
...@@ -58,3 +58,39 @@ def lengths_to_encoder_padding_mask(lengths, batch_first=False): ...@@ -58,3 +58,39 @@ def lengths_to_encoder_padding_mask(lengths, batch_first=False):
return encoder_padding_mask.t(), max_lengths return encoder_padding_mask.t(), max_lengths
else: else:
return encoder_padding_mask, max_lengths return encoder_padding_mask, max_lengths
def encoder_padding_mask_to_lengths(
    encoder_padding_mask, max_lengths, batch_size, device
):
    """
    convert encoder_padding_mask (2-D binary tensor) to a 1-D tensor

    Conventionally, encoder output contains a encoder_padding_mask, which is
    a 2-D mask in a shape (T, B), whose (t, b) element indicate whether
    encoder_out[t, b] is a valid output (=0) or not (=1). Occasionally, we
    need to convert this mask tensor to a 1-D tensor in shape (B, ), where
    [b] denotes the valid length of b-th sequence

    Args:
        encoder_padding_mask: a (T, B)-shaped binary tensor or None; if None,
            indicating all are valid
        max_lengths: maximum length of all sequence, if encoder_padding_mask is
            not None, max_lengths must equal to encoder_padding_mask.size(0)
        batch_size: batch size; if encoder_padding_mask is
            not None, batch_size must equal to encoder_padding_mask.size(1)
        device: which device to put the result on

    Return:
        seq_lengths: a (B,)-shaped tensor, where its (b, )-th element is the
            number of valid elements of b-th sequence
    """
    if encoder_padding_mask is None:
        # Every position is valid: build the int32 result directly on the
        # requested device instead of going through a float tensor.
        return torch.full(
            (batch_size,), max_lengths, dtype=torch.int32, device=device
        )

    assert encoder_padding_mask.size(0) == max_lengths, "max_lengths does not match"
    assert encoder_padding_mask.size(1) == batch_size, "batch_size does not match"

    # Summing the mask over the time axis counts the padded steps per sequence.
    seq_lengths = max_lengths - torch.sum(encoder_padding_mask, dim=0)
    # Honour `device` in this branch as well (no-op when already there).
    return seq_lengths.to(device)
...@@ -12,6 +12,7 @@ import torch.nn as nn ...@@ -12,6 +12,7 @@ import torch.nn as nn
from fairseq import utils from fairseq import utils
from fairseq.models import ( from fairseq.models import (
FairseqEncoder, FairseqEncoder,
FairseqEncoderModel,
FairseqIncrementalDecoder, FairseqIncrementalDecoder,
FairseqEncoderDecoderModel, FairseqEncoderDecoderModel,
register_model, register_model,
...@@ -709,6 +710,141 @@ class TransformerDecoder(FairseqIncrementalDecoder): ...@@ -709,6 +710,141 @@ class TransformerDecoder(FairseqIncrementalDecoder):
x = x.transpose(0, 1) x = x.transpose(0, 1)
return x return x
@register_model("asr_vggtransformer_encoder")
class VGGTransformerEncoderModel(FairseqEncoderModel):
    """Encoder-only VGG-Transformer model registered as ``asr_vggtransformer_encoder``."""

    def __init__(self, encoder):
        super().__init__(encoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        add = parser.add_argument
        add(
            "--input-feat-per-channel",
            type=int,
            metavar="N",
            help="encoder input dimension per input channel",
        )
        add(
            "--vggblock-enc-config",
            type=str,
            metavar="EXPR",
            help="""
an array of tuples each containing the configuration of one vggblock
[(out_channels, conv_kernel_size, pooling_kernel_size,num_conv_layers), ...]
""",
        )
        add(
            "--transformer-enc-config",
            type=str,
            metavar="EXPR",
            help="""
a tuple containing the configuration of the Transformer layers
configurations:
[(input_dim,
num_heads,
ffn_dim,
normalize_before,
dropout,
attention_dropout,
relu_dropout), ]""",
        )
        add(
            "--enc-output-dim",
            type=int,
            metavar="N",
            help="encoder output dimension, projecting the LSTM output",
        )
        add(
            "--in-channels",
            type=int,
            metavar="N",
            help="number of encoder input channels",
        )
        add(
            "--transformer-context",
            type=str,
            metavar="EXPR",
            help="""
either None or a tuple of two ints, indicating left/right context a
transformer can have access to""",
        )
        add(
            "--transformer-sampling",
            type=str,
            metavar="EXPR",
            help="""
either None or a tuple of ints, indicating sampling factor in each layer""",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # Fill in any architecture option the caller left unset.
        base_architecture_enconly(args)

        vocab_size = len(task.target_dictionary)
        # NOTE(review): the EXPR options below are evaluated with eval(), so
        # these command-line strings are executed as Python code; they must
        # only ever come from a trusted source.
        vggblock_config = eval(args.vggblock_enc_config)
        transformer_config = eval(args.transformer_enc_config)
        transformer_context = eval(args.transformer_context)
        transformer_sampling = eval(args.transformer_sampling)

        encoder = VGGTransformerEncoderOnly(
            vocab_size=vocab_size,
            input_feat_per_channel=args.input_feat_per_channel,
            vggblock_config=vggblock_config,
            transformer_config=transformer_config,
            encoder_output_dim=args.enc_output_dim,
            in_channels=args.in_channels,
            transformer_context=transformer_context,
            transformer_sampling=transformer_sampling,
        )
        return cls(encoder)

    def get_normalized_probs(self, net_output, log_probs, sample=None):
        """Return the parent's normalized probabilities transposed to batch-first.

        The returned tensor carries a ``batch_first = True`` attribute so that
        consumers can tell which layout they received.
        """
        # net_output['encoder_out'] is a (T, B, D) tensor, and so is the
        # parent's result
        time_major = super().get_normalized_probs(net_output, log_probs, sample)
        # swap the first two axes to obtain a (B, T, D) tensor
        batch_major = time_major.transpose(0, 1).contiguous()
        batch_major.batch_first = True
        return batch_major
class VGGTransformerEncoderOnly(VGGTransformerEncoder):
    """VGGTransformerEncoder followed by a linear projection to ``vocab_size`` outputs."""

    def __init__(
        self,
        vocab_size,
        input_feat_per_channel,
        vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG,
        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
        encoder_output_dim=512,
        in_channels=1,
        transformer_context=None,
        transformer_sampling=None,
    ):
        encoder_kwargs = dict(
            input_feat_per_channel=input_feat_per_channel,
            vggblock_config=vggblock_config,
            transformer_config=transformer_config,
            encoder_output_dim=encoder_output_dim,
            in_channels=in_channels,
            transformer_context=transformer_context,
            transformer_sampling=transformer_sampling,
        )
        super().__init__(**encoder_kwargs)
        # Output layer mapping encoder features to one score per vocabulary entry.
        self.fc_out = Linear(self.encoder_output_dim, vocab_size)

    def forward(self, src_tokens, src_lengths, **kwargs):
        """
        src_tokens: padded tensor (B, T, C * feat)
        src_lengths: tensor of original lengths of input utterances (B,)
        """
        encoded = super().forward(src_tokens, src_lengths)
        # No log_softmax here: model.get_normalized_probs applies the
        # normalization later.
        projected = self.fc_out(encoded["encoder_out"])
        padding_mask = encoded["encoder_padding_mask"]
        return {
            "encoder_out": projected,  # (T, B, C)
            "encoder_padding_mask": padding_mask,  # (T, B)
        }

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        # an arbitrary large number
        return (1e6, 1e6)
def Embedding(num_embeddings, embedding_dim, padding_idx): def Embedding(num_embeddings, embedding_dim, padding_idx):
m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
...@@ -836,3 +972,35 @@ def vggtransformer_base(args): ...@@ -836,3 +972,35 @@ def vggtransformer_base(args):
# - FC: 512*5000 = 256K (assuming vocab size 5K) # - FC: 512*5000 = 256K (assuming vocab size 5K)
# In total: # In total:
# ~65 M # ~65 M
# CTC models
def base_architecture_enconly(args):
    """Fill in default values for encoder-only architecture options on ``args``.

    Every option below is (re)assigned on ``args``: an attribute that already
    exists keeps its current value, a missing one receives the default.
    """
    fallbacks = (
        ("input_feat_per_channel", 40),
        ("vggblock_enc_config", "[(32, 3, 2, 2, True)] * 2"),
        ("transformer_enc_config", "((256, 4, 1024, True, 0.2, 0.2, 0.2),) * 2"),
        ("enc_output_dim", 512),
        ("in_channels", 1),
        ("transformer_context", "None"),
        ("transformer_sampling", "None"),
    )
    for option, default in fallbacks:
        setattr(args, option, getattr(args, option, default))
@register_model_architecture("asr_vggtransformer_encoder", "vggtransformer_enc_1")
def vggtransformer_enc_1(args):
    """Set the defaults of the ``vggtransformer_enc_1`` architecture on ``args``.

    Options already present on ``args`` keep their value.
    """
    # vggtransformer_1 is the same as vggtransformer_enc_big, except the number
    # of layers is increased to 16
    # keep it here for backward compatibility purposes
    fallbacks = (
        ("input_feat_per_channel", 80),
        ("vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"),
        (
            "transformer_enc_config",
            "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16",
        ),
        ("enc_output_dim", 1024),
    )
    for option, default in fallbacks:
        setattr(args, option, getattr(args, option, default))
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import re
from collections import deque
from enum import Enum
import numpy as np
"""
Utility modules for computation of Word Error Rate,
Alignments, as well as more granular metrics like
deletions, insertions and substitutions.
"""
class Code(Enum):
    """Edit operation assigned to one step of a reference/hypothesis alignment."""

    # reference and hypothesis tokens carry the same label
    match = 1
    # reference and hypothesis tokens are paired but their labels differ
    substitution = 2
    # a hypothesis token is consumed without a reference counterpart
    insertion = 3
    # a reference token is consumed without a hypothesis counterpart
    deletion = 4
class Token(object):
    """A labelled token with start and end values.

    When ``st`` is NaN (the default) the token is reset to an empty label
    with start and end of 0.0, regardless of ``lbl`` and ``en``.
    """

    def __init__(self, lbl="", st=np.nan, en=np.nan):
        if not np.isnan(st):
            self.label = lbl
            self.start = st
            self.end = en
            return
        self.label = ""
        self.start = 0.0
        self.end = 0.0
class AlignmentResult(object):
    """Plain container for the outcome of an alignment.

    Attributes are stored exactly as passed in:
    refs / hyps hold indices into the reference / hypothesis sequences,
    codes holds one ``Code`` per alignment step, score is the total cost.
    """

    def __init__(self, refs, hyps, codes, score):
        # Sequences of ints (reference / hypothesis positions).
        self.refs, self.hyps = refs, hyps
        # Sequence of Code values and the float alignment score.
        self.codes, self.score = codes, score
def coordinate_to_offset(row, col, ncols):
    """Flatten a (row, col) position of a row-major grid with ``ncols`` columns to an int offset."""
    flat_index = row * ncols + col
    return int(flat_index)
def offset_to_row(offset, ncols):
    """Recover the row index of a flat ``offset`` in a grid with ``ncols`` columns."""
    rows_spanned = offset / ncols
    # int() truncates the fractional part left by the true division.
    return int(rows_spanned)
def offset_to_col(offset, ncols):
    """Recover the column index of a flat ``offset`` in a grid with ``ncols`` columns."""
    remainder = offset % ncols
    return int(remainder)
def trimWhitespace(str):
    """Remove leading and trailing spaces and collapse runs of spaces to one.

    Only the space character is affected; other whitespace is left alone.
    """
    no_leading = re.sub("^ *", "", str)
    no_edges = re.sub(" *$", "", no_leading)
    return re.sub(" +", " ", no_edges)
def str2toks(str):
    """Split a string on single spaces (after trimming) into a list of ``Token``s.

    Each token is created with start and end values of 0.0.
    """
    words = trimWhitespace(str).split(" ")
    return [Token(word, 0.0, 0.0) for word in words]
class EditDistance(object):
    """Dynamic-programming aligner between a reference and a hypothesis token list.

    ``align`` fills a score matrix and a backtrace matrix, and ``get_result``
    walks the backtrace to produce an ``AlignmentResult``. Substitution
    pairs seen while backtracing are counted in ``confusion_pairs_``, which
    accumulates across calls on the same instance.
    """

    def __init__(self, time_mediated):
        # When true, costs are derived from token start/end values instead
        # of fixed per-operation penalties (see cost()).
        self.time_mediated_ = time_mediated
        # Placeholders; both are replaced by 2-D arrays on every align() call.
        self.scores_ = np.nan
        self.backtraces_ = np.nan
        # Maps "ref -> hyp" strings to the number of times that substitution occurred.
        self.confusion_pairs_ = {}

    def cost(self, ref, hyp, code):
        """Return the cost of applying edit operation ``code`` to ``ref`` / ``hyp``.

        ``ref`` may be None for insertions and ``hyp`` may be None for
        deletions; the corresponding branches do not read the missing token.
        """
        if self.time_mediated_:
            if code == Code.match:
                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end)
            elif code == Code.insertion:
                return hyp.end - hyp.start
            elif code == Code.deletion:
                return ref.end - ref.start
            else:  # substitution
                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end) + 0.1
        else:
            if code == Code.match:
                return 0
            elif code == Code.insertion or code == Code.deletion:
                return 3
            else:  # substitution
                return 4

    def get_result(self, refs, hyps):
        """Backtrace the matrices filled by ``align`` into an ``AlignmentResult``.

        Starts at the bottom-right cell and follows the stored backtrace
        offsets until offset 0 is reached, prepending one entry per step so
        the result reads front to back. Substitutions are also tallied in
        ``confusion_pairs_``.
        """
        res = AlignmentResult(refs=deque(), hyps=deque(), codes=deque(), score=np.nan)

        num_rows, num_cols = self.scores_.shape
        res.score = self.scores_[num_rows - 1, num_cols - 1]

        curr_offset = coordinate_to_offset(num_rows - 1, num_cols - 1, num_cols)

        while curr_offset != 0:
            curr_row = offset_to_row(curr_offset, num_cols)
            curr_col = offset_to_col(curr_offset, num_cols)

            prev_offset = self.backtraces_[curr_row, curr_col]

            prev_row = offset_to_row(prev_offset, num_cols)
            prev_col = offset_to_col(prev_offset, num_cols)

            # Matrix rows/cols are shifted by one relative to token indices.
            res.refs.appendleft(curr_row - 1)
            res.hyps.appendleft(curr_col - 1)

            if curr_row - 1 == prev_row and curr_col == prev_col:
                # moved up: a reference token with no hypothesis counterpart
                res.codes.appendleft(Code.deletion)
            elif curr_row == prev_row and curr_col - 1 == prev_col:
                # moved left: a hypothesis token with no reference counterpart
                res.codes.appendleft(Code.insertion)
            else:
                # diagonal move: curr_row - 1 == prev_row and curr_col - 1 == prev_col
                ref_str = refs[res.refs[0]].label
                hyp_str = hyps[res.hyps[0]].label

                if ref_str == hyp_str:
                    res.codes.appendleft(Code.match)
                else:
                    res.codes.appendleft(Code.substitution)

                    confusion_pair = "%s -> %s" % (ref_str, hyp_str)
                    self.confusion_pairs_[confusion_pair] = (
                        self.confusion_pairs_.get(confusion_pair, 0) + 1
                    )

            curr_offset = prev_offset

        return res

    def align(self, refs, hyps):
        """Align ``refs`` against ``hyps`` and return an ``AlignmentResult``.

        Two empty inputs yield an empty alignment (no codes, score 0.0): the
        1x1 matrix case needs no special handling, and returning a real
        result keeps ``.codes`` / ``.score`` usable for every caller.
        """
        # Fresh matrices for every call; each cell is written in the loop below.
        self.scores_ = np.zeros((len(refs) + 1, len(hyps) + 1))
        self.backtraces_ = np.zeros((len(refs) + 1, len(hyps) + 1))

        num_rows, num_cols = self.scores_.shape

        for i in range(num_rows):
            for j in range(num_cols):
                if i == 0 and j == 0:
                    self.scores_[i, j] = 0.0
                    self.backtraces_[i, j] = 0
                    continue

                if i == 0:
                    # first row: only insertions are possible
                    self.scores_[i, j] = self.scores_[i, j - 1] + self.cost(
                        None, hyps[j - 1], Code.insertion
                    )
                    self.backtraces_[i, j] = coordinate_to_offset(i, j - 1, num_cols)
                    continue

                if j == 0:
                    # first column: only deletions are possible
                    self.scores_[i, j] = self.scores_[i - 1, j] + self.cost(
                        refs[i - 1], None, Code.deletion
                    )
                    self.backtraces_[i, j] = coordinate_to_offset(i - 1, j, num_cols)
                    continue

                # Below here both i and j are greater than 0
                ref = refs[i - 1]
                hyp = hyps[j - 1]

                # Start from the diagonal move (match or substitution); the
                # other moves replace it only when strictly cheaper.
                best_score = self.scores_[i - 1, j - 1] + (
                    self.cost(ref, hyp, Code.match)
                    if (ref.label == hyp.label)
                    else self.cost(ref, hyp, Code.substitution)
                )

                prev_row = i - 1
                prev_col = j - 1

                ins = self.scores_[i, j - 1] + self.cost(None, hyp, Code.insertion)
                if ins < best_score:
                    best_score = ins
                    prev_row = i
                    prev_col = j - 1

                delt = self.scores_[i - 1, j] + self.cost(ref, None, Code.deletion)
                if delt < best_score:
                    best_score = delt
                    prev_row = i - 1
                    prev_col = j

                self.scores_[i, j] = best_score
                self.backtraces_[i, j] = coordinate_to_offset(
                    prev_row, prev_col, num_cols
                )

        return self.get_result(refs, hyps)
class WERTransformer(object):
    """Accumulates word-error statistics for hypothesis/reference string pairs.

    Constructing an instance immediately processes one ``hyp_str`` /
    ``ref_str`` pair; further pairs can be added through ``process``.
    """

    def __init__(self, hyp_str, ref_str, verbose=True):
        self.ed_ = EditDistance(False)
        # Maps utterance id -> [ins, dels, subs] of its lowest-error entry so far.
        self.id2oracle_errs_ = {}
        self.utts_ = 0
        self.words_ = 0
        self.insertions_ = 0
        self.deletions_ = 0
        self.substitutions_ = 0

        self.process(["dummy_str", hyp_str, ref_str])

        if verbose:
            print("'%s' vs '%s'" % (hyp_str, ref_str))
            self.report_result()

    def _wer_percent(self):
        """Return 100 * (ins + dels + subs) / words; requires ``words_`` > 0."""
        return (
            100.0
            * (self.insertions_ + self.deletions_ + self.substitutions_)
            / self.words_
        )

    def process(self, input):
        """Align one ``[<id>, ..., <hypo>, <ref>]`` record and update the counters.

        The last two elements of ``input`` are the hypothesis and reference
        strings; the first is the utterance id (text before the first "/").
        On success the reference length and the insertion, deletion and
        substitution counts are appended to ``input`` as strings.

        Returns None for malformed input, NaN when no alignment is
        available, and 0 on success.
        """
        if len(input) < 3:
            print(
                "Input must be of the form <id> ... <hypo> <ref> , got ",
                len(input),
                " inputs:",
            )
            return None

        # Align
        hyps = str2toks(input[-2])
        refs = str2toks(input[-1])

        alignment = self.ed_.align(refs, hyps)
        # Reject anything that is not an alignment object: None, or a
        # NaN-style sentinel without a `codes` attribute.
        if getattr(alignment, "codes", None) is None:
            print("Alignment is null")
            return np.nan

        # Tally errors
        ins = 0
        dels = 0
        subs = 0
        for code in alignment.codes:
            if code == Code.substitution:
                subs += 1
            elif code == Code.insertion:
                ins += 1
            elif code == Code.deletion:
                dels += 1

        # Output: the statistics are appended to the caller's list in place.
        input.append(str(len(refs)))
        input.append(str(ins))
        input.append(str(dels))
        input.append(str(subs))

        # Accumulate
        kIdIndex = 0
        kNBestSep = "/"

        pieces = input[kIdIndex].split(kNBestSep)

        if len(pieces) == 0:
            print(
                "Error splitting ",
                input[kIdIndex],
                " on '",
                kNBestSep,
                "', got empty list",
            )
            return np.nan

        utt_id = pieces[0]
        if utt_id not in self.id2oracle_errs_:
            # First entry for this utterance: it contributes to the totals.
            self.utts_ += 1
            self.words_ += len(refs)
            self.insertions_ += ins
            self.deletions_ += dels
            self.substitutions_ += subs
            self.id2oracle_errs_[utt_id] = [ins, dels, subs]
        else:
            # Later entries only update the per-utterance oracle record,
            # and only when they have fewer errors.
            curr_err = ins + dels + subs
            prev_err = np.sum(self.id2oracle_errs_[utt_id])
            if curr_err < prev_err:
                self.id2oracle_errs_[utt_id] = [ins, dels, subs]

        return 0

    def report_result(self):
        """Print a one-line WER summary, or a notice when no words were counted."""
        if self.words_ == 0:
            print("No words counted")
            return

        # 1-best
        best_wer = self._wer_percent()

        print(
            "\tWER = %0.2f%% (%i utts, %i words, %0.2f%% ins, "
            "%0.2f%% dels, %0.2f%% subs)"
            % (
                best_wer,
                self.utts_,
                self.words_,
                100.0 * self.insertions_ / self.words_,
                100.0 * self.deletions_ / self.words_,
                100.0 * self.substitutions_ / self.words_,
            )
        )

    def wer(self):
        """Return the word error rate in percent, or NaN when no words were counted."""
        if self.words_ == 0:
            return np.nan
        return self._wer_percent()

    def stats(self):
        """Return the accumulated statistics as a dict (empty when no words were counted)."""
        if self.words_ == 0:
            return {}
        return {
            "wer": self._wer_percent(),
            "utts": self.utts_,
            "numwords": self.words_,
            "ins": self.insertions_,
            "dels": self.deletions_,
            "subs": self.substitutions_,
            "confusion_pairs": self.ed_.confusion_pairs_,
        }
def calc_wer(hyp_str, ref_str):
    """Return the word error rate (in percent) of ``hyp_str`` against ``ref_str``."""
    return WERTransformer(hyp_str, ref_str, verbose=0).wer()
def calc_wer_stats(hyp_str, ref_str):
    """Return the ``WERTransformer.stats()`` dict for ``hyp_str`` against ``ref_str``."""
    return WERTransformer(hyp_str, ref_str, verbose=0).stats()
def get_wer_alignment_codes(hyp_str, ref_str):
    """
    INPUT: hypothesis string, reference string
    OUTPUT: List of alignment codes (intermediate results from WER computation)
    """
    # Align once with a fresh non-time-mediated EditDistance. Building a
    # WERTransformer first would run the same alignment an extra time and
    # discard it, without changing the codes returned here.
    aligner = EditDistance(False)
    return aligner.align(str2toks(ref_str), str2toks(hyp_str)).codes
def merge_counts(x, y):
    """Add the counts of ``y`` into ``x`` in place and return ``x``.

    Both arguments map keys to counts; keys missing from ``x`` start at 0.
    This can be used for example to merge confusion pair counts:
    conf_pairs = merge_counts(conf_pairs, stats['confusion_pairs'])
    """
    for key, count in y.items():
        x[key] = x.get(key, 0) + count
    return x
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment