# decoder.py

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Deep speech decoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools

from nltk.metrics import distance
import numpy as np


class DeepSpeechDecoder(object):
  """Greedy decoder implementation for Deep Speech model."""

  def __init__(self, labels, blank_index=28):
    """Decoder initialization.

    Arguments:
      labels: a string specifying the speech labels for the decoder to use.
        The character at position i is the one emitted for class index i.
      blank_index: an integer specifying index for the blank character.
        Defaults to 28.
    """
    # e.g. labels = "[a-z]' _"
    self.labels = labels
    self.blank_index = blank_index
    # Lookup table from class index to its label character.
    self.int_to_char = dict(enumerate(labels))

  def convert_to_string(self, sequence):
    """Convert a sequence of indexes into corresponding string.

    Args:
      sequence: an iterable of integer indexes into `labels`.

    Returns:
      The string formed by concatenating the label of every index, in order.

    Raises:
      KeyError: if an index has no entry in `int_to_char`.
    """
    return ''.join([self.int_to_char[i] for i in sequence])

  def wer(self, decode, target):
    """Computes the word-level edit distance used for Word Error Rate (WER).

    Both sentences are tokenized into words by splitting on whitespace, and the
    edit distance is computed between the two word sequences.

    Note that the value returned is the raw edit distance: it is NOT divided by
    the number of words in `target`. Any normalization into a rate is left to
    the caller.

    Args:
      decode: string of the decoded output.
      target: a string for the ground truth label.

    Returns:
      The edit distance between the word sequences of `decode` and `target`.
    """
    # Compare the word lists directly instead of mapping every word to a
    # surrogate character first: the edit distance only needs element equality,
    # and this avoids the character-range limit of chr().
    return distance.edit_distance(decode.split(), target.split())

  def cer(self, decode, target):
    """Computes the character-level edit distance used for Character Error Rate.

    Note that the value returned is the raw edit distance between the two
    strings: it is NOT divided by the length of `target`.

    Args:
      decode: a string of the decoded output.
      target: a string for the ground truth label.

    Returns:
      The edit distance between `decode` and `target`.
    """
    return distance.edit_distance(decode, target)

  def decode(self, logits):
    """Decode the best guess from logits using greedy algorithm.

    Args:
      logits: a 2-D array-like of scores; the best class is picked along
        axis 1 for every row (presumably rows are time steps and columns are
        label classes -- confirm against the caller).

    Returns:
      The decoded string, after merging consecutive repeated classes and
      dropping every occurrence of `blank_index`.
    """
    # Choose the class with maximum probability.
    best = np.argmax(logits, axis=1)
    # Merge repeated chars. This must happen before blanks are removed so that
    # a blank between two identical classes keeps them as separate characters.
    merged = (k for k, _ in itertools.groupby(best))
    # Remove the blank index in the decoded sequence.
    merge_remove_blank = [k for k in merged if k != self.blank_index]

    return self.convert_to_string(merge_remove_blank)