featurizer.py

#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
# ==============================================================================
"""Utility class for extracting features from the text and audio input."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import numpy as np
from scipy import signal


def compute_spectrogram_feature(waveform, frame_length, frame_step, fft_length):
  """Compute the spectrograms for the input waveform."""
  _, _, stft = signal.stft(
      waveform,
      nperseg=frame_length,
      noverlap=frame_step,
      nfft=fft_length)

  # Perform transpose to set its shape as [time_steps, feature_num_bins]
  spectrogram = np.transpose(np.absolute(stft), (1, 0))
  return spectrogram


class AudioFeaturizer(object):
  """Class to extract spectrogram features from the audio input."""

  def __init__(self,
               sample_rate=16000,
               frame_length=25,
               frame_step=10,
               fft_length=None):
    """Initialize the audio featurizer class according to the configs.

    Args:
      sample_rate: an integer specifying the sample rate of the input waveform.
      frame_length: an integer for the length of a spectrogram frame, in ms.
      frame_step: an integer for the frame stride, in ms.
      fft_length: an integer for the number of fft bins.
    """
    self.frame_length = int(sample_rate * frame_length / 1e3)
    self.frame_step = int(sample_rate * frame_step / 1e3)
    self.fft_length = fft_length if fft_length else int(2**(np.ceil(
        np.log2(self.frame_length))))


def compute_label_feature(text, token_to_idx):
  """Convert string to a list of integers."""
  tokens = list(text.strip().lower())
  feats = [token_to_idx[token] for token in tokens]
  return feats


class TextFeaturizer(object):
  """Extract text feature based on char-level granularity.

  By looking up the vocabulary table, each input string (one line of transcript)
  will be converted to a sequence of integer indexes.
  """

  def __init__(self, vocab_file):
    lines = []
    with codecs.open(vocab_file, "r", "utf-8") as fin:
      lines.extend(fin.readlines())
    self.token_to_idx = {}
    self.idx_to_token = {}
    self.speech_labels = ""
    idx = 0
    for line in lines:
      line = line[:-1]  # Strip the '\n' char.
      if line.startswith("#"):
        # Skip from reading comment line.
        continue
      self.token_to_idx[line] = idx
      self.idx_to_token[idx] = line
      self.speech_labels += line
      idx += 1