# dataset.py

#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
# ==============================================================================
"""Generate tf.data.Dataset object for deep speech training/evaluation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import multiprocessing

import numpy as np
import scipy.io.wavfile as wavfile
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

import data.featurizer as featurizer  # pylint: disable=g-bad-import-order


class AudioConfig(object):
  """Settings that control how spectrograms are extracted from audio."""

  def __init__(self,
               sample_rate,
               frame_length,
               frame_step,
               fft_length=None,
               normalize=False,
               spect_type="linear"):
    """Record the audio feature extraction settings on the instance.

    Every argument is stored unchanged as an attribute of the same name.

    Args:
      sample_rate: an integer, the sample rate of the input waveform.
      frame_length: an integer, the length of one spectrogram frame, in ms.
      frame_step: an integer, the stride between frames, in ms.
      fft_length: an integer, the number of fft bins. Defaults to None.
      normalize: a boolean, whether the audio feature should be normalized.
      spect_type: a string naming the type of spectrogram to extract.
    """
    # Waveform framing parameters.
    self.sample_rate, self.frame_length, self.frame_step = (
        sample_rate, frame_length, frame_step)
    # Spectrogram options.
    self.fft_length, self.normalize, self.spect_type = (
        fft_length, normalize, spect_type)


class DatasetConfig(object):
  """Config class for generating the DeepSpeechDataset."""

  def __init__(self, audio_config, data_path, vocab_file_path):
    """Initialize the configs for deep speech dataset.

    Args:
      audio_config: AudioConfig object specifying the audio-related configs.
      data_path: a string denoting the full path of a manifest file.
      vocab_file_path: a string specifying the vocabulary file path.

    Raises:
      RuntimeError: if `data_path` or `vocab_file_path` does not exist.
    """
    # Validate with an explicit raise instead of `assert`: assertions are
    # removed when Python runs with -O, which would let a missing file slip
    # through, and the documented error type is RuntimeError.
    for path in (data_path, vocab_file_path):
      if not tf.gfile.Exists(path):
        raise RuntimeError("File path does not exist: {}".format(path))

    self.audio_config = audio_config
    self.data_path = data_path
    self.vocab_file_path = vocab_file_path


def _normalize_audio_feature(audio_feature):
  """Apply mean and variance normalization to a spectrogram feature.

  The mean and variance are taken along axis 0 of `audio_feature`. A small
  constant (1e-6) is added to the standard deviation so that a zero-variance
  column does not cause a division by zero.

  Args:
    audio_feature: a numpy array for the spectrogram feature.

  Returns:
    a new numpy array holding the normalized spectrogram.
  """
  centered = audio_feature - np.mean(audio_feature, axis=0)
  std_dev = np.sqrt(np.var(audio_feature, axis=0))
  return centered / (std_dev + 1e-6)


def _preprocess_audio(
    audio_file_path, audio_sample_rate, audio_featurizer, normalize):
  """Read a wav file and compute its spectrogram feature.

  Args:
    audio_file_path: path of the wav file to read.
    audio_sample_rate: the sample rate the file is required to have.
    audio_featurizer: object whose `frame_length`, `frame_step` and
      `fft_length` attributes are forwarded to the spectrogram computation.
    normalize: if true, apply mean and variance normalization to the feature.

  Returns:
    the result of `featurizer.compute_spectrogram_feature` for the file,
    normalized when `normalize` is true.
  """
  tf.logging.info(
      "Extracting spectrogram feature for {}".format(audio_file_path))
  file_sample_rate, samples = wavfile.read(audio_file_path)
  # NOTE: this check is an `assert`, so it is skipped under `python -O`.
  assert file_sample_rate == audio_sample_rate

  # float32/float64 data is used as-is. Anything else is treated as integer
  # samples (np.iinfo requires an integer dtype) and divided by the largest
  # value of that integer type.
  already_float = samples.dtype in (np.float32, np.float64)
  if not already_float:
    int_max = np.iinfo(samples.dtype).max
    samples = samples.astype(np.float32) / int_max

  spectrogram = featurizer.compute_spectrogram_feature(
      samples,
      audio_featurizer.frame_length,
      audio_featurizer.frame_step,
      audio_featurizer.fft_length)
  return _normalize_audio_feature(spectrogram) if normalize else spectrogram


def _preprocess_transcript(transcript, token_to_index):
  """Convert a transcript into its label feature.

  Args:
    transcript: the transcript to encode.
    token_to_index: the mapping passed on to
      `featurizer.compute_label_feature`.

  Returns:
    the value returned by `featurizer.compute_label_feature`.
  """
  label_feature = featurizer.compute_label_feature(transcript, token_to_index)
  return label_feature


def _preprocess_data(dataset_config, audio_featurizer, token_to_index):
  """Generate the feature and label lists for every example in the manifest.

  The manifest file has one header row followed by one row per example. Rows
  are tab-separated with three columns: "wav_filename", "wav_filesize", and
  "transcript". Examples are ordered by increasing "wav_filesize" (used as a
  proxy for audio length), so audio samples that end up in the same mini-batch
  have similar lengths. Blank rows are ignored.

  Args:
    dataset_config: an instance of DatasetConfig.
    audio_featurizer: an instance of AudioFeaturizer.
    token_to_index: the mapping from character to its index.

  Returns:
    a (features, labels) tuple of lists, in the sorted example order, holding
    the results of `_preprocess_audio` and `_preprocess_transcript`.
  """

  file_path = dataset_config.data_path
  sample_rate = dataset_config.audio_config.sample_rate
  normalize = dataset_config.audio_config.normalize

  with tf.gfile.Open(file_path, "r") as f:
    lines = f.read().splitlines()
  # Skip the header row, and drop blank rows so they cannot break the sort
  # key below (a blank row has no "wav_filesize" column).
  lines = [line.split("\t") for line in lines[1:] if line.strip()]
  # Sort input data by the size of the wav file.
  lines.sort(key=lambda item: int(item[1]))

  # Use multiprocessing for feature/label extraction.
  num_cores = multiprocessing.cpu_count()
  pool = multiprocessing.Pool(processes=num_cores)
  try:
    features = pool.map(
        functools.partial(
            _preprocess_audio, audio_sample_rate=sample_rate,
            audio_featurizer=audio_featurizer, normalize=normalize),
        [line[0] for line in lines])
    labels = pool.map(
        functools.partial(
            _preprocess_transcript, token_to_index=token_to_index),
        [line[2] for line in lines])
  finally:
    # Always shut the workers down, even when a worker raised, so that a
    # failure does not leave the child processes behind.
    pool.terminate()
  return features, labels


class DeepSpeechDataset(object):
  """Preprocessed features and labels for DeepSpeech training/evaluation."""

  def __init__(self, dataset_config):
    """Build the featurizers and preprocess the whole dataset.

    All features and labels are computed here, at construction time, and
    kept on the instance as `features` and `labels`.

    Args:
      dataset_config: DatasetConfig object.
    """
    self.config = dataset_config
    audio_config = self.config.audio_config

    # Audio feature extractor, configured from the audio settings.
    self.audio_featurizer = featurizer.AudioFeaturizer(
        sample_rate=audio_config.sample_rate,
        frame_length=audio_config.frame_length,
        frame_step=audio_config.frame_step,
        fft_length=audio_config.fft_length)

    # Text feature extractor, built from the vocabulary file.
    self.text_featurizer = featurizer.TextFeaturizer(
        vocab_file=self.config.vocab_file_path)
    self.speech_labels = self.text_featurizer.speech_labels

    self.features, self.labels = _preprocess_data(
        self.config, self.audio_featurizer, self.text_featurizer.token_to_idx)

    # The bin count is read from the first feature; it stays None when the
    # dataset is empty.
    if len(self.features):
      self.num_feature_bins = self.features[0].shape[1]
    else:
      self.num_feature_bins = None


def input_fn(batch_size, deep_speech_dataset, repeat=1):
  """Build the input pipeline for model training and evaluation.

  Each element of the dataset is a dict with the keys "features", "labels",
  "input_length" and "label_length". Elements are repeated, then grouped into
  padded batches, then prefetched.

  Args:
    batch_size: an integer denoting the size of a batch.
    deep_speech_dataset: DeepSpeechDataset object.
    repeat: an integer for how many times to repeat the dataset.

  Returns:
    a tf.data.Dataset object for model to consume.
  """
  all_features = deep_speech_dataset.features
  all_labels = deep_speech_dataset.labels
  feature_bins = deep_speech_dataset.num_feature_bins

  def _element_shapes():
    # Per-example shapes; `None` marks the dimensions that vary per example.
    return {
        "features": tf.TensorShape([None, feature_bins, 1]),
        "labels": tf.TensorShape([None]),
        "input_length": tf.TensorShape([1]),
        "label_length": tf.TensorShape([1])
    }

  def _gen_data():
    for index, spectrogram in enumerate(all_features):
      label = all_labels[index]
      yield {
          # Append a trailing axis of size 1 to the 2-D spectrogram.
          "features": np.expand_dims(spectrogram, axis=2),
          "labels": label,
          "input_length": [spectrogram.shape[0]],
          "label_length": [len(label)]
      }

  element_types = {
      "features": tf.float32,
      "labels": tf.int32,
      "input_length": tf.int32,
      "label_length": tf.int32
  }
  examples = tf.data.Dataset.from_generator(
      _gen_data,
      output_types=element_types,
      output_shapes=_element_shapes())

  # Repeat, pad each batch up to its longest example, then prefetch one batch
  # to improve the speed of the input pipeline.
  return examples.repeat(repeat).padded_batch(
      batch_size=batch_size,
      padded_shapes=_element_shapes()).prefetch(1)