create_timit_dataset.py

# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Preprocesses TIMIT from raw wavfiles to create a set of TFRecords.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import glob
import os
import random
import re

import numpy as np
import tensorflow as tf

tf.app.flags.DEFINE_string("raw_timit_dir", None,
                          "Directory containing TIMIT files.")
tf.app.flags.DEFINE_string("out_dir", None,
                          "Output directory for TFRecord files.")
tf.app.flags.DEFINE_float("valid_frac", 0.05,
                          "Fraction of train set to use as valid set. "
                          "Must be between 0.0 and 1.0.")

tf.app.flags.mark_flag_as_required("raw_timit_dir")
tf.app.flags.mark_flag_as_required("out_dir")

FLAGS = tf.app.flags.FLAGS

NUM_TRAIN_FILES = 4620
NUM_TEST_FILES = 1680
SAMPLES_PER_TIMESTEP = 200

# Regexes for reading SPHERE header files.
SAMPLE_COUNT_REGEX = re.compile(r"sample_count -i (\d+)")
SAMPLE_MIN_REGEX = re.compile(r"sample_min -i (-?\d+)")
SAMPLE_MAX_REGEX = re.compile(r"sample_max -i (-?\d+)")


def get_filenames(split):
  """Get all wav filenames from the TIMIT archive."""
  path = os.path.join(FLAGS.raw_timit_dir, "TIMIT", split, "*", "*", "*.WAV")
  # Sort the output by name so the order is deterministic.
  files = sorted(glob.glob(path))
  return files


def load_timit_wav(filename):
  """Loads a TIMIT wavfile into a numpy array.

  TIMIT wavfiles include a SPHERE header, detailed in the TIMIT docs. The first
  line is the header type and the second is the length of the header in bytes.
  After the header, the remaining bytes are actual WAV data.

  The header includes information about the WAV data such as the number of
  samples and minimum and maximum amplitude. This function asserts that the
  loaded wav data matches the header.

  Args:
    filename: The name of the TIMIT wavfile to load.
  Returns:
    wav: A numpy array containing the loaded wav data.
  """
  wav_file = open(filename, "rb")
  header_type = wav_file.readline()
  header_length_str = wav_file.readline()
  # The header length includes the length of the first two lines.
  header_remaining_bytes = (int(header_length_str) - len(header_type) -
                            len(header_length_str))
  header = wav_file.read(header_remaining_bytes)
  # Read the relevant header fields.
  sample_count = int(SAMPLE_COUNT_REGEX.search(header).group(1))
  sample_min = int(SAMPLE_MIN_REGEX.search(header).group(1))
  sample_max = int(SAMPLE_MAX_REGEX.search(header).group(1))
  wav = np.fromstring(wav_file.read(), dtype="int16").astype("float32")
  # Check that the loaded data conforms to the header description.
  assert len(wav) == sample_count
  assert wav.min() == sample_min
  assert wav.max() == sample_max
  return wav


def preprocess(wavs, block_size, mean, std):
  """Normalize the wav data and reshape it into chunks."""
  processed_wavs = []
  for wav in wavs:
    wav = (wav - mean) / std
    wav_length = wav.shape[0]
    if wav_length % block_size != 0:
      pad_width = block_size - (wav_length % block_size)
      wav = np.pad(wav, (0, pad_width), "constant")
    assert wav.shape[0] % block_size == 0
    wav = wav.reshape((-1, block_size))
    processed_wavs.append(wav)
  return processed_wavs


def create_tfrecord_from_wavs(wavs, output_file):
  """Writes processed wav files to disk as sharded TFRecord files."""
  with tf.python_io.TFRecordWriter(output_file) as builder:
    for wav in wavs:
      builder.write(wav.astype(np.float32).tobytes())


def main(unused_argv):
  train_filenames = get_filenames("TRAIN")
  test_filenames = get_filenames("TEST")

  num_train_files = len(train_filenames)
  num_test_files = len(test_filenames)
  num_valid_files = int(num_train_files * FLAGS.valid_frac)
  num_train_files -= num_valid_files

  print("%d train / %d valid / %d test" % (
      num_train_files, num_valid_files, num_test_files))

  random.seed(1234)
  random.shuffle(train_filenames)

  valid_filenames = train_filenames[:num_valid_files]
  train_filenames = train_filenames[num_valid_files:]

  # Make sure there is no overlap in the train, test, and valid sets.
  train_s = set(train_filenames)
  test_s = set(test_filenames)
  valid_s = set(valid_filenames)
  # Disable explicit length testing to make the assertions more readable.
  # pylint: disable=g-explicit-length-test
  assert len(train_s & test_s) == 0
  assert len(train_s & valid_s) == 0
  assert len(valid_s & test_s) == 0
  # pylint: enable=g-explicit-length-test

  train_wavs = [load_timit_wav(f) for f in train_filenames]
  valid_wavs = [load_timit_wav(f) for f in valid_filenames]
  test_wavs = [load_timit_wav(f) for f in test_filenames]
  assert len(train_wavs) + len(valid_wavs) == NUM_TRAIN_FILES
  assert len(test_wavs) == NUM_TEST_FILES

  # Calculate the mean and standard deviation of the train set.
  train_stacked = np.hstack(train_wavs)
  train_mean = np.mean(train_stacked)
  train_std = np.std(train_stacked)
  print("train mean: %f  train std: %f" % (train_mean, train_std))

  # Process all data, normalizing with the train set statistics.
  processed_train_wavs = preprocess(train_wavs, SAMPLES_PER_TIMESTEP,
                                    train_mean, train_std)
  processed_valid_wavs = preprocess(valid_wavs, SAMPLES_PER_TIMESTEP,
                                    train_mean, train_std)
  processed_test_wavs = preprocess(test_wavs, SAMPLES_PER_TIMESTEP, train_mean,
                                   train_std)

  # Write the datasets to disk.
  create_tfrecord_from_wavs(
      processed_train_wavs,
      os.path.join(FLAGS.out_dir, "train"))
  create_tfrecord_from_wavs(
      processed_valid_wavs,
      os.path.join(FLAGS.out_dir, "valid"))
  create_tfrecord_from_wavs(
      processed_test_wavs,
      os.path.join(FLAGS.out_dir, "test"))


if __name__ == "__main__":
  tf.app.run()