#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
# ==============================================================================
"""Download and preprocess LibriSpeech dataset for DeepSpeech model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import fnmatch
import os
import sys
import tarfile
import tempfile
import unicodedata

from absl import app as absl_app
from absl import flags as absl_flags
import pandas
from six.moves import urllib
from sox import Transformer
import tensorflow as tf

LIBRI_SPEECH_URLS = {
    "train-clean-100":
        "http://www.openslr.org/resources/12/train-clean-100.tar.gz",
    "train-clean-360":
        "http://www.openslr.org/resources/12/train-clean-360.tar.gz",
    "train-other-500":
        "http://www.openslr.org/resources/12/train-other-500.tar.gz",
    "dev-clean":
        "http://www.openslr.org/resources/12/dev-clean.tar.gz",
    "dev-other":
        "http://www.openslr.org/resources/12/dev-other.tar.gz",
    "test-clean":
        "http://www.openslr.org/resources/12/test-clean.tar.gz",
    "test-other":
        "http://www.openslr.org/resources/12/test-other.tar.gz"
}


def download_and_extract(directory, url):
  """Download and extract the given split of dataset.

  Args:
    directory: the directory where to extract the tarball.
    url: the url to download the data file.
  """

  if not tf.io.gfile.exists(directory):
    tf.io.gfile.makedirs(directory)

  tmp_fd, tar_filepath = tempfile.mkstemp(suffix=".tar.gz")
  os.close(tmp_fd)  # Close the open descriptor; only the path is needed.

  try:
    tf.compat.v1.logging.info("Downloading %s to %s" % (url, tar_filepath))

    def _progress(count, block_size, total_size):
      # total_size may be -1 or 0 if the server omits Content-Length.
      percent = 100.0 * count * block_size / max(total_size, 1)
      sys.stdout.write(
          "\r>> Downloading {} {:.1f}%".format(tar_filepath, percent))
      sys.stdout.flush()

    urllib.request.urlretrieve(url, tar_filepath, _progress)
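    # The progress callback leaves the cursor mid-line; end it with a newline.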
    print()
    statinfo = os.stat(tar_filepath)
    tf.compat.v1.logging.info(
        "Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
    with tarfile.open(tar_filepath, "r") as tar:
      tar.extractall(directory)
  finally:
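    # Always remove the temporary tarball, whether or not extraction succeeded.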
    tf.io.gfile.remove(tar_filepath)


def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
  """Convert FLAC to WAV and split the transcript.

  For each audio file, convert the format from FLAC to WAV using the
  sox.Transformer library.
  For transcripts, each line contains the sequence id and the corresponding
  transcript (separated by a space):
  Input data format: seq-id transcript_of_seq-id
  For example:
   1-2-0 transcript_of_1-2-0.flac
   1-2-1 transcript_of_1-2-1.flac
   ...

  Each sequence id has a corresponding .flac file.
  Parse the transcript file and generate a new csv file with three columns:
  "wav_filename": the absolute path to a wav file.
  "wav_filesize": the size of the corresponding wav file.
  "transcript": the transcript for this audio segment.

  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
                 e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv
  """

  tf.compat.v1.logging.info(
      "Preprocessing audio and transcript for %s" % source_name)
  source_dir = os.path.join(input_dir, source_name)
  target_dir = os.path.join(input_dir, target_name)

  if not tf.io.gfile.exists(target_dir):
    tf.io.gfile.makedirs(target_dir)

  files = []
  tfm = Transformer()
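  # A single sox Transformer instance is reused for every FLAC -> WAV build.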
  # Convert all FLAC files into WAV format and, at the same time, collect the
  # rows for the csv file.
  for root, _, filenames in tf.io.gfile.walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          seqid, transcript = line.split(" ", 1)
          # We do an encode-decode round trip here because encode() returns a
          # bytes object, which must be converted back to a string.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()

          # Convert FLAC to WAV.
          flac_file = os.path.join(root, seqid + ".flac")
          wav_file = os.path.join(target_dir, seqid + ".wav")
          if not tf.io.gfile.exists(wav_file):
            tfm.build(flac_file, wav_file)
          wav_filesize = os.path.getsize(wav_file)

          files.append((os.path.abspath(wav_file), wav_filesize, transcript))

  # Write to CSV file which contains three columns:
  # "wav_filename", "wav_filesize", "transcript".
  csv_file_path = os.path.join(output_dir, output_file)
  df = pandas.DataFrame(
      data=files, columns=["wav_filename", "wav_filesize", "transcript"])
  df.to_csv(csv_file_path, index=False, sep="\t")
  tf.compat.v1.logging.info(
      "Successfully generated csv file {}".format(csv_file_path))


def download_and_process_datasets(directory, datasets):
  """Download and pre-process the specified list of LibriSpeech dataset.

  Args:
    directory: the directory to put all the downloaded and preprocessed data.
    datasets: list of dataset names that will be downloaded and processed.
  """

  tf.compat.v1.logging.info(
      "Preparing LibriSpeech datasets: {}".format(",".join(datasets)))
  for dataset in datasets:
    tf.compat.v1.logging.info("Preparing dataset %s", dataset)
    dataset_dir = os.path.join(directory, dataset)
    download_and_extract(dataset_dir, LIBRI_SPEECH_URLS[dataset])
    convert_audio_and_split_transcript(
        os.path.join(dataset_dir, "LibriSpeech"), dataset, dataset + "-wav",
        os.path.join(dataset_dir, "LibriSpeech"), dataset + ".csv")


def define_data_download_flags():
  """Define flags for data downloading."""
  absl_flags.DEFINE_string(
      "data_dir", "/tmp/librispeech_data",
      "Directory to download data and extract the tarball")
  absl_flags.DEFINE_bool("train_only", False,
                         "If true, only download the training set")
  absl_flags.DEFINE_bool("dev_only", False,
                         "If true, only download the dev set")
  absl_flags.DEFINE_bool("test_only", False,
                         "If true, only download the test set")


def main(_):
  if not tf.io.gfile.exists(FLAGS.data_dir):
    tf.io.gfile.makedirs(FLAGS.data_dir)

  if FLAGS.train_only:
    download_and_process_datasets(
        FLAGS.data_dir,
        ["train-clean-100", "train-clean-360", "train-other-500"])
  elif FLAGS.dev_only:
    download_and_process_datasets(FLAGS.data_dir, ["dev-clean", "dev-other"])
  elif FLAGS.test_only:
    download_and_process_datasets(FLAGS.data_dir, ["test-clean", "test-other"])
  else:
    # By default we download the entire dataset.
    download_and_process_datasets(FLAGS.data_dir, LIBRI_SPEECH_URLS.keys())


if __name__ == "__main__":
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  define_data_download_flags()
  FLAGS = absl_flags.FLAGS
  absl_app.run(main)