audio_io_tutorial.py 12.6 KB
Newer Older
1
2
3
4
5
# -*- coding: utf-8 -*-
"""
Audio I/O
=========

moto's avatar
moto committed
6
7
This tutorial shows how to use TorchAudio's basic I/O API to load audio files
into PyTorch's Tensor object, and save Tensor objects to audio files.
8
9
10
11
12
13
14
15
16
"""

import torch
import torchaudio

# Show the versions of the two libraries this tutorial runs against.
print(torch.__version__)
print(torchaudio.__version__)

######################################################################
moto's avatar
moto committed
17
18
# Preparation
# -----------
19
#
moto's avatar
moto committed
20
21
22
23
24
25
26
27
28
# First, we import the modules and download the audio assets we use in this tutorial.
#
# .. note::
#    When running this tutorial in Google Colab, install the required packages
#    with the following:
#
#    .. code::
#
#       !pip install boto3
29
30
31
32

import io
import os
import tarfile
moto's avatar
moto committed
33
import tempfile
34
35

import boto3
36
37
import matplotlib.pyplot as plt
import requests
38
39
from botocore import UNSIGNED
from botocore.config import Config
moto's avatar
moto committed
40
41
from IPython.display import Audio
from torchaudio.utils import download_asset
42

moto's avatar
moto committed
43
44
45
# Fetch the sample files used by the rest of the tutorial: a GSM file, a WAV
# file, and an 8000 Hz variant of the same WAV file (going by the asset names).
# Each name holds whatever ``download_asset`` returns
# (presumably a local file path -- confirm against the torchaudio docs).
SAMPLE_GSM = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.gsm")
SAMPLE_WAV = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
SAMPLE_WAV_8000 = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042-8000hz.wav")
46

47
48
49


######################################################################
50
51
# Querying audio metadata
# -----------------------
52
#
53
54
# Function :py:func:`torchaudio.info` fetches audio metadata.
# You can provide a path-like object or file-like object.
55
56
#

moto's avatar
moto committed
57
metadata = torchaudio.info(SAMPLE_WAV)
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
print(metadata)

######################################################################
# Where
#
# -  ``sample_rate`` is the sampling rate of the audio
# -  ``num_channels`` is the number of channels
# -  ``num_frames`` is the number of frames per channel
# -  ``bits_per_sample`` is bit depth
# -  ``encoding`` is the sample coding format
#
# ``encoding`` can take on one of the following values:
#
# -  ``"PCM_S"``: Signed integer linear PCM
# -  ``"PCM_U"``: Unsigned integer linear PCM
# -  ``"PCM_F"``: Floating point linear PCM
# -  ``"FLAC"``: Flac, `Free Lossless Audio
#    Codec <https://xiph.org/flac/>`__
# -  ``"ULAW"``: Mu-law,
#    [`wikipedia <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`__]
# -  ``"ALAW"``: A-law
#    [`wikipedia <https://en.wikipedia.org/wiki/A-law_algorithm>`__]
# -  ``"MP3"``: MP3, MPEG-1 Audio Layer III
# -  ``"VORBIS"``: OGG Vorbis [`xiph.org <https://xiph.org/vorbis/>`__]
# -  ``"AMR_NB"``: Adaptive Multi-Rate
#    [`wikipedia <https://en.wikipedia.org/wiki/Adaptive_Multi-Rate_audio_codec>`__]
# -  ``"AMR_WB"``: Adaptive Multi-Rate Wideband
#    [`wikipedia <https://en.wikipedia.org/wiki/Adaptive_Multi-Rate_Wideband>`__]
# -  ``"OPUS"``: Opus [`opus-codec.org <https://opus-codec.org/>`__]
# -  ``"GSM"``: GSM-FR
#    [`wikipedia <https://en.wikipedia.org/wiki/Full_Rate>`__]
moto's avatar
moto committed
89
# -  ``"HTK"``: Single channel 16-bit PCM
90
91
92
93
94
95
96
97
98
99
# -  ``"UNKNOWN"``: None of the above
#

######################################################################
# **Note**
#
# -  ``bits_per_sample`` can be ``0`` for formats with compression and/or
#    variable bit rate (such as MP3).
# -  ``num_frames`` can be ``0`` for GSM-FR format.
#
moto's avatar
moto committed
100
101
102
103

# Query the GSM sample; for GSM-FR the reported ``num_frames`` can be ``0``.
metadata = torchaudio.info(SAMPLE_GSM)
print(metadata)

104
105
106

######################################################################
# Querying file-like object
moto's avatar
moto committed
107
# -------------------------
108
#
109
# :py:func:`torchaudio.info` works on file-like objects.
110
111
#

moto's avatar
moto committed
112
113
url = "https://download.pytorch.org/torchaudio/tutorial-assets/steam-train-whistle-daniel_simon.wav"
with requests.get(url, stream=True) as response:
114
    metadata = torchaudio.info(response.raw)
115
116
117
print(metadata)

######################################################################
moto's avatar
moto committed
118
# .. note::
119
#
moto's avatar
moto committed
120
121
122
123
124
125
#    When passing a file-like object, ``info`` does not read
#    all of the underlying data; rather, it reads only a portion
#    of the data from the beginning.
#    Therefore, for a given audio format, it may not be able to retrieve the
#    correct metadata, including the format itself. In such a case, you
#    can pass the ``format`` argument to specify the format of the audio.
126
127

######################################################################
moto's avatar
moto committed
128
129
# Loading audio data
# ------------------
130
#
131
# To load audio data, you can use :py:func:`torchaudio.load`.
132
133
134
135
136
137
138
#
# This function accepts a path-like object or file-like object as input.
#
# The returned value is a tuple of waveform (``Tensor``) and sample rate
# (``int``).
#
# By default, the resulting tensor object has ``dtype=torch.float32`` and
moto's avatar
moto committed
139
# its value range is ``[-1.0, 1.0]``.
140
141
142
143
144
#
# For the list of supported formats, please refer to `the torchaudio
# documentation <https://pytorch.org/audio>`__.
#

moto's avatar
moto committed
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
waveform, sample_rate = torchaudio.load(SAMPLE_WAV)


######################################################################
#
def plot_waveform(waveform, sample_rate):
    """Draw every channel of ``waveform`` against time, one subplot per channel.

    Args:
        waveform: Two-dimensional tensor unpacked as ``(channel, frame)``;
            it is converted with ``.numpy()`` before plotting.
        sample_rate: Frames per second, used to turn frame indices into the
            values of the x axis.
    """
    samples = waveform.numpy()
    num_channels, num_frames = samples.shape
    seconds = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    # With a single row the result is one Axes object rather than a sequence,
    # so wrap it to keep the loop below uniform.
    panels = [axes] if num_channels == 1 else axes
    label_channels = num_channels > 1
    for index, panel in enumerate(panels):
        panel.plot(seconds, samples[index], linewidth=1)
        panel.grid(True)
        if label_channels:
            panel.set_ylabel(f"Channel {index+1}")
    figure.suptitle("waveform")
    plt.show(block=False)

167

moto's avatar
moto committed
168
169
######################################################################
#
170
plot_waveform(waveform, sample_rate)
moto's avatar
moto committed
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192


######################################################################
#
def plot_specgram(waveform, sample_rate, title="Spectrogram"):
    """Draw a spectrogram of every channel of ``waveform``, one subplot per channel.

    Args:
        waveform: Two-dimensional tensor unpacked as ``(channel, frame)``;
            it is converted with ``.numpy()`` before plotting.
        sample_rate: Sampling rate, passed to ``Axes.specgram`` as ``Fs``.
        title: Text used as the figure title.
    """
    samples = waveform.numpy()
    # Only the channel count is needed; the unpacking still requires a 2-D input.
    num_channels, _ = samples.shape

    figure, axes = plt.subplots(num_channels, 1)
    # With a single row the result is one Axes object rather than a sequence,
    # so wrap it to keep the loop below uniform.
    panels = [axes] if num_channels == 1 else axes
    label_channels = num_channels > 1
    for index, panel in enumerate(panels):
        panel.specgram(samples[index], Fs=sample_rate)
        if label_channels:
            panel.set_ylabel(f"Channel {index+1}")
    figure.suptitle(title)
    plt.show(block=False)


######################################################################
#
193
194
195
plot_specgram(waveform, sample_rate)


moto's avatar
moto committed
196
197
198
199
######################################################################
#
Audio(waveform.numpy()[0], rate=sample_rate)

200
201
######################################################################
# Loading from file-like object
moto's avatar
moto committed
202
# -----------------------------
203
#
moto's avatar
moto committed
204
205
# The I/O functions support file-like objects.
# This allows for fetching and decoding audio data from locations
206
207
208
209
# within and beyond the local file system.
# The following examples illustrate this.
#

moto's avatar
moto committed
210
211
212
######################################################################
#

213
# Load audio data via an HTTP request
moto's avatar
moto committed
214
215
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
with requests.get(url, stream=True) as response:
216
    waveform, sample_rate = torchaudio.load(response.raw)
217
218
plot_specgram(waveform, sample_rate, title="HTTP datasource")

moto's avatar
moto committed
219
220
221
######################################################################
#

222
# Load audio from tar file
moto's avatar
moto committed
223
224
225
226
tar_path = download_asset("tutorial-assets/VOiCES_devkit.tar.gz")
tar_item = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
with tarfile.open(tar_path, mode="r") as tarfile_:
    fileobj = tarfile_.extractfile(tar_item)
227
    waveform, sample_rate = torchaudio.load(fileobj)
228
229
plot_specgram(waveform, sample_rate, title="TAR file")

moto's avatar
moto committed
230
231
232
######################################################################
#

233
# Load audio from S3
moto's avatar
moto committed
234
235
bucket = "pytorch-tutorial-assets"
key = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
236
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
moto's avatar
moto committed
237
response = client.get_object(Bucket=bucket, Key=key)
238
waveform, sample_rate = torchaudio.load(response["Body"])
239
240
241
242
243
plot_specgram(waveform, sample_rate, title="From S3")


######################################################################
# Tips on slicing
moto's avatar
moto committed
244
# ---------------
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#
# Providing ``num_frames`` and ``frame_offset`` arguments restricts
# decoding to the corresponding segment of the input.
#
# The same result can be achieved using vanilla Tensor slicing,
# (i.e. ``waveform[:, frame_offset:frame_offset+num_frames]``). However,
# providing ``num_frames`` and ``frame_offset`` arguments is more
# efficient.
#
# This is because the function will end data acquisition and decoding
# once it finishes decoding the requested frames. This is advantageous
# when the audio data are transferred via network as the data transfer will
# stop as soon as the necessary amount of data is fetched.
#
# The following example illustrates this.
#

# Illustration of two different decoding methods.
# The first one will fetch all the data and decode them, while
# the second one will stop fetching data once it completes decoding.
# The resulting waveforms are identical.

frame_offset, num_frames = 16000, 16000  # Skip 16000 frames, then decode the next 16000 (seconds 1-2, assuming a 16 kHz source)

moto's avatar
moto committed
269
url = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
270
print("Fetching all the data...")
moto's avatar
moto committed
271
with requests.get(url, stream=True) as response:
272
    waveform1, sample_rate1 = torchaudio.load(response.raw)
273
    waveform1 = waveform1[:, frame_offset : frame_offset + num_frames]
274
    print(f" - Fetched {response.raw.tell()} bytes")
275
276

print("Fetching until the requested frames are available...")
moto's avatar
moto committed
277
with requests.get(url, stream=True) as response:
278
    waveform2, sample_rate2 = torchaudio.load(response.raw, frame_offset=frame_offset, num_frames=num_frames)
279
    print(f" - Fetched {response.raw.tell()} bytes")
280
281
282
283
284
285
286
287
288
289

# Confirm that slicing after a full decode (``waveform1``) and decoding only the
# requested segment (``waveform2``) yield element-wise identical tensors.
print("Checking the resulting waveform ... ", end="")
assert (waveform1 == waveform2).all()
print("matched!")

######################################################################
# Saving audio to file
# --------------------
#
# To save audio data in formats interpretable by common applications,
290
# you can use :py:func:`torchaudio.save`.
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#
# This function accepts a path-like object or file-like object.
#
# When passing a file-like object, you also need to provide argument ``format``
# so that the function knows which format it should use. In the
# case of a path-like object, the function will infer the format from
# the extension. If you are saving to a file without an extension, you need
# to provide argument ``format``.
#
# When saving WAV-formatted data, the default encoding for ``float32`` Tensor
# is 32-bit floating-point PCM. You can provide arguments ``encoding`` and
# ``bits_per_sample`` to change this behavior. For example, to save data
# in 16-bit signed integer PCM, you can do the following.
#
moto's avatar
moto committed
305
306
307
# .. note::
#
#    Saving data in encodings with a lower bit depth reduces the
308
309
310
#    resulting file size, but also the precision.
#

moto's avatar
moto committed
311
312
waveform, sample_rate = torchaudio.load(SAMPLE_WAV)

313

moto's avatar
moto committed
314
315
######################################################################
#
316

moto's avatar
moto committed
317
318
319
320
321
322
323
324
325
326
def inspect_file(path):
    """Print a short report for the audio file at ``path``.

    The report consists of the path itself, the file size in bytes as given by
    ``os.path.getsize``, and the metadata returned by ``torchaudio.info``.

    Args:
        path: Location of the file to describe.
    """
    rule = "-" * 10
    print(rule)
    print("Source:", path)
    print(rule)
    size_in_bytes = os.path.getsize(path)
    print(f" - File size: {size_in_bytes} bytes")
    info = torchaudio.info(path)
    print(f" - {info}")
    print()

######################################################################
#
327
328
329
# Save without any encoding option.
# The function will pick the encoding that
# fits the provided data.
moto's avatar
moto committed
330
331
332
333
# Write into a temporary directory; it and the file are removed when the
# ``with`` block exits, so the file is inspected inside the block.
with tempfile.TemporaryDirectory() as tempdir:
    path = f"{tempdir}/save_example_default.wav"
    # No ``encoding`` / ``bits_per_sample`` given: ``torchaudio.save`` chooses them.
    torchaudio.save(path, waveform, sample_rate)
    inspect_file(path)
334

moto's avatar
moto committed
335
336
######################################################################
#
337
338
# Save as 16-bit signed integer Linear PCM
# The resulting file occupies half the storage but loses precision
moto's avatar
moto committed
339
340
341
342
# Write into a temporary directory; it and the file are removed when the
# ``with`` block exits, so the file is inspected inside the block.
with tempfile.TemporaryDirectory() as tempdir:
    path = f"{tempdir}/save_example_PCM_S16.wav"
    # Explicitly request signed-integer PCM at 16 bits per sample.
    torchaudio.save(path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
    inspect_file(path)
343
344
345


######################################################################
moto's avatar
moto committed
346
# :py:func:`torchaudio.save` can also handle other formats.
347
# To name a few:
348
349
350
#

formats = [
351
352
353
354
355
356
    "flac",
    "vorbis",
    "sph",
    "amb",
    "amr-nb",
    "gsm",
357
358
]

moto's avatar
moto committed
359
360
361
362
363
364
365
366
######################################################################
#
# Use the 8000 Hz variant of the sample for this demonstration
# (presumably because some of the listed codecs only accept low sample
# rates -- confirm against the torchaudio/backend documentation).
waveform, sample_rate = torchaudio.load(SAMPLE_WAV_8000)
# The temporary directory and everything saved into it are removed when the
# ``with`` block exits, so each file is inspected right after it is written.
with tempfile.TemporaryDirectory() as tempdir:
    # The loop variable is ``fmt`` rather than ``format``: at module level a
    # variable named ``format`` would shadow the builtin for the rest of the
    # script.
    for fmt in formats:
        path = f"{tempdir}/save_example.{fmt}"
        torchaudio.save(path, waveform, sample_rate, format=fmt)
        inspect_file(path)
367
368
369

######################################################################
# Saving to file-like object
moto's avatar
moto committed
370
# --------------------------
371
372
373
374
375
376
377
#
# Similar to the other I/O functions, you can save audio to file-like
# objects. When saving to a file-like object, argument ``format`` is
# required.
#


moto's avatar
moto committed
378
waveform, sample_rate = torchaudio.load(SAMPLE_WAV)
379
380
381
382
383
384
385

# Saving to bytes buffer
# ``format`` is passed explicitly because an in-memory buffer has no file
# extension from which the format could be inferred.
buffer_ = io.BytesIO()
torchaudio.save(buffer_, waveform, sample_rate, format="wav")

# Rewind to the start of the buffer and show the first 16 bytes that were written.
buffer_.seek(0)
print(buffer_.read(16))