tacotron2_pipeline_tutorial.py 10.7 KB
Newer Older
moto's avatar
moto committed
1
"""
Text-to-Speech with Tacotron2
=============================

**Author**: `Yao-Yuan Yang <https://github.com/yangarbiter>`__,
`Moto Hira <moto@meta.com>`__

"""

######################################################################
# Overview
# --------
13
#
moto's avatar
moto committed
14
15
# This tutorial shows how to build a text-to-speech pipeline using the
# pretrained Tacotron2 in torchaudio.
16
#
moto's avatar
moto committed
17
# The text-to-speech pipeline goes as follows:
18
#
moto's avatar
moto committed
19
# 1. Text preprocessing
20
#
moto's avatar
moto committed
21
22
#    First, the input text is encoded into a list of symbols. In this
#    tutorial, we will use English characters and phonemes as the symbols.
23
#
moto's avatar
moto committed
24
# 2. Spectrogram generation
25
#
26
#    From the encoded text, a spectrogram is generated. We use the ``Tacotron2``
moto's avatar
moto committed
27
#    model for this.
28
#
moto's avatar
moto committed
29
# 3. Time-domain conversion
30
#
moto's avatar
moto committed
31
#    The last step is converting the spectrogram into the waveform. The
32
#    component that generates speech from a spectrogram is called a vocoder.
moto's avatar
moto committed
33
#    In this tutorial, three different vocoders are used,
moto's avatar
moto committed
34
35
#    :py:class:`~torchaudio.models.WaveRNN`,
#    :py:class:`~torchaudio.transforms.GriffinLim`, and
moto's avatar
moto committed
36
#    `Nvidia's WaveGlow <https://pytorch.org/hub/nvidia_deeplearningexamples_tacotron2/>`__.
37
38
#
#
moto's avatar
moto committed
39
# The following figure illustrates the whole process.
40
#
moto's avatar
moto committed
41
# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/tacotron2_tts_pipeline.png
42
#
43
# All the related components are bundled in :py:class:`torchaudio.pipelines.Tacotron2TTSBundle`,
moto's avatar
moto committed
44
45
46
47
48
# but this tutorial will also cover the process under the hood.

######################################################################
# Preparation
# -----------
49
#
moto's avatar
moto committed
50
51
52
# First, we install the necessary dependencies. In addition to
# ``torchaudio``, ``DeepPhonemizer`` is required to perform phoneme-based
# encoding.
53
#
moto's avatar
moto committed
54

moto's avatar
moto committed
55
56
57
58
59
# %%
#  .. code-block:: bash
#
#      %%bash
#      pip3 install deep_phonemizer
moto's avatar
moto committed
60
61
62
63
64
65
66
67
68
69
70
71

import torch
import torchaudio

# Fix the seed of the default random number generator.
torch.random.manual_seed(0)
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Report the library versions and the selected device.
print(torch.__version__)
print(torchaudio.__version__)
print(device)


72
73
74
75
76
77
78
######################################################################
#

import IPython
import matplotlib.pyplot as plt


moto's avatar
moto committed
79
80
81
######################################################################
# Text Processing
# ---------------
82
#
moto's avatar
moto committed
83
84
85
86
87


######################################################################
# Character-based encoding
# ~~~~~~~~~~~~~~~~~~~~~~~~
88
#
moto's avatar
moto committed
89
90
# In this section, we will go through how the character-based encoding
# works.
91
#
moto's avatar
moto committed
92
# Since the pre-trained Tacotron2 model expects a specific set of symbol
93
94
# tables, the same functionality is available in ``torchaudio``. However,
# we will first manually implement the encoding to aid in understanding.
95
#
96
# First, we define the set of symbols
moto's avatar
moto committed
97
98
# ``'_-!\'(),.:;? abcdefghijklmnopqrstuvwxyz'``. Then, we will map
# each character of the input text into the index of the corresponding
99
# symbol in the table. Symbols that are not in the table are ignored.
moto's avatar
moto committed
100

101
symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
moto's avatar
moto committed
102
103
104
look_up = {s: i for i, s in enumerate(symbols)}
symbols = set(symbols)

105

moto's avatar
moto committed
106
def text_to_sequence(text):
107
108
109
    text = text.lower()
    return [look_up[s] for s in text if s in symbols]

moto's avatar
moto committed
110
111
112
113
114
115
116

# Encode a sample sentence with the hand-written encoder and print the IDs.
text = "Hello world! Text to speech!"
print(text_to_sequence(text))


######################################################################
# As mentioned above, the symbol table and indices must match
117
118
# what the pretrained Tacotron2 model expects. ``torchaudio`` provides the same
# transform along with the pretrained model. You can
moto's avatar
moto committed
119
# instantiate and use such a transform as follows.
120
#
moto's avatar
moto committed
121
122
123
124
125
126
127
128
129
130
131

# Fetch the text processor of the character-based bundle.
processor = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH.get_text_processor()

text = "Hello world! Text to speech!"
# The processor returns two values: the encoded text and its lengths.
processed, lengths = processor(text)

print(processed)
print(lengths)


######################################################################
132
# Note: The output of our manual encoding matches the output of the
# ``torchaudio`` ``text_processor``, meaning we correctly re-implemented what
# the library does internally. The text processor takes either a single text
# or a list of texts as input.
moto's avatar
moto committed
133
134
135
# When a list of texts is provided, the returned ``lengths`` variable
# represents the valid length of each processed token sequence in the
# output batch.
136
#
137
# The intermediate representation can be retrieved as follows:
138
#
moto's avatar
moto committed
139

140
# Take the first ``lengths[0]`` IDs of the first batch item and look each one
# up in ``processor.tokens``.
print([processor.tokens[i] for i in processed[0, : lengths[0]]])
moto's avatar
moto committed
141
142
143
144
145


######################################################################
# Phoneme-based encoding
# ~~~~~~~~~~~~~~~~~~~~~~
146
#
moto's avatar
moto committed
147
148
149
# Phoneme-based encoding is similar to character-based encoding, but it
# uses a symbol table based on phonemes and a G2P (Grapheme-to-Phoneme)
# model.
150
#
151
# The details of the G2P model are out of the scope of this tutorial; we will
moto's avatar
moto committed
152
# just look at what the conversion looks like.
153
#
moto's avatar
moto committed
154
155
156
# Similar to the case of character-based encoding, the encoding process is
# expected to match what a pretrained Tacotron2 model is trained on.
# ``torchaudio`` has an interface to create the process.
157
#
moto's avatar
moto committed
158
159
160
161
# The following code illustrates how to make and use the process. Behind
# the scenes, a G2P model is created using the ``DeepPhonemizer`` package,
# and the pretrained weights published by the author of ``DeepPhonemizer``
# are fetched.
162
#
moto's avatar
moto committed
163
164
165
166
167
168
169

# Phoneme-based bundle; its text processor is obtained the same way as the
# character-based one.
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

processor = bundle.get_text_processor()

text = "Hello world! Text to speech!"
# Run the processor under inference mode so no autograd state is recorded.
with torch.inference_mode():
    processed, lengths = processor(text)

print(processed)
print(lengths)


######################################################################
# Notice that the encoded values are different from the example of
# character-based encoding.
179
#
moto's avatar
moto committed
180
# The intermediate representation looks like the following.
181
#
moto's avatar
moto committed
182

183
# Take the first ``lengths[0]`` IDs of the first batch item and look each one
# up in ``processor.tokens``.
print([processor.tokens[i] for i in processed[0, : lengths[0]]])
moto's avatar
moto committed
184
185
186
187
188


######################################################################
# Spectrogram Generation
# ----------------------
189
#
moto's avatar
moto committed
190
191
192
# ``Tacotron2`` is the model we use to generate a spectrogram from the
# encoded text. For the details of the model, please refer to `the
# paper <https://arxiv.org/abs/1712.05884>`__.
193
#
194
# It is easy to instantiate a Tacotron2 model with pretrained weights,
moto's avatar
moto committed
195
196
# however, note that the input to Tacotron2 models needs to be processed
# by the matching text processor.
197
#
198
# :py:class:`torchaudio.pipelines.Tacotron2TTSBundle` bundles the matching
moto's avatar
moto committed
199
# models and processors together so that it is easy to create the pipeline.
200
#
201
202
# For the available bundles and their usage, please refer to
# :py:class:`~torchaudio.pipelines.Tacotron2TTSBundle`.
203
#
moto's avatar
moto committed
204
205
206
207
208
209
210
211

# The text processor and the Tacotron2 model come from the same bundle so
# that the encoding matches what the model expects.
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)

text = "Hello world! Text to speech!"

with torch.inference_mode():
    processed, lengths = processor(text)
    # Move the inputs to the device the model was moved to.
    processed = processed.to(device)
    lengths = lengths.to(device)
    # Only the first of the three returned values is used in this cell.
    spec, _, _ = tacotron2.infer(processed, lengths)


# Display the spectrogram of the first item in the batch.
_ = plt.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
moto's avatar
moto committed
219
220
221
222


######################################################################
# Note that ``Tacotron2.infer`` method performs multinomial sampling,
223
# therefore, the process of generating the spectrogram incurs randomness.
224
#
moto's avatar
moto committed
225

226
227
228
229
230
231
232
233
234
235
236

def plot():
    """Generate and display three spectrograms for the same encoded text.

    Reads the module-level ``tacotron2``, ``processed`` and ``lengths``.
    ``Tacotron2.infer`` is called once per subplot and the shape of each
    resulting spectrogram is printed before it is drawn.
    """
    _, axes = plt.subplots(3, 1)
    for axis in axes:
        with torch.inference_mode():
            mel, _, _ = tacotron2.infer(processed, lengths)
        first = mel[0]
        print(first.shape)
        axis.imshow(first.cpu().detach(), origin="lower", aspect="auto")


plot()
moto's avatar
moto committed
237
238
239
240
241


######################################################################
# Waveform Generation
# -------------------
242
#
moto's avatar
moto committed
243
# Once the spectrogram is generated, the last process is to recover the
244
# waveform from the spectrogram using a vocoder.
245
#
moto's avatar
moto committed
246
247
# ``torchaudio`` provides vocoders based on ``GriffinLim`` and
# ``WaveRNN``.
248
#
moto's avatar
moto committed
249
250
251


######################################################################
252
253
# WaveRNN Vocoder
# ~~~~~~~~~~~~~~~
254
#
moto's avatar
moto committed
255
256
# Continuing from the previous section, we can instantiate the matching
# WaveRNN model from the same bundle.
257
#
moto's avatar
moto committed
258
259
260
261
262
263
264
265
266
267

# Text processor, Tacotron2 and vocoder are all taken from the same bundle.
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

text = "Hello world! Text to speech!"

with torch.inference_mode():
    processed, lengths = processor(text)
    processed = processed.to(device)
    lengths = lengths.to(device)
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
    # NOTE: ``lengths`` is re-bound here, from the text processor's lengths
    # to the second value returned by the vocoder.
    waveforms, lengths = vocoder(spec, spec_lengths)
moto's avatar
moto committed
273

274
275
276
277
278
279
######################################################################
#


def plot(waveforms, spec, sample_rate):
    """Plot a waveform above its spectrogram and return an audio player.

    Args:
        waveforms: Batch of waveforms; only the first one is plotted and
            played.
        spec: Batch of spectrograms; only the first one is shown.
        sample_rate: Sample rate handed to the audio widget.

    Returns:
        IPython.display.Audio: Player for the first waveform.
    """
    waveforms = waveforms.cpu().detach()

    fig, [ax1, ax2] = plt.subplots(2, 1)
    ax1.plot(waveforms[0])
    ax1.set_xlim(0, waveforms.size(-1))
    ax1.grid(True)
    ax2.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
    # Slice with 0:1 (rather than index 0) so the leading dimension is kept.
    return IPython.display.Audio(waveforms[0:1], rate=sample_rate)


plot(waveforms, spec, vocoder.sample_rate)
moto's avatar
moto committed
290
291
292


######################################################################
293
294
# Griffin-Lim Vocoder
# ~~~~~~~~~~~~~~~~~~~
295
#
moto's avatar
moto committed
296
# Using the Griffin-Lim vocoder is the same as using WaveRNN. You can instantiate
297
# the vocoder object with
298
299
# :py:func:`~torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder`
# method and pass the spectrogram.
300
#
moto's avatar
moto committed
301
302
303
304
305
306
307
308

# Same pipeline as before, but with the bundle whose vocoder is Griffin-Lim.
bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH

processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

# ``text`` is not re-assigned in this cell; its current module-level value
# is used.
with torch.inference_mode():
    processed, lengths = processor(text)
    processed = processed.to(device)
    lengths = lengths.to(device)
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
# NOTE: the vocoder is invoked outside the ``inference_mode`` block.
waveforms, lengths = vocoder(spec, spec_lengths)

315
316
######################################################################
#
moto's avatar
moto committed
317

318
# Plot the result, using the sample rate reported by the vocoder object.
plot(waveforms, spec, vocoder.sample_rate)
moto's avatar
moto committed
319
320
321


######################################################################
322
323
# Waveglow Vocoder
# ~~~~~~~~~~~~~~~~
324
#
325
326
# Waveglow is a vocoder published by Nvidia. The pretrained weights are
# published on Torch Hub. One can instantiate the model using ``torch.hub``
moto's avatar
moto committed
327
# module.
328
#
moto's avatar
moto committed
329
330
331

# Workaround to load model mapped on GPU
# https://stackoverflow.com/a/61840832
332
333
334
335
336
337
338
339
340
341
342
# Build the model without pretrained weights; the checkpoint is downloaded
# separately below so that ``map_location`` can place it on ``device``.
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    model_math="fp32",
    pretrained=False,
)
checkpoint = torch.hub.load_state_dict_from_url(
    "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
    progress=False,
    map_location=device,
)
# Remove every "module." substring from the checkpoint keys before loading.
# NOTE(review): presumably a prefix left by a parallel wrapper when the
# checkpoint was saved -- confirm.
state_dict = {key.replace("module.", ""): value for key, value in checkpoint["state_dict"].items()}

waveglow.load_state_dict(state_dict)
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
waveglow.eval()

# ``spec`` is whatever spectrogram is currently bound at module level.
with torch.no_grad():
    waveforms = waveglow.infer(spec)
moto's avatar
moto committed
352

353
354
######################################################################
#
moto's avatar
moto committed
355

356
# NOTE(review): the sample rate is hard-coded to 22050 here instead of being
# read from a vocoder object -- presumably the rate of the WaveGlow
# checkpoint; confirm.
plot(waveforms, spec, 22050)