Unverified Commit 9e7aeea1 authored by yangarbiter's avatar yangarbiter Committed by GitHub
Browse files

Add tacotron2 training script (#1642)

parent af652ca6
This is an example pipeline for text-to-speech using Tacotron2.
## Install required packages
Required packages
```bash
pip install librosa tqdm inflect
```
To use tensorboard
```bash
pip install tensorboard pillow
```
## Training Tacotron2
The training of Tacotron2 can be invoked with the following command.
```bash
python train.py \
--learning-rate 1e-3 \
--epochs 1501 \
--anneal-steps 500 1000 1500 \
--anneal-factor 0.1 \
--batch-size 96 \
--weight-decay 1e-6 \
--grad-clip 1.0 \
--text-preprocessor character \
--logging-dir ./logs \
--checkpoint-path ./ckpt.pth \
--dataset-path ./
```
The training script will use all GPUs that are available. Please set the
environment variable `CUDA_VISIBLE_DEVICES` if you don't want all GPUs to be used.
The newest checkpoint will be saved to `./ckpt.pth` and the checkpoint with the best validation
loss will be saved to `./best_ckpt.pth`.
The training log will be saved to `./logs/train.log` and the tensorboard results will also
be in `./logs`.
If `./ckpt.pth` already exists, this script will automatically load the file and try to continue
training from the checkpoint.
This command takes around 36 hours to train on 8 NVIDIA Tesla V100 GPUs.
\ No newline at end of file
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
from typing import Tuple, Callable, List
import torch
from torch import Tensor
from torch.utils.data.dataset import random_split
from torchaudio.datasets import LJSPEECH
class SpectralNormalization(torch.nn.Module):
    r"""Compress a spectrogram to the log scale.

    Values are clamped to a floor of 1e-5 before the log so that silent
    (zero) bins do not produce ``-inf``.
    """

    def forward(self, input):
        floored = input.clamp(min=1e-5)
        return floored.log()
class InverseSpectralNormalization(torch.nn.Module):
    r"""Invert :class:`SpectralNormalization` by exponentiating the input.

    Note: values clamped to the 1e-5 floor in the forward direction are
    not recoverable.
    """

    def forward(self, input):
        return input.exp()
class MapMemoryCache(torch.utils.data.Dataset):
    r"""Wrap a dataset so that, whenever a new item is returned, it is saved to memory.

    Items are fetched lazily: the first access of index ``n`` reads from the
    wrapped dataset and stores the result; later accesses hit the cache.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        # One slot per index; None means "not fetched yet".
        self._cache = [None for _ in range(len(dataset))]

    def __getitem__(self, n):
        cached = self._cache[n]
        if cached is not None:
            return cached

        fetched = self.dataset[n]
        self._cache[n] = fetched
        return fetched

    def __len__(self):
        return len(self.dataset)
class Processed(torch.utils.data.Dataset):
    r"""Dataset wrapper that turns raw datapoints into (text, mel) training pairs.

    For each wrapped item it applies ``transforms`` to ``item[0]`` (the
    waveform) and ``text_preprocessor`` to ``item[2]`` (the transcript) —
    indices matching the LJSPEECH datapoint layout used by this pipeline.
    """

    def __init__(self, dataset, transforms, text_preprocessor):
        self.dataset = dataset
        self.transforms = transforms
        self.text_preprocessor = text_preprocessor

    def __getitem__(self, key):
        return self.process_datapoint(self.dataset[key])

    def __len__(self):
        return len(self.dataset)

    def process_datapoint(self, item):
        mel_specgram = self.transforms(item[0])
        encoded_text = torch.IntTensor(self.text_preprocessor(item[2]))
        # Drop the leading channel dimension of the spectrogram.
        return encoded_text, mel_specgram.squeeze(0)
def split_process_dataset(dataset: str,
                          file_path: str,
                          val_ratio: float,
                          transforms: Callable,
                          text_preprocessor: Callable[[str], List[int]],
                          ) -> Tuple[torch.utils.data.Dataset, torch.utils.data.Dataset]:
    """Returns the training and validation datasets.

    Args:
        dataset (str): The dataset to use. Available options: [`'ljspeech'`]
        file_path (str): Path to the data.
        val_ratio (float): Fraction of the data held out for validation,
            in the range [0.0, 1.0].
        transforms (callable): A function/transform that takes in a waveform and
            returns a transformed waveform (mel spectrogram in this example).
        text_preprocessor (callable): A function that takes in a string and
            returns a list of integers representing each of the symbols in the string.

    Returns:
        train_dataset (`torch.utils.data.Dataset`): The training set.
        val_dataset (`torch.utils.data.Dataset`): The validation set.

    Raises:
        ValueError: If ``dataset`` is not a supported dataset name.
    """
    if dataset == 'ljspeech':
        data = LJSPEECH(root=file_path, download=False)

        val_length = int(len(data) * val_ratio)
        lengths = [len(data) - val_length, val_length]
        train_dataset, val_dataset = random_split(data, lengths)
    else:
        raise ValueError(f"Expected datasets: `ljspeech`, but found {dataset}")

    # Apply the mel-spectrogram / text transforms lazily, caching each
    # processed datapoint in memory after first access.
    train_dataset = Processed(train_dataset, transforms, text_preprocessor)
    val_dataset = Processed(val_dataset, transforms, text_preprocessor)

    train_dataset = MapMemoryCache(train_dataset)
    val_dataset = MapMemoryCache(val_dataset)

    return train_dataset, val_dataset
def text_mel_collate_fn(batch: List[Tuple[Tensor, Tensor]],
                        n_frames_per_step: int = 1) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
    """The collate function padding and adjusting the data based on `n_frames_per_step`.

    Modified from https://github.com/NVIDIA/DeepLearningExamples

    Args:
        batch (list of tuples): Each element is ``(text, mel_specgram)`` — the
            order produced by ``Processed``: ``text`` is an integer tensor with
            shape (n_characters, ) and ``mel_specgram`` is a tensor with shape
            (n_mels, n_frames).
        n_frames_per_step (int): The number of frames to advance every step.

    Returns:
        text_padded (Tensor): The input text to Tacotron2 with shape (n_batch, max of ``text_lengths``).
        text_lengths (Tensor): The length of each text with shape (n_batch).
        mel_specgram_padded (Tensor): The target mel spectrogram
            with shape (n_batch, n_mels, max of ``mel_specgram_lengths``)
        mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape (n_batch).
        gate_padded (Tensor): The ground truth gate output
            with shape (n_batch, max of ``mel_specgram_lengths``)
    """
    # Sort samples by decreasing text length (required by the model's
    # packed-sequence encoder); batch rows follow this order below.
    text_lengths, ids_sorted_decreasing = torch.sort(
        torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True)
    max_input_len = text_lengths[0]

    text_padded = torch.zeros((len(batch), max_input_len), dtype=torch.int64)
    for i in range(len(ids_sorted_decreasing)):
        text = batch[ids_sorted_decreasing[i]][0]
        text_padded[i, :text.size(0)] = text

    # Right zero-pad mel-spec; round the target length up to a multiple of
    # n_frames_per_step so the decoder can emit whole steps.
    num_mels = batch[0][1].size(0)
    max_target_len = max([x[1].size(1) for x in batch])
    if max_target_len % n_frames_per_step != 0:
        max_target_len += n_frames_per_step - max_target_len % n_frames_per_step
        assert max_target_len % n_frames_per_step == 0

    # include mel padded and gate padded
    mel_specgram_padded = torch.zeros((len(batch), num_mels, max_target_len), dtype=torch.float32)
    gate_padded = torch.zeros((len(batch), max_target_len), dtype=torch.float32)
    mel_specgram_lengths = torch.LongTensor(len(batch))
    for i in range(len(ids_sorted_decreasing)):
        mel = batch[ids_sorted_decreasing[i]][1]
        mel_specgram_padded[i, :, :mel.size(1)] = mel
        mel_specgram_lengths[i] = mel.size(1)
        # Gate target is 1 from the last real frame onward (stop token).
        gate_padded[i, mel.size(1) - 1:] = 1

    return text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths, gate_padded
This diff is collapsed.
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import logging
import os
import shutil
from typing import List, Tuple, Callable
import torch
from torch import Tensor
def save_checkpoint(state, is_best, filename):
    r"""Save the model to a temporary file first, then move it to filename,
    in case signals interrupt the torch.save() process.

    Args:
        state: The checkpoint object to serialize (e.g. a dict holding model
            and optimizer state).
        is_best (bool): If True, additionally copy the checkpoint to
            ``best_<filename>`` in the same directory.
        filename (str): Destination path of the checkpoint.
    """
    # Write to a sibling temp file and atomically replace the target, so an
    # interrupted save never leaves a truncated checkpoint at `filename`.
    # (The original saved directly to `filename`, contradicting the docstring.)
    tmp_filename = filename + ".tmp"
    torch.save(state, tmp_filename)
    os.replace(tmp_filename, filename)
    logging.info(f"Checkpoint saved to {filename}")

    if is_best:
        path, best_filename = os.path.split(filename)
        best_filename = os.path.join(path, "best_" + best_filename)
        shutil.copyfile(filename, best_filename)
        logging.info(f"Current best checkpoint saved to {best_filename}")
def pad_sequences(batch: List[Tensor]) -> Tuple[Tensor, Tensor]:
    r"""Right zero-pad all one-hot text sequences to max input length.

    Modified from https://github.com/NVIDIA/DeepLearningExamples.

    Returns the padded (n_batch, max_len) int64 tensor with rows ordered by
    decreasing sequence length, together with the sorted lengths.
    """
    lengths = torch.LongTensor([len(seq) for seq in batch])
    input_lengths, ids_sorted_decreasing = torch.sort(lengths, dim=0, descending=True)

    max_input_len = input_lengths[0]
    text_padded = torch.zeros((len(batch), max_input_len), dtype=torch.int64)
    for row, idx in enumerate(ids_sorted_decreasing):
        seq = batch[idx]
        text_padded[row, :seq.size(0)] = seq

    return text_padded, input_lengths
def prepare_input_sequence(texts: List[str],
                           text_processor: Callable[[str], List[int]]) -> Tuple[Tensor, Tensor]:
    r"""Encode each text with ``text_processor`` and right zero-pad the batch.

    Returns the padded text tensor and the (descending) sequence lengths,
    as produced by :func:`pad_sequences`.
    """
    encoded = [torch.IntTensor(text_processor(text)) for text in texts]
    text_padded, input_lengths = pad_sequences(encoded)
    return text_padded, input_lengths
def get_text_preprocessor(preprocessor_name: str) -> Tuple[List[str], Callable[[str], List[int]]]:
    """Look up the symbol table and text-to-sequence function for a preprocessor.

    Args:
        preprocessor_name (str): The text preprocessor to use.
            Available options: [``'character'``]

    Returns:
        symbols (list of str): The symbols the preprocessor can emit.
        text_preprocessor (callable): Maps a string to a list of integer symbol ids.

    Raises:
        ValueError: If ``preprocessor_name`` is not supported.
    """
    if preprocessor_name == "character":
        from text.text_preprocessing import symbols
        from text.text_preprocessing import text_to_sequence
        text_preprocessor = text_to_sequence
    else:
        # f-string (the original lacked the `f` prefix, so the placeholder
        # was emitted literally instead of the offending name).
        raise ValueError(
            f"The preprocessor_name ({preprocessor_name}) provided is not supported.")
    return symbols, text_preprocessor
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment