OpenDAS / Torchaudio / Commit 9e7aeea1

Add tacotron2 training script (#1642)

Authored Aug 03, 2021 by yangarbiter; committed by GitHub on Aug 03, 2021.
Parent: af652ca6

Showing 4 changed files with 799 additions and 0 deletions:

- examples/pipeline_tacotron2/README.md (+44, -0)
- examples/pipeline_tacotron2/datasets.py (+171, -0)
- examples/pipeline_tacotron2/train.py (+497, -0)
- examples/pipeline_tacotron2/utils.py (+87, -0)
examples/pipeline_tacotron2/README.md
This is an example pipeline for text-to-speech using Tacotron2.

## Install required packages

Required packages:

```bash
pip install librosa tqdm inflect
```

To use tensorboard:

```bash
pip install tensorboard pillow
```

## Training Tacotron2

The training of Tacotron2 can be invoked with the following command.

```bash
python train.py \
    --learning-rate 1e-3 \
    --epochs 1501 \
    --anneal-steps 500 1000 1500 \
    --anneal-factor 0.1 \
    --batch-size 96 \
    --weight-decay 1e-6 \
    --grad-clip 1.0 \
    --text-preprocessor character \
    --logging-dir ./logs \
    --checkpoint-path ./ckpt.pth \
    --dataset-path ./
```
The training script will use all GPUs that are available; please set the
environment variable `CUDA_VISIBLE_DEVICES` if you don't want all GPUs to be used.
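For example, to run the same command on only the first two GPUs (the device indices below are illustrative; adjust them to your machine):

```bash
# Hypothetical restriction to devices 0 and 1; all other flags stay as above.
CUDA_VISIBLE_DEVICES=0,1 python train.py --epochs 1501 --dataset-path ./
```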
The newest checkpoint will be saved to `./ckpt.pth`, and the checkpoint with the best
validation loss will be saved to `./best_ckpt.pth`.
The training log will be saved to `./logs/train.log`, and the tensorboard results will
also be in `./logs`.
If `./ckpt.pth` already exists, this script will automatically load the file and try to
continue training from the checkpoint.
This command takes around 36 hours to train on 8 NVIDIA Tesla V100 GPUs.
examples/pipeline_tacotron2/datasets.py
0 → 100644
```python
# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

from typing import Tuple, Callable, List

import torch
from torch import Tensor
from torch.utils.data.dataset import random_split
from torchaudio.datasets import LJSPEECH


class SpectralNormalization(torch.nn.Module):
    def forward(self, input):
        return torch.log(torch.clamp(input, min=1e-5))


class InverseSpectralNormalization(torch.nn.Module):
    def forward(self, input):
        return torch.exp(input)


class MapMemoryCache(torch.utils.data.Dataset):
    r"""Wrap a dataset so that, whenever a new item is returned, it is saved to memory."""

    def __init__(self, dataset):
        self.dataset = dataset
        self._cache = [None] * len(dataset)

    def __getitem__(self, n):
        if self._cache[n] is not None:
            return self._cache[n]

        item = self.dataset[n]
        self._cache[n] = item
        return item

    def __len__(self):
        return len(self.dataset)


class Processed(torch.utils.data.Dataset):
    def __init__(self, dataset, transforms, text_preprocessor):
        self.dataset = dataset
        self.transforms = transforms
        self.text_preprocessor = text_preprocessor

    def __getitem__(self, key):
        item = self.dataset[key]
        return self.process_datapoint(item)

    def __len__(self):
        return len(self.dataset)

    def process_datapoint(self, item):
        melspec = self.transforms(item[0])
        text_norm = torch.IntTensor(self.text_preprocessor(item[2]))
        return text_norm, torch.squeeze(melspec, 0)


def split_process_dataset(dataset: str,
                          file_path: str,
                          val_ratio: float,
                          transforms: Callable,
                          text_preprocessor: Callable[[str], List[int]],
                          ) -> Tuple[torch.utils.data.Dataset, torch.utils.data.Dataset]:
    """Returns the training and validation datasets.

    Args:
        dataset (str): The dataset to use. Available options: [`'ljspeech'`]
        file_path (str): Path to the data.
        val_ratio (float): The ratio of the data reserved for validation.
        transforms (callable): A function/transform that takes in a waveform and
            returns a transformed waveform (mel spectrogram in this example).
        text_preprocessor (callable): A function that takes in a string and
            returns a list of integers representing each of the symbols in the string.

    Returns:
        train_dataset (`torch.utils.data.Dataset`): The training set.
        val_dataset (`torch.utils.data.Dataset`): The validation set.
    """
    if dataset == 'ljspeech':
        data = LJSPEECH(root=file_path, download=False)

        val_length = int(len(data) * val_ratio)
        lengths = [len(data) - val_length, val_length]
        train_dataset, val_dataset = random_split(data, lengths)
    else:
        raise ValueError(f"Expected datasets: `ljspeech`, but found {dataset}")

    train_dataset = Processed(train_dataset, transforms, text_preprocessor)
    val_dataset = Processed(val_dataset, transforms, text_preprocessor)

    train_dataset = MapMemoryCache(train_dataset)
    val_dataset = MapMemoryCache(val_dataset)

    return train_dataset, val_dataset


def text_mel_collate_fn(batch: Tuple[Tensor, Tensor],
                        n_frames_per_step: int = 1) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
    """The collate function that pads and adjusts the data based on `n_frames_per_step`.

    Modified from https://github.com/NVIDIA/DeepLearningExamples

    Args:
        batch (tuple of two tensors): the first tensor is the mel spectrogram with shape
            (n_batch, n_mels, n_frames), the second tensor is the text with shape (n_batch, ).
        n_frames_per_step (int): The number of frames to advance every step.

    Returns:
        text_padded (Tensor): The input text to Tacotron2 with shape (n_batch, max of ``text_lengths``).
        text_lengths (Tensor): The length of each text with shape (n_batch).
        mel_specgram_padded (Tensor): The target mel spectrogram
            with shape (n_batch, n_mels, max of ``mel_specgram_lengths``).
        mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape (n_batch).
        gate_padded (Tensor): The ground truth gate output
            with shape (n_batch, max of ``mel_specgram_lengths``).
    """
    text_lengths, ids_sorted_decreasing = torch.sort(
        torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True)
    max_input_len = text_lengths[0]

    text_padded = torch.zeros((len(batch), max_input_len), dtype=torch.int64)
    for i in range(len(ids_sorted_decreasing)):
        text = batch[ids_sorted_decreasing[i]][0]
        text_padded[i, :text.size(0)] = text

    # Right zero-pad mel-spec
    num_mels = batch[0][1].size(0)
    max_target_len = max([x[1].size(1) for x in batch])
    if max_target_len % n_frames_per_step != 0:
        max_target_len += n_frames_per_step - max_target_len % n_frames_per_step
        assert max_target_len % n_frames_per_step == 0

    # include mel padded and gate padded
    mel_specgram_padded = torch.zeros((len(batch), num_mels, max_target_len), dtype=torch.float32)
    gate_padded = torch.zeros((len(batch), max_target_len), dtype=torch.float32)
    mel_specgram_lengths = torch.LongTensor(len(batch))
    for i in range(len(ids_sorted_decreasing)):
        mel = batch[ids_sorted_decreasing[i]][1]
        mel_specgram_padded[i, :, :mel.size(1)] = mel
        mel_specgram_lengths[i] = mel.size(1)
        gate_padded[i, mel.size(1) - 1:] = 1

    return text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths, gate_padded
```
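For reference, here is a minimal sketch (not part of the commit) of what `text_mel_collate_fn` does to a toy batch; the tensor sizes are made up for illustration.

```python
import torch

from datasets import text_mel_collate_fn

# Two (text, mel) pairs of different lengths; 80 mel bins as in this pipeline.
batch = [
    (torch.randint(0, 30, (12,)), torch.rand(80, 200)),
    (torch.randint(0, 30, (7,)), torch.rand(80, 150)),
]
text, text_lengths, mel, mel_lengths, gate = text_mel_collate_fn(batch)

# Texts are sorted by length (descending) and right zero-padded, mels are
# zero-padded to the longest frame count, and the gate target is 1 from each
# utterance's last frame onward.
print(text.shape)  # torch.Size([2, 12])
print(mel.shape)   # torch.Size([2, 80, 200])
print(gate.shape)  # torch.Size([2, 200])
```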
examples/pipeline_tacotron2/train.py
0 → 100644
```python
# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

"""
Modified from
https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/train.py
"""

import argparse
from datetime import datetime
from functools import partial
import logging
import random
import os
from time import time

import torch
import torchaudio
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from torch.optim import Adam
from torchaudio.prototype.tacotron2 import Tacotron2
from tqdm import tqdm

import matplotlib.pyplot as plt
plt.switch_backend('agg')

from datasets import text_mel_collate_fn, split_process_dataset, SpectralNormalization
from utils import save_checkpoint, get_text_preprocessor
from loss import Tacotron2Loss

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO,
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(os.path.basename(__file__))


def parse_args(parser):
    """Parse commandline arguments."""

    parser.add_argument("--dataset", default="ljspeech", choices=["ljspeech"], type=str,
                        help="select dataset to train with")
    parser.add_argument('--logging-dir', type=str, default=None,
                        help='directory to save the log files')
    parser.add_argument('--dataset-path', type=str, default='./',
                        help='path to dataset')
    parser.add_argument("--val-ratio", default=0.1, type=float,
                        help="the ratio of waveforms for validation")

    parser.add_argument('--anneal-steps', nargs='*',
                        help='epochs after which decrease learning rate')
    parser.add_argument('--anneal-factor', type=float, choices=[0.1, 0.3], default=0.1,
                        help='factor for annealing learning rate')

    parser.add_argument('--text-preprocessor', default='character', type=str,
                        choices=['character'],
                        help='[string] Select text preprocessor to use.')

    parser.add_argument('--master-addr', default=None, type=str,
                        help='The address to use for distributed training.')
    parser.add_argument('--master-port', default=None, type=str,
                        help='The port to use for distributed training.')

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--epochs', type=int, required=True,
                          help='number of total epochs to run')
    training.add_argument('--checkpoint-path', type=str, default='',
                          help='checkpoint path. If a file exists, '
                               'the program will load it and resume training.')
    training.add_argument('--workers', default=8, type=int,
                          help="number of data loading workers")
    training.add_argument("--validate-and-checkpoint-freq", default=10, type=int, metavar="N",
                          help="validation and saving checkpoint frequency in epochs",)
    training.add_argument("--logging-freq", default=10, type=int, metavar="N",
                          help="logging frequency in epochs")

    optimization = parser.add_argument_group('optimization setup')
    optimization.add_argument('--learning-rate', default=1e-3, type=float,
                              help='initial learning rate')
    optimization.add_argument('--weight-decay', default=1e-6, type=float,
                              help='weight decay')
    optimization.add_argument('--batch-size', default=32, type=int,
                              help='batch size per GPU')
    optimization.add_argument('--grad-clip', default=5.0, type=float,
                              help='clipping gradient with maximum gradient norm value')

    # model parameters
    model = parser.add_argument_group('model parameters')
    model.add_argument('--mask-padding', action='store_true', default=False,
                       help='use mask padding')
    model.add_argument('--symbols-embedding-dim', default=512, type=int,
                       help='input embedding dimension')

    # encoder
    model.add_argument('--encoder-embedding-dim', default=512, type=int,
                       help='encoder embedding dimension')
    model.add_argument('--encoder-n-convolution', default=3, type=int,
                       help='number of encoder convolutions')
    model.add_argument('--encoder-kernel-size', default=5, type=int,
                       help='encoder kernel size')

    # decoder
    model.add_argument('--n-frames-per-step', default=1, type=int,
                       help='number of frames processed per step (currently only 1 is supported)')
    model.add_argument('--decoder-rnn-dim', default=1024, type=int,
                       help='number of units in decoder LSTM')
    model.add_argument('--decoder-dropout', default=0.1, type=float,
                       help='dropout probability for decoder LSTM')
    model.add_argument('--decoder-max-step', default=2000, type=int,
                       help='maximum number of output mel spectrograms')
    model.add_argument('--decoder-no-early-stopping', action='store_true', default=False,
                       help='stop decoding only when all samples are finished')

    # attention model
    model.add_argument('--attention-hidden-dim', default=128, type=int,
                       help='dimension of attention hidden representation')
    model.add_argument('--attention-rnn-dim', default=1024, type=int,
                       help='number of units in attention LSTM')
    model.add_argument('--attention-location-n-filter', default=32, type=int,
                       help='number of filters for location-sensitive attention')
    model.add_argument('--attention-location-kernel-size', default=31, type=int,
                       help='kernel size for location-sensitive attention')
    model.add_argument('--attention-dropout', default=0.1, type=float,
                       help='dropout probability for attention LSTM')

    model.add_argument('--prenet-dim', default=256, type=int,
                       help='number of ReLU units in prenet layers')

    # mel-post processing network parameters
    model.add_argument('--postnet-n-convolution', default=5, type=int,
                       help='number of postnet convolutions')
    model.add_argument('--postnet-kernel-size', default=5, type=int,
                       help='postnet kernel size')
    model.add_argument('--postnet-embedding-dim', default=512, type=int,
                       help='postnet embedding dimension')
    model.add_argument('--gate-threshold', default=0.5, type=float,
                       help='probability threshold for stop token')

    # audio parameters
    audio = parser.add_argument_group('audio parameters')
    audio.add_argument('--sample-rate', default=22050, type=int,
                       help='Sampling rate')
    audio.add_argument('--n-fft', default=1024, type=int,
                       help='Filter length for STFT')
    audio.add_argument('--hop-length', default=256, type=int,
                       help='Hop (stride) length')
    audio.add_argument('--win-length', default=1024, type=int,
                       help='Window length')
    audio.add_argument('--n-mels', default=80, type=int,
                       help='number of mel bins')
    audio.add_argument('--mel-fmin', default=0.0, type=float,
                       help='Minimum mel frequency')
    audio.add_argument('--mel-fmax', default=8000.0, type=float,
                       help='Maximum mel frequency')

    return parser


def adjust_learning_rate(epoch, optimizer, learning_rate,
                         anneal_steps, anneal_factor):
    """Adjust learning rate based on the initial setting."""
    p = 0
    if anneal_steps is not None:
        for _, a_step in enumerate(anneal_steps):
            if epoch >= int(a_step):
                p = p + 1

    if anneal_factor == 0.3:
        lr = learning_rate * ((0.1 ** (p // 2)) * (1.0 if p % 2 == 0 else 0.3))
    else:
        lr = learning_rate * (anneal_factor ** p)

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def to_gpu(x):
    x = x.contiguous()
    if torch.cuda.is_available():
        x = x.cuda(non_blocking=True)
    return x


def batch_to_gpu(batch):
    text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths, gate_padded = batch
    text_padded = to_gpu(text_padded).long()
    text_lengths = to_gpu(text_lengths).long()
    mel_specgram_padded = to_gpu(mel_specgram_padded).float()
    gate_padded = to_gpu(gate_padded).float()
    mel_specgram_lengths = to_gpu(mel_specgram_lengths).long()
    x = (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
    y = (mel_specgram_padded, gate_padded)
    return x, y


def training_step(model, train_batch, batch_idx):
    (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths), y = batch_to_gpu(train_batch)
    y_pred = model(text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
    y[0].requires_grad = False
    y[1].requires_grad = False
    losses = Tacotron2Loss()(y_pred[:3], y)
    return losses[0] + losses[1] + losses[2], losses


def validation_step(model, val_batch, batch_idx):
    (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths), y = batch_to_gpu(val_batch)
    y_pred = model(text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
    losses = Tacotron2Loss()(y_pred[:3], y)
    return losses[0] + losses[1] + losses[2], losses


def reduce_tensor(tensor, world_size):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    if rt.is_floating_point():
        rt = rt / world_size
    else:
        rt = rt // world_size
    return rt


def log_additional_info(writer, model, loader, epoch):
    model.eval()
    data = next(iter(loader))
    with torch.no_grad():
        (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths), _ = batch_to_gpu(data)
        y_pred = model(text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
        mel_out, mel_out_postnet, gate_out, alignment = y_pred

    fig = plt.figure()
    ax = plt.gca()
    ax.imshow(mel_out[0].cpu().numpy())
    writer.add_figure("trn/mel_out", fig, epoch)

    fig = plt.figure()
    ax = plt.gca()
    ax.imshow(mel_out_postnet[0].cpu().numpy())
    writer.add_figure("trn/mel_out_postnet", fig, epoch)

    writer.add_image("trn/gate_out", torch.tile(gate_out[:1], (10, 1)), epoch, dataformats="HW")
    writer.add_image("trn/alignment", alignment[0], epoch, dataformats="HW")


def train(rank, world_size, args):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    if rank == 0 and args.logging_dir:
        if not os.path.isdir(args.logging_dir):
            os.makedirs(args.logging_dir)
        filehandler = logging.FileHandler(os.path.join(args.logging_dir, 'train.log'))
        filehandler.setLevel(logging.INFO)
        logger.addHandler(filehandler)

        writer = SummaryWriter(log_dir=args.logging_dir)
    else:
        writer = None

    torch.manual_seed(0)

    torch.cuda.set_device(rank)

    symbols, text_preprocessor = get_text_preprocessor(args.text_preprocessor)

    model = Tacotron2(
        mask_padding=args.mask_padding,
        n_mels=args.n_mels,
        n_symbol=len(symbols),
        n_frames_per_step=args.n_frames_per_step,
        symbol_embedding_dim=args.symbols_embedding_dim,
        encoder_embedding_dim=args.encoder_embedding_dim,
        encoder_n_convolution=args.encoder_n_convolution,
        encoder_kernel_size=args.encoder_kernel_size,
        decoder_rnn_dim=args.decoder_rnn_dim,
        decoder_max_step=args.decoder_max_step,
        decoder_dropout=args.decoder_dropout,
        decoder_early_stopping=(not args.decoder_no_early_stopping),
        attention_rnn_dim=args.attention_rnn_dim,
        attention_hidden_dim=args.attention_hidden_dim,
        attention_location_n_filter=args.attention_location_n_filter,
        attention_location_kernel_size=args.attention_location_kernel_size,
        attention_dropout=args.attention_dropout,
        prenet_dim=args.prenet_dim,
        postnet_n_convolution=args.postnet_n_convolution,
        postnet_kernel_size=args.postnet_kernel_size,
        postnet_embedding_dim=args.postnet_embedding_dim,
        gate_threshold=args.gate_threshold,
    ).cuda(rank)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])

    optimizer = Adam(model.parameters(), lr=args.learning_rate)

    best_loss = float("inf")
    start_epoch = 0

    if args.checkpoint_path and os.path.isfile(args.checkpoint_path):
        logger.info(f"Checkpoint: loading '{args.checkpoint_path}'")
        map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
        checkpoint = torch.load(args.checkpoint_path, map_location=map_location)

        start_epoch = checkpoint["epoch"]
        best_loss = checkpoint["best_loss"]

        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])

        logger.info(f"Checkpoint: loaded '{args.checkpoint_path}' at epoch {checkpoint['epoch']}")

    transforms = torch.nn.Sequential(
        torchaudio.transforms.MelSpectrogram(
            sample_rate=args.sample_rate,
            n_fft=args.n_fft,
            win_length=args.win_length,
            hop_length=args.hop_length,
            f_min=args.mel_fmin,
            f_max=args.mel_fmax,
            n_mels=args.n_mels,
            mel_scale='slaney',
            normalized=False,
            power=1,
            norm='slaney',
        ),
        SpectralNormalization()
    )
    trainset, valset = split_process_dataset(
        args.dataset, args.dataset_path, args.val_ratio, transforms, text_preprocessor)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainset,
        shuffle=True,
        num_replicas=world_size,
        rank=rank,
    )
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        valset,
        shuffle=False,
        num_replicas=world_size,
        rank=rank,
    )

    loader_params = {
        "batch_size": args.batch_size,
        "num_workers": args.workers,
        "shuffle": False,
        "pin_memory": True,
        "drop_last": False,
        "collate_fn": partial(text_mel_collate_fn, n_frames_per_step=args.n_frames_per_step),
    }

    train_loader = DataLoader(trainset, sampler=train_sampler, **loader_params)
    val_loader = DataLoader(valset, sampler=val_sampler, **loader_params)
    dist.barrier()

    for epoch in range(start_epoch, args.epochs):
        start = time()

        model.train()
        trn_loss, counts = 0, 0

        if rank == 0:
            iterator = tqdm(enumerate(train_loader), desc=f"Epoch {epoch}", total=len(train_loader))
        else:
            iterator = enumerate(train_loader)

        for i, batch in iterator:
            adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor)

            model.zero_grad()

            loss, losses = training_step(model, batch, i)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            optimizer.step()

            if rank == 0 and writer:
                global_iters = epoch * len(train_loader)
                writer.add_scalar("trn/mel_loss", losses[0], global_iters)
                writer.add_scalar("trn/mel_postnet_loss", losses[1], global_iters)
                writer.add_scalar("trn/gate_loss", losses[2], global_iters)

            trn_loss += loss * len(batch[0])
            counts += len(batch[0])

        trn_loss = trn_loss / counts
        trn_loss = reduce_tensor(trn_loss, world_size)
        if rank == 0:
            logger.info(f"[Epoch: {epoch}] time: {time() - start}; trn_loss: {trn_loss}")
            if writer:
                writer.add_scalar("trn_loss", trn_loss, epoch)

        if ((epoch + 1) % args.validate_and_checkpoint_freq == 0) or (epoch == args.epochs - 1):

            val_start_time = time()
            model.eval()
            val_loss, counts = 0, 0
            iterator = tqdm(enumerate(val_loader),
                            desc=f"[Rank: {rank}; Epoch: {epoch}; Eval]",
                            total=len(val_loader))

            with torch.no_grad():
                for val_batch_idx, val_batch in iterator:
                    val_loss = val_loss + validation_step(model, val_batch, val_batch_idx)[0] * len(val_batch[0])
                    counts = counts + len(val_batch[0])
                val_loss = val_loss / counts

            val_loss = reduce_tensor(val_loss, world_size)
            if rank == 0 and writer:
                writer.add_scalar("val_loss", val_loss, epoch)
                log_additional_info(writer, model, val_loader, epoch)

            if rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                logger.info(f"[Rank: {rank}, Epoch: {epoch}; Eval] time: {time() - val_start_time}; "
                            f"val_loss: {val_loss}")
                logger.info(f"[Epoch: {epoch}] Saving checkpoint to {args.checkpoint_path}")
                save_checkpoint(
                    {
                        "epoch": epoch + 1,
                        "state_dict": model.state_dict(),
                        "best_loss": best_loss,
                        "optimizer": optimizer.state_dict(),
                    },
                    is_best,
                    args.checkpoint_path,
                )

    dist.destroy_process_group()


def main(args):
    logger.info("Start time: {}".format(str(datetime.now())))

    torch.manual_seed(0)
    random.seed(0)

    if args.master_addr is not None:
        os.environ['MASTER_ADDR'] = args.master_addr
    elif 'MASTER_ADDR' not in os.environ:
        os.environ['MASTER_ADDR'] = 'localhost'

    if args.master_port is not None:
        os.environ['MASTER_PORT'] = args.master_port
    elif 'MASTER_PORT' not in os.environ:
        os.environ['MASTER_PORT'] = '17778'

    device_counts = torch.cuda.device_count()

    logger.info(f"# available GPUs: {device_counts}")

    # download dataset if not already downloaded
    if args.dataset == 'ljspeech':
        if not os.path.exists(os.path.join(args.dataset_path, 'LJSpeech-1.1')):
            from torchaudio.datasets import LJSPEECH
            LJSPEECH(root=args.dataset_path, download=True)

    if device_counts == 1:
        train(0, 1, args)
    else:
        mp.spawn(train, args=(device_counts, args, ),
                 nprocs=device_counts, join=True)

    logger.info(f"End time: {datetime.now()}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    main(args)
```
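To make the `--anneal-steps`/`--anneal-factor` flags concrete, here is a small standalone sketch (not part of the commit) of the schedule that `adjust_learning_rate` computes for the README's settings; `lr_at` is a hypothetical helper for illustration.

```python
# Sketch of the anneal_factor != 0.3 branch of adjust_learning_rate: the
# learning rate is multiplied by anneal_factor once for every anneal step
# the current epoch has already passed.
def lr_at(epoch, base_lr=1e-3, anneal_steps=(500, 1000, 1500), anneal_factor=0.1):
    p = sum(epoch >= int(step) for step in anneal_steps)
    return base_lr * (anneal_factor ** p)

for epoch in (0, 499, 500, 1000, 1500):
    print(epoch, lr_at(epoch))
# 0 0.001, 499 0.001, 500 0.0001, 1000 1e-05, 1500 1e-06
```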
examples/pipeline_tacotron2/utils.py
0 → 100644
```python
# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

import logging
import os
import shutil
from typing import List, Tuple, Callable

import torch
from torch import Tensor


def save_checkpoint(state, is_best, filename):
    r"""Save the model to a temporary file first, then copy it to filename,
    in case signals interrupt the torch.save() process.
    """
    torch.save(state, filename)
    logging.info(f"Checkpoint saved to {filename}")

    if is_best:
        path, best_filename = os.path.split(filename)
        best_filename = os.path.join(path, "best_" + best_filename)
        shutil.copyfile(filename, best_filename)
        logging.info(f"Current best checkpoint saved to {best_filename}")


def pad_sequences(batch: List[Tensor]) -> Tuple[Tensor, Tensor]:
    r"""Right zero-pad all one-hot text sequences to max input length.

    Modified from https://github.com/NVIDIA/DeepLearningExamples.
    """
    input_lengths, ids_sorted_decreasing = torch.sort(
        torch.LongTensor([len(x) for x in batch]), dim=0, descending=True)
    max_input_len = input_lengths[0]

    text_padded = torch.LongTensor(len(batch), max_input_len)
    text_padded.zero_()
    for i in range(len(ids_sorted_decreasing)):
        text = batch[ids_sorted_decreasing[i]]
        text_padded[i, :text.size(0)] = text

    return text_padded, input_lengths


def prepare_input_sequence(texts: List[str],
                           text_processor: Callable[[str], List[int]]) -> Tuple[Tensor, Tensor]:
    d = []
    for text in texts:
        d.append(torch.IntTensor(text_processor(text)[:]))

    text_padded, input_lengths = pad_sequences(d)
    return text_padded, input_lengths


def get_text_preprocessor(preprocessor_name: str) -> Tuple[List[str], Callable[[str], List[int]]]:
    if preprocessor_name == "character":
        from text.text_preprocessing import symbols
        from text.text_preprocessing import text_to_sequence
        text_preprocessor = text_to_sequence
    else:
        raise ValueError(f"The preprocessor_name ({preprocessor_name}) provided is not supported.")

    return symbols, text_preprocessor
```
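For reference, a minimal sketch (not part of the commit) of `pad_sequences` on toy encoded texts; the integer values are made up for illustration.

```python
import torch

from utils import pad_sequences

# Two encoded texts of different lengths.
encoded = [torch.IntTensor([5, 2, 9]), torch.IntTensor([7, 1, 4, 4, 8])]
text_padded, input_lengths = pad_sequences(encoded)

# Sequences come back sorted by length (descending), right zero-padded:
print(text_padded)    # tensor([[7, 1, 4, 4, 8],
                      #         [5, 2, 9, 0, 0]])
print(input_lengths)  # tensor([5, 3])
```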