OpenDAS / Torchaudio / Commit 9e7aeea1

Add tacotron2 training script (#1642)

Authored Aug 03, 2021 by yangarbiter; committed by GitHub on Aug 03, 2021.
Parent: af652ca6

Showing 4 changed files with 799 additions and 0 deletions:

- examples/pipeline_tacotron2/README.md (+44, -0)
- examples/pipeline_tacotron2/datasets.py (+171, -0)
- examples/pipeline_tacotron2/train.py (+497, -0)
- examples/pipeline_tacotron2/utils.py (+87, -0)
examples/pipeline_tacotron2/README.md
This is an example pipeline for text-to-speech using Tacotron2.

## Install required packages

Required packages:

```bash
pip install librosa tqdm inflect
```

To use tensorboard:

```bash
pip install tensorboard pillow
```

## Training Tacotron2

The training of Tacotron2 can be invoked with the following command.

```bash
python train.py \
    --learning-rate 1e-3 \
    --epochs 1501 \
    --anneal-steps 500 1000 1500 \
    --anneal-factor 0.1 \
    --batch-size 96 \
    --weight-decay 1e-6 \
    --grad-clip 1.0 \
    --text-preprocessor character \
    --logging-dir ./logs \
    --checkpoint-path ./ckpt.pth \
    --dataset-path ./
```
The training script will use all GPUs that are available; please set the
environment variable `CUDA_VISIBLE_DEVICES` if you don't want all GPUs to be used.
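For example, to run the same command on only the first two GPUs (the device indices below are illustrative; adjust them to your machine):

```bash
# Hypothetical restriction to devices 0 and 1; all other flags stay as above.
CUDA_VISIBLE_DEVICES=0,1 python train.py --epochs 1501 --dataset-path ./
```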
The newest checkpoint will be saved to `./ckpt.pth`, and the checkpoint with the best
validation loss will be saved to `./best_ckpt.pth`.
The training log will be saved to `./logs/train.log`, and the tensorboard results will
also be in `./logs`.
If `./ckpt.pth` already exists, this script will automatically load the file and try to
continue training from the checkpoint.
This command takes around 36 hours to train on 8 NVIDIA Tesla V100 GPUs.
examples/pipeline_tacotron2/datasets.py
0 → 100644
```python
# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

from typing import Tuple, Callable, List

import torch
from torch import Tensor
from torch.utils.data.dataset import random_split
from torchaudio.datasets import LJSPEECH


class SpectralNormalization(torch.nn.Module):
    def forward(self, input):
        return torch.log(torch.clamp(input, min=1e-5))


class InverseSpectralNormalization(torch.nn.Module):
    def forward(self, input):
        return torch.exp(input)


class MapMemoryCache(torch.utils.data.Dataset):
    r"""Wrap a dataset so that, whenever a new item is returned, it is saved to memory."""

    def __init__(self, dataset):
        self.dataset = dataset
        self._cache = [None] * len(dataset)

    def __getitem__(self, n):
        if self._cache[n] is not None:
            return self._cache[n]

        item = self.dataset[n]
        self._cache[n] = item
        return item

    def __len__(self):
        return len(self.dataset)


class Processed(torch.utils.data.Dataset):
    def __init__(self, dataset, transforms, text_preprocessor):
        self.dataset = dataset
        self.transforms = transforms
        self.text_preprocessor = text_preprocessor

    def __getitem__(self, key):
        item = self.dataset[key]
        return self.process_datapoint(item)

    def __len__(self):
        return len(self.dataset)

    def process_datapoint(self, item):
        melspec = self.transforms(item[0])
        text_norm = torch.IntTensor(self.text_preprocessor(item[2]))
        return text_norm, torch.squeeze(melspec, 0)


def split_process_dataset(dataset: str,
                          file_path: str,
                          val_ratio: float,
                          transforms: Callable,
                          text_preprocessor: Callable[[str], List[int]],
                          ) -> Tuple[torch.utils.data.Dataset, torch.utils.data.Dataset]:
    """Returns the training and validation datasets.

    Args:
        dataset (str): The dataset to use. Available options: [`'ljspeech'`]
        file_path (str): Path to the data.
        val_ratio (float): The ratio of the data reserved for validation.
        transforms (callable): A function/transform that takes in a waveform and
            returns a transformed waveform (mel spectrogram in this example).
        text_preprocessor (callable): A function that takes in a string and
            returns a list of integers representing each of the symbols in the string.

    Returns:
        train_dataset (`torch.utils.data.Dataset`): The training set.
        val_dataset (`torch.utils.data.Dataset`): The validation set.
    """
    if dataset == 'ljspeech':
        data = LJSPEECH(root=file_path, download=False)

        val_length = int(len(data) * val_ratio)
        lengths = [len(data) - val_length, val_length]
        train_dataset, val_dataset = random_split(data, lengths)
    else:
        raise ValueError(f"Expected datasets: `ljspeech`, but found {dataset}")

    train_dataset = Processed(train_dataset, transforms, text_preprocessor)
    val_dataset = Processed(val_dataset, transforms, text_preprocessor)

    train_dataset = MapMemoryCache(train_dataset)
    val_dataset = MapMemoryCache(val_dataset)

    return train_dataset, val_dataset


def text_mel_collate_fn(batch: Tuple[Tensor, Tensor],
                        n_frames_per_step: int = 1) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
    """The collate function that pads and adjusts the data based on `n_frames_per_step`.

    Modified from https://github.com/NVIDIA/DeepLearningExamples

    Args:
        batch (tuple of two tensors): the first tensor is the mel spectrogram with shape
            (n_batch, n_mels, n_frames), the second tensor is the text with shape (n_batch, ).
        n_frames_per_step (int): The number of frames to advance every step.

    Returns:
        text_padded (Tensor): The input text to Tacotron2 with shape (n_batch, max of ``text_lengths``).
        text_lengths (Tensor): The length of each text with shape (n_batch).
        mel_specgram_padded (Tensor): The target mel spectrogram
            with shape (n_batch, n_mels, max of ``mel_specgram_lengths``).
        mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape (n_batch).
        gate_padded (Tensor): The ground truth gate output
            with shape (n_batch, max of ``mel_specgram_lengths``).
    """
    text_lengths, ids_sorted_decreasing = torch.sort(
        torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True)
    max_input_len = text_lengths[0]

    text_padded = torch.zeros((len(batch), max_input_len), dtype=torch.int64)
    for i in range(len(ids_sorted_decreasing)):
        text = batch[ids_sorted_decreasing[i]][0]
        text_padded[i, :text.size(0)] = text

    # Right zero-pad mel-spec
    num_mels = batch[0][1].size(0)
    max_target_len = max([x[1].size(1) for x in batch])
    if max_target_len % n_frames_per_step != 0:
        max_target_len += n_frames_per_step - max_target_len % n_frames_per_step
        assert max_target_len % n_frames_per_step == 0

    # include mel padded and gate padded
    mel_specgram_padded = torch.zeros((len(batch), num_mels, max_target_len), dtype=torch.float32)
    gate_padded = torch.zeros((len(batch), max_target_len), dtype=torch.float32)
    mel_specgram_lengths = torch.LongTensor(len(batch))
    for i in range(len(ids_sorted_decreasing)):
        mel = batch[ids_sorted_decreasing[i]][1]
        mel_specgram_padded[i, :, :mel.size(1)] = mel
        mel_specgram_lengths[i] = mel.size(1)
        gate_padded[i, mel.size(1) - 1:] = 1

    return text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths, gate_padded
```
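For reference, here is a minimal sketch (not part of the commit) of what `text_mel_collate_fn` does to a toy batch; the tensor sizes are made up for illustration.

```python
import torch

from datasets import text_mel_collate_fn

# Two (text, mel) pairs of different lengths; 80 mel bins as in this pipeline.
batch = [
    (torch.randint(0, 30, (12,)), torch.rand(80, 200)),
    (torch.randint(0, 30, (7,)), torch.rand(80, 150)),
]
text, text_lengths, mel, mel_lengths, gate = text_mel_collate_fn(batch)

# Texts are sorted by length (descending) and right zero-padded, mels are
# zero-padded to the longest frame count, and the gate target is 1 from each
# utterance's last frame onward.
print(text.shape)  # torch.Size([2, 12])
print(mel.shape)   # torch.Size([2, 80, 200])
print(gate.shape)  # torch.Size([2, 200])
```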
examples/pipeline_tacotron2/train.py
0 → 100644
```python
# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

"""
Modified from
https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/train.py
"""

import argparse
from datetime import datetime
from functools import partial
import logging
import random
import os
from time import time

import torch
import torchaudio
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from torch.optim import Adam
from torchaudio.prototype.tacotron2 import Tacotron2
from tqdm import tqdm

import matplotlib.pyplot as plt
plt.switch_backend('agg')

from datasets import text_mel_collate_fn, split_process_dataset, SpectralNormalization
from utils import save_checkpoint, get_text_preprocessor
from loss import Tacotron2Loss

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO,
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(os.path.basename(__file__))


def parse_args(parser):
    """Parse commandline arguments."""

    parser.add_argument("--dataset", default="ljspeech", choices=["ljspeech"], type=str,
                        help="select dataset to train with")
    parser.add_argument('--logging-dir', type=str, default=None,
                        help='directory to save the log files')
    parser.add_argument('--dataset-path', type=str, default='./',
                        help='path to dataset')
    parser.add_argument("--val-ratio", default=0.1, type=float,
                        help="the ratio of waveforms for validation")

    parser.add_argument('--anneal-steps', nargs='*',
                        help='epochs after which decrease learning rate')
    parser.add_argument('--anneal-factor', type=float, choices=[0.1, 0.3], default=0.1,
                        help='factor for annealing learning rate')

    parser.add_argument('--text-preprocessor', default='character', type=str,
                        choices=['character'],
                        help='[string] Select text preprocessor to use.')

    parser.add_argument('--master-addr', default=None, type=str,
                        help='The address to use for distributed training.')
    parser.add_argument('--master-port', default=None, type=str,
                        help='The port to use for distributed training.')

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--epochs', type=int, required=True,
                          help='number of total epochs to run')
    training.add_argument('--checkpoint-path', type=str, default='',
                          help='checkpoint path. If a file exists, '
                               'the program will load it and resume training.')
    training.add_argument('--workers', default=8, type=int,
                          help="number of data loading workers")
    training.add_argument("--validate-and-checkpoint-freq", default=10, type=int, metavar="N",
                          help="validation and saving checkpoint frequency in epochs",)
    training.add_argument("--logging-freq", default=10, type=int, metavar="N",
                          help="logging frequency in epochs")

    optimization = parser.add_argument_group('optimization setup')
    optimization.add_argument('--learning-rate', default=1e-3, type=float,
                              help='initial learning rate')
    optimization.add_argument('--weight-decay', default=1e-6, type=float,
                              help='weight decay')
    optimization.add_argument('--batch-size', default=32, type=int,
                              help='batch size per GPU')
    optimization.add_argument('--grad-clip', default=5.0, type=float,
                              help='clipping gradient with maximum gradient norm value')

    # model parameters
    model = parser.add_argument_group('model parameters')
    model.add_argument('--mask-padding', action='store_true', default=False,
                       help='use mask padding')
    model.add_argument('--symbols-embedding-dim', default=512, type=int,
                       help='input embedding dimension')

    # encoder
    model.add_argument('--encoder-embedding-dim', default=512, type=int,
                       help='encoder embedding dimension')
    model.add_argument('--encoder-n-convolution', default=3, type=int,
                       help='number of encoder convolutions')
    model.add_argument('--encoder-kernel-size', default=5, type=int,
                       help='encoder kernel size')

    # decoder
    model.add_argument('--n-frames-per-step', default=1, type=int,
                       help='number of frames processed per step (currently only 1 is supported)')
    model.add_argument('--decoder-rnn-dim', default=1024, type=int,
                       help='number of units in decoder LSTM')
    model.add_argument('--decoder-dropout', default=0.1, type=float,
                       help='dropout probability for decoder LSTM')
    model.add_argument('--decoder-max-step', default=2000, type=int,
                       help='maximum number of output mel spectrograms')
    model.add_argument('--decoder-no-early-stopping', action='store_true', default=False,
                       help='stop decoding only when all samples are finished')

    # attention model
    model.add_argument('--attention-hidden-dim', default=128, type=int,
                       help='dimension of attention hidden representation')
    model.add_argument('--attention-rnn-dim', default=1024, type=int,
                       help='number of units in attention LSTM')
    model.add_argument('--attention-location-n-filter', default=32, type=int,
                       help='number of filters for location-sensitive attention')
    model.add_argument('--attention-location-kernel-size', default=31, type=int,
                       help='kernel size for location-sensitive attention')
    model.add_argument('--attention-dropout', default=0.1, type=float,
                       help='dropout probability for attention LSTM')

    model.add_argument('--prenet-dim', default=256, type=int,
                       help='number of ReLU units in prenet layers')

    # mel-post processing network parameters
    model.add_argument('--postnet-n-convolution', default=5, type=int,
                       help='number of postnet convolutions')
    model.add_argument('--postnet-kernel-size', default=5, type=int,
                       help='postnet kernel size')
    model.add_argument('--postnet-embedding-dim', default=512, type=int,
                       help='postnet embedding dimension')
    model.add_argument('--gate-threshold', default=0.5, type=float,
                       help='probability threshold for stop token')

    # audio parameters
    audio = parser.add_argument_group('audio parameters')
    audio.add_argument('--sample-rate', default=22050, type=int,
                       help='Sampling rate')
    audio.add_argument('--n-fft', default=1024, type=int,
                       help='Filter length for STFT')
    audio.add_argument('--hop-length', default=256, type=int,
                       help='Hop (stride) length')
    audio.add_argument('--win-length', default=1024, type=int,
                       help='Window length')
    audio.add_argument('--n-mels', default=80, type=int,
                       help='number of mel bins')
    audio.add_argument('--mel-fmin', default=0.0, type=float,
                       help='Minimum mel frequency')
    audio.add_argument('--mel-fmax', default=8000.0, type=float,
                       help='Maximum mel frequency')

    return parser


def adjust_learning_rate(epoch, optimizer, learning_rate,
                         anneal_steps, anneal_factor):
    """Adjust learning rate based on the initial setting."""
    p = 0
    if anneal_steps is not None:
        for _, a_step in enumerate(anneal_steps):
            if epoch >= int(a_step):
                p = p + 1

    if anneal_factor == 0.3:
        lr = learning_rate * ((0.1 ** (p // 2)) * (1.0 if p % 2 == 0 else 0.3))
    else:
        lr = learning_rate * (anneal_factor ** p)

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def to_gpu(x):
    x = x.contiguous()
    if torch.cuda.is_available():
        x = x.cuda(non_blocking=True)
    return x


def batch_to_gpu(batch):
    text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths, gate_padded = batch
    text_padded = to_gpu(text_padded).long()
    text_lengths = to_gpu(text_lengths).long()
    mel_specgram_padded = to_gpu(mel_specgram_padded).float()
    gate_padded = to_gpu(gate_padded).float()
    mel_specgram_lengths = to_gpu(mel_specgram_lengths).long()
    x = (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
    y = (mel_specgram_padded, gate_padded)
    return x, y


def training_step(model, train_batch, batch_idx):
    (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths), y = batch_to_gpu(train_batch)
    y_pred = model(text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
    y[0].requires_grad = False
    y[1].requires_grad = False
    losses = Tacotron2Loss()(y_pred[:3], y)
    return losses[0] + losses[1] + losses[2], losses


def validation_step(model, val_batch, batch_idx):
    (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths), y = batch_to_gpu(val_batch)
    y_pred = model(text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
    losses = Tacotron2Loss()(y_pred[:3], y)
    return losses[0] + losses[1] + losses[2], losses


def reduce_tensor(tensor, world_size):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    if rt.is_floating_point():
        rt = rt / world_size
    else:
        rt = rt // world_size
    return rt


def log_additional_info(writer, model, loader, epoch):
    model.eval()
    data = next(iter(loader))
    with torch.no_grad():
        (text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths), _ = batch_to_gpu(data)
        y_pred = model(text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths)
        mel_out, mel_out_postnet, gate_out, alignment = y_pred

    fig = plt.figure()
    ax = plt.gca()
    ax.imshow(mel_out[0].cpu().numpy())
    writer.add_figure("trn/mel_out", fig, epoch)

    fig = plt.figure()
    ax = plt.gca()
    ax.imshow(mel_out_postnet[0].cpu().numpy())
    writer.add_figure("trn/mel_out_postnet", fig, epoch)

    writer.add_image("trn/gate_out", torch.tile(gate_out[:1], (10, 1)), epoch, dataformats="HW")
    writer.add_image("trn/alignment", alignment[0], epoch, dataformats="HW")


def train(rank, world_size, args):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    if rank == 0 and args.logging_dir:
        if not os.path.isdir(args.logging_dir):
            os.makedirs(args.logging_dir)
        filehandler = logging.FileHandler(os.path.join(args.logging_dir, 'train.log'))
        filehandler.setLevel(logging.INFO)
        logger.addHandler(filehandler)

        writer = SummaryWriter(log_dir=args.logging_dir)
    else:
        writer = None

    torch.manual_seed(0)

    torch.cuda.set_device(rank)

    symbols, text_preprocessor = get_text_preprocessor(args.text_preprocessor)

    model = Tacotron2(
        mask_padding=args.mask_padding,
        n_mels=args.n_mels,
        n_symbol=len(symbols),
        n_frames_per_step=args.n_frames_per_step,
        symbol_embedding_dim=args.symbols_embedding_dim,
        encoder_embedding_dim=args.encoder_embedding_dim,
        encoder_n_convolution=args.encoder_n_convolution,
        encoder_kernel_size=args.encoder_kernel_size,
        decoder_rnn_dim=args.decoder_rnn_dim,
        decoder_max_step=args.decoder_max_step,
        decoder_dropout=args.decoder_dropout,
        decoder_early_stopping=(not args.decoder_no_early_stopping),
        attention_rnn_dim=args.attention_rnn_dim,
        attention_hidden_dim=args.attention_hidden_dim,
        attention_location_n_filter=args.attention_location_n_filter,
        attention_location_kernel_size=args.attention_location_kernel_size,
        attention_dropout=args.attention_dropout,
        prenet_dim=args.prenet_dim,
        postnet_n_convolution=args.postnet_n_convolution,
        postnet_kernel_size=args.postnet_kernel_size,
        postnet_embedding_dim=args.postnet_embedding_dim,
        gate_threshold=args.gate_threshold,
    ).cuda(rank)
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])

    optimizer = Adam(model.parameters(), lr=args.learning_rate)

    best_loss = float("inf")
    start_epoch = 0

    if args.checkpoint_path and os.path.isfile(args.checkpoint_path):
        logger.info(f"Checkpoint: loading '{args.checkpoint_path}'")
        map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
        checkpoint = torch.load(args.checkpoint_path, map_location=map_location)

        start_epoch = checkpoint["epoch"]
        best_loss = checkpoint["best_loss"]

        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])

        logger.info(f"Checkpoint: loaded '{args.checkpoint_path}' at epoch {checkpoint['epoch']}")

    transforms = torch.nn.Sequential(
        torchaudio.transforms.MelSpectrogram(
            sample_rate=args.sample_rate,
            n_fft=args.n_fft,
            win_length=args.win_length,
            hop_length=args.hop_length,
            f_min=args.mel_fmin,
            f_max=args.mel_fmax,
            n_mels=args.n_mels,
            mel_scale='slaney',
            normalized=False,
            power=1,
            norm='slaney',
        ),
        SpectralNormalization()
    )
    trainset, valset = split_process_dataset(
        args.dataset, args.dataset_path, args.val_ratio, transforms, text_preprocessor)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainset,
        shuffle=True,
        num_replicas=world_size,
        rank=rank,
    )
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        valset,
        shuffle=False,
        num_replicas=world_size,
        rank=rank,
    )

    loader_params = {
        "batch_size": args.batch_size,
        "num_workers": args.workers,
        "shuffle": False,
        "pin_memory": True,
        "drop_last": False,
        "collate_fn": partial(text_mel_collate_fn, n_frames_per_step=args.n_frames_per_step),
    }

    train_loader = DataLoader(trainset, sampler=train_sampler, **loader_params)
    val_loader = DataLoader(valset, sampler=val_sampler, **loader_params)
    dist.barrier()

    for epoch in range(start_epoch, args.epochs):
        start = time()

        model.train()
        trn_loss, counts = 0, 0

        if rank == 0:
            iterator = tqdm(enumerate(train_loader), desc=f"Epoch {epoch}", total=len(train_loader))
        else:
            iterator = enumerate(train_loader)

        for i, batch in iterator:
            adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor)

            model.zero_grad()

            loss, losses = training_step(model, batch, i)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            optimizer.step()

            if rank == 0 and writer:
                global_iters = epoch * len(train_loader)
                writer.add_scalar("trn/mel_loss", losses[0], global_iters)
                writer.add_scalar("trn/mel_postnet_loss", losses[1], global_iters)
                writer.add_scalar("trn/gate_loss", losses[2], global_iters)

            trn_loss += loss * len(batch[0])
            counts += len(batch[0])

        trn_loss = trn_loss / counts
        trn_loss = reduce_tensor(trn_loss, world_size)
        if rank == 0:
            logger.info(f"[Epoch: {epoch}] time: {time() - start}; trn_loss: {trn_loss}")
            if writer:
                writer.add_scalar("trn_loss", trn_loss, epoch)

        if ((epoch + 1) % args.validate_and_checkpoint_freq == 0) or (epoch == args.epochs - 1):

            val_start_time = time()
            model.eval()
            val_loss, counts = 0, 0
            iterator = tqdm(enumerate(val_loader),
                            desc=f"[Rank: {rank}; Epoch: {epoch}; Eval]",
                            total=len(val_loader))

            with torch.no_grad():
                for val_batch_idx, val_batch in iterator:
                    val_loss = val_loss + validation_step(model, val_batch, val_batch_idx)[0] * len(val_batch[0])
                    counts = counts + len(val_batch[0])
                val_loss = val_loss / counts

            val_loss = reduce_tensor(val_loss, world_size)
            if rank == 0 and writer:
                writer.add_scalar("val_loss", val_loss, epoch)
                log_additional_info(writer, model, val_loader, epoch)

            if rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                logger.info(f"[Rank: {rank}, Epoch: {epoch}; Eval] time: {time() - val_start_time}; "
                            f"val_loss: {val_loss}")
                logger.info(f"[Epoch: {epoch}] Saving checkpoint to {args.checkpoint_path}")
                save_checkpoint(
                    {
                        "epoch": epoch + 1,
                        "state_dict": model.state_dict(),
                        "best_loss": best_loss,
                        "optimizer": optimizer.state_dict(),
                    },
                    is_best,
                    args.checkpoint_path,
                )

    dist.destroy_process_group()


def main(args):
    logger.info("Start time: {}".format(str(datetime.now())))

    torch.manual_seed(0)
    random.seed(0)

    if args.master_addr is not None:
        os.environ['MASTER_ADDR'] = args.master_addr
    elif 'MASTER_ADDR' not in os.environ:
        os.environ['MASTER_ADDR'] = 'localhost'

    if args.master_port is not None:
        os.environ['MASTER_PORT'] = args.master_port
    elif 'MASTER_PORT' not in os.environ:
        os.environ['MASTER_PORT'] = '17778'

    device_counts = torch.cuda.device_count()

    logger.info(f"# available GPUs: {device_counts}")

    # download dataset if not already downloaded
    if args.dataset == 'ljspeech':
        if not os.path.exists(os.path.join(args.dataset_path, 'LJSpeech-1.1')):
            from torchaudio.datasets import LJSPEECH
            LJSPEECH(root=args.dataset_path, download=True)

    if device_counts == 1:
        train(0, 1, args)
    else:
        mp.spawn(train, args=(device_counts, args, ),
                 nprocs=device_counts, join=True)

    logger.info(f"End time: {datetime.now()}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    main(args)
```
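To make the `--anneal-steps`/`--anneal-factor` flags concrete, here is a small standalone sketch (not part of the commit) of the schedule that `adjust_learning_rate` computes for the README's settings; `lr_at` is a hypothetical helper for illustration.

```python
# Sketch of the anneal_factor != 0.3 branch of adjust_learning_rate: the
# learning rate is multiplied by anneal_factor once for every anneal step
# the current epoch has already passed.
def lr_at(epoch, base_lr=1e-3, anneal_steps=(500, 1000, 1500), anneal_factor=0.1):
    p = sum(epoch >= int(step) for step in anneal_steps)
    return base_lr * (anneal_factor ** p)

for epoch in (0, 499, 500, 1000, 1500):
    print(epoch, lr_at(epoch))
# 0 0.001, 499 0.001, 500 0.0001, 1000 1e-05, 1500 1e-06
```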
examples/pipeline_tacotron2/utils.py
0 → 100644
```python
# *****************************************************************************
#  Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#      * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#      * Redistributions in binary form must reproduce the above copyright
#        notice, this list of conditions and the following disclaimer in the
#        documentation and/or other materials provided with the distribution.
#      * Neither the name of the NVIDIA CORPORATION nor the
#        names of its contributors may be used to endorse or promote products
#        derived from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

import logging
import os
import shutil
from typing import List, Tuple, Callable

import torch
from torch import Tensor


def save_checkpoint(state, is_best, filename):
    r"""Save the model to a temporary file first, then copy it to filename,
    in case signals interrupt the torch.save() process.
    """
    torch.save(state, filename)
    logging.info(f"Checkpoint saved to {filename}")

    if is_best:
        path, best_filename = os.path.split(filename)
        best_filename = os.path.join(path, "best_" + best_filename)
        shutil.copyfile(filename, best_filename)
        logging.info(f"Current best checkpoint saved to {best_filename}")


def pad_sequences(batch: List[Tensor]) -> Tuple[Tensor, Tensor]:
    r"""Right zero-pad all one-hot text sequences to max input length.

    Modified from https://github.com/NVIDIA/DeepLearningExamples.
    """
    input_lengths, ids_sorted_decreasing = torch.sort(
        torch.LongTensor([len(x) for x in batch]), dim=0, descending=True)
    max_input_len = input_lengths[0]

    text_padded = torch.LongTensor(len(batch), max_input_len)
    text_padded.zero_()
    for i in range(len(ids_sorted_decreasing)):
        text = batch[ids_sorted_decreasing[i]]
        text_padded[i, :text.size(0)] = text

    return text_padded, input_lengths


def prepare_input_sequence(texts: List[str],
                           text_processor: Callable[[str], List[int]]) -> Tuple[Tensor, Tensor]:
    d = []
    for text in texts:
        d.append(torch.IntTensor(text_processor(text)[:]))

    text_padded, input_lengths = pad_sequences(d)
    return text_padded, input_lengths


def get_text_preprocessor(preprocessor_name: str) -> Tuple[List[str], Callable[[str], List[int]]]:
    if preprocessor_name == "character":
        from text.text_preprocessing import symbols
        from text.text_preprocessing import text_to_sequence
        text_preprocessor = text_to_sequence
    else:
        raise ValueError(f"The preprocessor_name ({preprocessor_name}) provided is not supported.")

    return symbols, text_preprocessor
```
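For reference, a minimal sketch (not part of the commit) of `pad_sequences` on toy encoded texts; the integer values are made up for illustration.

```python
import torch

from utils import pad_sequences

# Two encoded texts of different lengths.
encoded = [torch.IntTensor([5, 2, 9]), torch.IntTensor([7, 1, 4, 4, 8])]
text_padded, input_lengths = pad_sequences(encoded)

# Sequences come back sorted by length (descending), right zero-padded:
print(text_padded)    # tensor([[7, 1, 4, 4, 8],
                      #         [5, 2, 9, 0, 0]])
print(input_lengths)  # tensor([5, 3])
```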