[submodule "tacotron2"]
path = tacotron2
url = https://gh.api.99988866.xyz/http://github.com/NVIDIA/tacotron2
BSD 3-Clause License
Copyright (c) 2018, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# WaveGlow
## Code Download
```bash
git clone http://developer.hpccube.com/codes/xinghao/waveglow.git
cd waveglow
git submodule init
git submodule update
```
## Environment Setup
### Docker
```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# -v mounts the working directory into the container
docker run -it --shm-size 80g --network=host --name=waveglow --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /path/to/host/workdir/:/path/to/container/workdir/ image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10 /bin/bash
```
### Install dependencies missing from the Docker image
```bash
# Install the dependencies listed in requirements.txt
pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# Install dependencies not covered by requirements.txt
pip install apex -i https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip setuptools wheel
```
## Data Preparation
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar -xvjf LJSpeech-1.1.tar.bz2
ls LJSpeech-1.1/wavs/*.wav | tail -n+11 > train_files.txt
ls LJSpeech-1.1/wavs/*.wav | head -n10 > test_files.txt
```
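Optionally, sanity-check the split with a short Python sketch (a hypothetical helper, assuming the commands above were run in the current directory):
```python
from scipy.io.wavfile import read

def load_list(path):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

train, test = load_list("train_files.txt"), load_list("test_files.txt")
assert not set(train) & set(test), "train/test overlap"

# The config expects 22050 Hz audio; spot-check the first test file.
rate, _ = read(test[0])
print(len(train), "training files,", len(test), "test files at", rate, "Hz")
```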
## Generate Speech with a Pretrained Model
1. Download the pretrained model
```bash
https://drive.google.com/file/d/1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF/view
```
2. Download the mel-spectrograms
```
https://drive.google.com/file/d/1g_VXK2lpP9J25dQFhQwx7doWl_p20fXA/view 
```
3. Generate audio
```bash
python3 inference.py -f <(ls mel_spectrograms/*.pt) -w waveglow_256channels.pt -o . --is_fp16 -s 0.6
```
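If your shell does not support process substitution (`<(...)`), you can write a plain file list first; a small equivalent sketch (the `mel_spectrograms/` directory name is assumed to be where the downloaded tensors were extracted):
```python
import glob

# Collect the downloaded mel tensors into a list file accepted by
# inference.py's -f/--filelist_path argument (one path per line).
with open("mel_files.txt", "w") as f:
    for path in sorted(glob.glob("mel_spectrograms/*.pt")):
        f.write(path + "\n")
```
Then pass `-f mel_files.txt` to `inference.py` instead of the process substitution.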
## Model Training
### Single DCU
```bash
mkdir checkpoints
python train.py -c config.json
```
### Multiple DCUs
```bash
mkdir checkpoints
python distributed.py -c config.json
```
Set `"fp16_run"` to `true` in `config.json` to enable mixed-precision training.
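For example, the flag can be flipped programmatically (a minimal sketch that simply rewrites `config.json` in place):
```python
import json

# Enable mixed-precision training by setting train_config.fp16_run to true.
with open("config.json") as f:
    config = json.load(f)
config["train_config"]["fp16_run"] = True
with open("config.json", "w") as f:
    json.dump(config, f, indent=4)
```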
## Model Inference
### Generate mel-spectrograms for the test set
```bash
python mel2samp.py -f test_files.txt -o . -c config.json
```
### Run inference with your own trained model
```bash
ls *.pt > mel_files.txt
python3 inference.py -f mel_files.txt -w checkpoints/waveglow_10000 -o . --is_fp16 -s 0.6
```
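For reference, the command above boils down to the following condensed sketch of what `inference.py` does (fp16 and denoising omitted; the checkpoint name and sigma value are the ones used above):
```python
import torch
from scipy.io.wavfile import write
from mel2samp import files_to_list, MAX_WAV_VALUE

# Load the trained checkpoint, strip weight norm, and synthesize one wav per mel file.
waveglow = torch.load("checkpoints/waveglow_10000")["model"]
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.cuda().eval()

for path in files_to_list("mel_files.txt"):
    mel = torch.load(path).unsqueeze(0).cuda()   # add batch dimension
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.6) * MAX_WAV_VALUE
    write(path + "_synthesis.wav", 22050, audio.squeeze().cpu().numpy().astype("int16"))
```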
{
    "train_config": {
        "fp16_run": true,
        "output_directory": "checkpoints",
        "epochs": 100000,
        "learning_rate": 1e-4,
        "sigma": 1.0,
        "iters_per_checkpoint": 2000,
        "batch_size": 12,
        "seed": 1234,
        "checkpoint_path": "",
        "with_tensorboard": false
    },
    "data_config": {
        "training_files": "train_files.txt",
        "segment_length": 16000,
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0
    },
    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321"
    },
    "waveglow_config": {
        "n_mel_channels": 80,
        "n_flows": 12,
        "n_group": 8,
        "n_early_every": 4,
        "n_early_size": 2,
        "WN_config": {
            "n_layers": 8,
            "n_channels": 256,
            "kernel_size": 3
        }
    }
}
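Two quick consistency checks when editing `data_config` (a hedged sketch, not part of the repo): `hop_length` must stay at 256 because the upsampling stride in `glow.py` is hard-coded to 256, and each training segment yields roughly `segment_length / hop_length` mel frames.
```python
# Hypothetical sanity check for the config values shown above.
segment_length = 16000
hop_length = 256   # must match the ConvTranspose1d stride (256) in glow.py
n_group = 8

print(segment_length // hop_length)  # ~62 mel frames per training segment
print(segment_length // n_group)     # 2000 groups of 8 audio samples per segment
```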
import sys
import copy
import torch

def _check_model_old_version(model):
    if hasattr(model.WN[0], 'res_layers') or hasattr(model.WN[0], 'cond_layers'):
        return True
    else:
        return False

def _update_model_res_skip(old_model, new_model):
    for idx in range(0, len(new_model.WN)):
        wavenet = new_model.WN[idx]
        n_channels = wavenet.n_channels
        n_layers = wavenet.n_layers
        wavenet.res_skip_layers = torch.nn.ModuleList()
        for i in range(0, n_layers):
            if i < n_layers - 1:
                res_skip_channels = 2*n_channels
            else:
                res_skip_channels = n_channels
            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
            skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i])
            if i < n_layers - 1:
                res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i])
                res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight]))
                res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias]))
            else:
                res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight)
                res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            wavenet.res_skip_layers.append(res_skip_layer)
        del wavenet.res_layers
        del wavenet.skip_layers

def _update_model_cond(old_model, new_model):
    for idx in range(0, len(new_model.WN)):
        wavenet = new_model.WN[idx]
        n_channels = wavenet.n_channels
        n_layers = wavenet.n_layers
        n_mel_channels = wavenet.cond_layers[0].weight.shape[1]
        cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
        cond_layer_weight = []
        cond_layer_bias = []
        for i in range(0, n_layers):
            _cond_layer = torch.nn.utils.remove_weight_norm(wavenet.cond_layers[i])
            cond_layer_weight.append(_cond_layer.weight)
            cond_layer_bias.append(_cond_layer.bias)
        cond_layer.weight = torch.nn.Parameter(torch.cat(cond_layer_weight))
        cond_layer.bias = torch.nn.Parameter(torch.cat(cond_layer_bias))
        cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
        wavenet.cond_layer = cond_layer
        del wavenet.cond_layers

def update_model(old_model):
    if not _check_model_old_version(old_model):
        return old_model
    new_model = copy.deepcopy(old_model)
    if hasattr(old_model.WN[0], 'res_layers'):
        _update_model_res_skip(old_model, new_model)
    if hasattr(old_model.WN[0], 'cond_layers'):
        _update_model_cond(old_model, new_model)
    for m in new_model.modules():
        if 'Conv' in str(type(m)) and not hasattr(m, 'padding_mode'):
            setattr(m, 'padding_mode', 'zeros')
    return new_model

if __name__ == '__main__':
    old_model_path = sys.argv[1]
    new_model_path = sys.argv[2]
    model = torch.load(old_model_path, map_location='cpu')
    model['model'] = update_model(model['model'])
    torch.save(model, new_model_path)
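# Usage of the conversion script above (placeholder paths):
#   python convert_model.py old_checkpoint.pt new_checkpoint.pt
# It rewrites checkpoints saved in the old glow_old.py layout (separate
# res/skip convolutions and per-layer cond convolutions) into the fused
# layout expected by glow.py.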
import sys
sys.path.append('tacotron2')
import torch
from layers import STFT

class Denoiser(torch.nn.Module):
    """ Removes model bias from audio produced with waveglow """

    def __init__(self, waveglow, filter_length=1024, n_overlap=4,
                 win_length=1024, mode='zeros'):
        super(Denoiser, self).__init__()
        self.stft = STFT(filter_length=filter_length,
                         hop_length=int(filter_length/n_overlap),
                         win_length=win_length).cuda()
        if mode == 'zeros':
            mel_input = torch.zeros(
                (1, 80, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        elif mode == 'normal':
            mel_input = torch.randn(
                (1, 80, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        else:
            raise Exception("Mode {} is not supported".format(mode))

        with torch.no_grad():
            bias_audio = waveglow.infer(mel_input, sigma=0.0).float()
            bias_spec, _ = self.stft.transform(bias_audio)

        self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])

    def forward(self, audio, strength=0.1):
        audio_spec, audio_angles = self.stft.transform(audio.cuda().float())
        audio_spec_denoised = audio_spec - self.bias_spec * strength
        audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
        audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)
        return audio_denoised
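# How the denoiser works: running WaveGlow on an all-zero (or random) mel
# input with sigma=0.0 exposes the model's constant output bias, whose
# magnitude spectrum is stored in bias_spec. forward() then subtracts
# strength * bias_spec from the magnitude spectrogram of generated audio and
# resynthesizes it with the original phases.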
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
import sys
import time
import subprocess
import argparse
import torch
import torch.distributed as dist
from torch.autograd import Variable

def reduce_tensor(tensor, num_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= num_gpus
    return rt

def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")
    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())
    # Initialize distributed communication
    dist.init_process_group(dist_backend, init_method=dist_url,
                            world_size=num_gpus, rank=rank,
                            group_name=group_name)

def _flatten_dense_tensors(tensors):
    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
    same dense type.
    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.
    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.
    Returns:
        A contiguous 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].contiguous().view(-1)
    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
    return flat

def _unflatten_dense_tensors(flat, tensors):
    """View a flat buffer using the sizes of tensors. Assume that tensors are of
    same dense type, and that flat is given by _flatten_dense_tensors.
    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.
    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)

def apply_gradient_allreduce(module):
    """
    Modifies existing model to do gradient allreduce, but doesn't change class
    so you don't need "module"
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if(module.needs_reduction):
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          " PyTorch built from top of tree master.")
                    module.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
            dir(param)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module

def main(config, stdout_dir, args_str):
    args_list = ['train.py']
    args_list += args_str.split(' ') if len(args_str) > 0 else []
    args_list.append('--config={}'.format(config))
    num_gpus = torch.cuda.device_count()
    args_list.append('--num_gpus={}'.format(num_gpus))
    args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S")))

    if not os.path.isdir(stdout_dir):
        os.makedirs(stdout_dir)
        os.chmod(stdout_dir, 0o775)

    workers = []
    for i in range(num_gpus):
        args_list[-2] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(
            os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w")
        print(args_list)
        p = subprocess.Popen([str(sys.executable)]+args_list, stdout=stdout)
        workers.append(p)

    for p in workers:
        p.wait()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='JSON file for configuration')
    parser.add_argument('-s', '--stdout_dir', type=str, default=".",
                        help='directory to save stdout logs')
    parser.add_argument(
        '-a', '--args_str', type=str, default='',
        help='double quoted string with space separated key value pairs')
    args = parser.parse_args()
    main(args.config, args.stdout_dir, args.args_str)
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import copy
import torch
from torch.autograd import Variable
import torch.nn.functional as F

@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a+input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts

class WaveGlowLoss(torch.nn.Module):
    def __init__(self, sigma=1.0):
        super(WaveGlowLoss, self).__init__()
        self.sigma = sigma

    def forward(self, model_output):
        z, log_s_list, log_det_W_list = model_output
        for i, log_s in enumerate(log_s_list):
            if i == 0:
                log_s_total = torch.sum(log_s)
                log_det_W_total = log_det_W_list[i]
            else:
                log_s_total = log_s_total + torch.sum(log_s)
                log_det_W_total += log_det_W_list[i]
        loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
        return loss/(z.size(0)*z.size(1)*z.size(2))
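# Note: the loss above is the negative log-likelihood of the flow under a
# zero-mean Gaussian prior with standard deviation sigma:
#   loss = sum(z^2)/(2*sigma^2) - sum(log_s) - sum(log|det W|),
# normalized by the number of elements in z. The two log terms are the
# change-of-variables contributions of the affine couplings and the
# invertible 1x1 convolutions.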
class Invertible1x1Conv(torch.nn.Module):
    """
    The layer outputs both the convolution, and the log determinant
    of its weight matrix. If reverse=True it does convolution with
    inverse
    """
    def __init__(self, c):
        super(Invertible1x1Conv, self).__init__()
        self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
                                    bias=False)

        # Sample a random orthonormal matrix to initialize weights
        W = torch.qr(torch.FloatTensor(c, c).normal_())[0]

        # Ensure determinant is 1.0 not -1.0
        if torch.det(W) < 0:
            W[:, 0] = -1*W[:, 0]
        W = W.contiguous().view(c, c, 1)
        self.conv.weight.data = W

    def forward(self, z, reverse=False):
        # shape
        batch_size, group_size, n_of_groups = z.size()
        W = self.conv.weight.squeeze()

        if reverse:
            if not hasattr(self, 'W_inverse'):
                # Reverse computation
                W_inverse = W.float().inverse()
                W_inverse = Variable(W_inverse[..., None])
                if z.type() == 'torch.cuda.HalfTensor':
                    W_inverse = W_inverse.half()
                self.W_inverse = W_inverse
            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
            return z
        else:
            # Forward computation
            log_det_W = batch_size * n_of_groups * torch.logdet(W)
            z = self.conv(z)
            return z, log_det_W
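# Note: QR decomposition of a random Gaussian matrix yields an orthonormal W
# with det(W) = +/-1; the sign flip above forces det(W) = 1, so log|det W|
# starts at zero. During inference the inverse weight is computed once on the
# first reverse call and cached in self.W_inverse.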
class WN(torch.nn.Module):
    """
    This is the WaveNet like layer for the affine coupling. The primary difference
    from WaveNet is the convolutions need not be causal. There is also no dilation
    size reset. The dilation only doubles on each layer
    """
    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
                 kernel_size):
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        assert(n_channels % 2 == 0)
        self.n_layers = n_layers
        self.n_channels = n_channels
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()

        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
        start = torch.nn.utils.weight_norm(start, name='weight')
        self.start = start

        # Initializing last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps with training stability
        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
        end.weight.data.zero_()
        end.bias.data.zero_()
        self.end = end

        cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
        self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = 2 ** i
            padding = int((kernel_size*dilation - dilation)/2)
            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2*n_channels
            else:
                res_skip_channels = n_channels
            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, forward_input):
        audio, spect = forward_input
        audio = self.start(audio)
        output = torch.zeros_like(audio)
        n_channels_tensor = torch.IntTensor([self.n_channels])

        spect = self.cond_layer(spect)

        for i in range(self.n_layers):
            spect_offset = i*2*self.n_channels
            acts = fused_add_tanh_sigmoid_multiply(
                self.in_layers[i](audio),
                spect[:, spect_offset:spect_offset+2*self.n_channels, :],
                n_channels_tensor)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                audio = audio + res_skip_acts[:, :self.n_channels, :]
                output = output + res_skip_acts[:, self.n_channels:, :]
            else:
                output = output + res_skip_acts
        return self.end(output)
class WaveGlow(torch.nn.Module):
    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
                 n_early_size, WN_config):
        super(WaveGlow, self).__init__()
        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
                                                 n_mel_channels,
                                                 1024, stride=256)
        assert(n_group % 2 == 0)
        self.n_flows = n_flows
        self.n_group = n_group
        self.n_early_every = n_early_every
        self.n_early_size = n_early_size
        self.WN = torch.nn.ModuleList()
        self.convinv = torch.nn.ModuleList()

        n_half = int(n_group/2)

        # Set up layers with the right sizes based on how many dimensions
        # have been output already
        n_remaining_channels = n_group
        for k in range(n_flows):
            if k % self.n_early_every == 0 and k > 0:
                n_half = n_half - int(self.n_early_size/2)
                n_remaining_channels = n_remaining_channels - self.n_early_size
            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
        self.n_remaining_channels = n_remaining_channels  # Useful during inference

    def forward(self, forward_input):
        """
        forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
        forward_input[1] = audio: batch x time
        """
        spect, audio = forward_input

        # Upsample spectrogram to size of audio
        spect = self.upsample(spect)
        assert(spect.size(2) >= audio.size(1))
        if spect.size(2) > audio.size(1):
            spect = spect[:, :, :audio.size(1)]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
        output_audio = []
        log_s_list = []
        log_det_W_list = []

        for k in range(self.n_flows):
            if k % self.n_early_every == 0 and k > 0:
                output_audio.append(audio[:, :self.n_early_size, :])
                audio = audio[:, self.n_early_size:, :]

            audio, log_det_W = self.convinv[k](audio)
            log_det_W_list.append(log_det_W)

            n_half = int(audio.size(1)/2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            log_s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = torch.exp(log_s)*audio_1 + b
            log_s_list.append(log_s)

            audio = torch.cat([audio_0, audio_1], 1)

        output_audio.append(audio)
        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
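    # Note: every n_early_every flows (4 with the config above), n_early_size
    # channels are split off to the output early, so later flows operate on
    # fewer channels. infer() below inverts this, re-injecting sigma-scaled
    # Gaussian noise at the same points while running the flows in reverse.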
    def infer(self, spect, sigma=1.0):
        spect = self.upsample(spect)
        # trim conv artifacts. maybe pad spec to kernel multiple
        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
        spect = spect[:, :, :-time_cutoff]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        if spect.type() == 'torch.cuda.HalfTensor':
            audio = torch.cuda.HalfTensor(spect.size(0),
                                          self.n_remaining_channels,
                                          spect.size(2)).normal_()
        else:
            audio = torch.cuda.FloatTensor(spect.size(0),
                                           self.n_remaining_channels,
                                           spect.size(2)).normal_()

        audio = torch.autograd.Variable(sigma*audio)

        for k in reversed(range(self.n_flows)):
            n_half = int(audio.size(1)/2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b)/torch.exp(s)
            audio = torch.cat([audio_0, audio_1], 1)

            audio = self.convinv[k](audio, reverse=True)

            if k % self.n_early_every == 0 and k > 0:
                if spect.type() == 'torch.cuda.HalfTensor':
                    z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
                else:
                    z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
                audio = torch.cat((sigma*z, audio), 1)

        audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
        return audio

    @staticmethod
    def remove_weightnorm(model):
        waveglow = model
        for WN in waveglow.WN:
            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
            WN.in_layers = remove(WN.in_layers)
            WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer)
            WN.res_skip_layers = remove(WN.res_skip_layers)
        return waveglow

def remove(conv_list):
    new_conv_list = torch.nn.ModuleList()
    for old_conv in conv_list:
        old_conv = torch.nn.utils.remove_weight_norm(old_conv)
        new_conv_list.append(old_conv)
    return new_conv_list
import copy
import torch
from glow import Invertible1x1Conv, remove

@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a+input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts

class WN(torch.nn.Module):
    """
    This is the WaveNet like layer for the affine coupling. The primary difference
    from WaveNet is the convolutions need not be causal. There is also no dilation
    size reset. The dilation only doubles on each layer
    """
    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
                 kernel_size):
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        assert(n_channels % 2 == 0)
        self.n_layers = n_layers
        self.n_channels = n_channels
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.cond_layers = torch.nn.ModuleList()

        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
        start = torch.nn.utils.weight_norm(start, name='weight')
        self.start = start

        # Initializing last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps with training stability
        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
        end.weight.data.zero_()
        end.bias.data.zero_()
        self.end = end

        for i in range(n_layers):
            dilation = 2 ** i
            padding = int((kernel_size*dilation - dilation)/2)
            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
            cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
            self.cond_layers.append(cond_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2*n_channels
            else:
                res_skip_channels = n_channels
            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, forward_input):
        audio, spect = forward_input
        audio = self.start(audio)

        for i in range(self.n_layers):
            acts = fused_add_tanh_sigmoid_multiply(
                self.in_layers[i](audio),
                self.cond_layers[i](spect),
                torch.IntTensor([self.n_channels]))

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                audio = res_skip_acts[:, :self.n_channels, :] + audio
                skip_acts = res_skip_acts[:, self.n_channels:, :]
            else:
                skip_acts = res_skip_acts

            if i == 0:
                output = skip_acts
            else:
                output = skip_acts + output
        return self.end(output)

class WaveGlow(torch.nn.Module):
    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
                 n_early_size, WN_config):
        super(WaveGlow, self).__init__()
        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
                                                 n_mel_channels,
                                                 1024, stride=256)
        assert(n_group % 2 == 0)
        self.n_flows = n_flows
        self.n_group = n_group
        self.n_early_every = n_early_every
        self.n_early_size = n_early_size
        self.WN = torch.nn.ModuleList()
        self.convinv = torch.nn.ModuleList()

        n_half = int(n_group/2)

        # Set up layers with the right sizes based on how many dimensions
        # have been output already
        n_remaining_channels = n_group
        for k in range(n_flows):
            if k % self.n_early_every == 0 and k > 0:
                n_half = n_half - int(self.n_early_size/2)
                n_remaining_channels = n_remaining_channels - self.n_early_size
            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
        self.n_remaining_channels = n_remaining_channels  # Useful during inference

    def forward(self, forward_input):
        return None
        """
        forward_input[0] = audio: batch x time
        forward_input[1] = upsamp_spectrogram: batch x n_cond_channels x time
        """
        """
        spect, audio = forward_input

        # Upsample spectrogram to size of audio
        spect = self.upsample(spect)
        assert(spect.size(2) >= audio.size(1))
        if spect.size(2) > audio.size(1):
            spect = spect[:, :, :audio.size(1)]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
        output_audio = []
        s_list = []
        s_conv_list = []

        for k in range(self.n_flows):
            if k%4 == 0 and k > 0:
                output_audio.append(audio[:,:self.n_multi,:])
                audio = audio[:,self.n_multi:,:]

            # project to new basis
            audio, s = self.convinv[k](audio)
            s_conv_list.append(s)

            n_half = int(audio.size(1)/2)
            if k%2 == 0:
                audio_0 = audio[:,:n_half,:]
                audio_1 = audio[:,n_half:,:]
            else:
                audio_1 = audio[:,:n_half,:]
                audio_0 = audio[:,n_half:,:]

            output = self.nn[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = torch.exp(s)*audio_1 + b
            s_list.append(s)

            if k%2 == 0:
                audio = torch.cat([audio[:,:n_half,:], audio_1],1)
            else:
                audio = torch.cat([audio_1, audio[:,n_half:,:]], 1)
        output_audio.append(audio)
        return torch.cat(output_audio,1), s_list, s_conv_list
        """

    def infer(self, spect, sigma=1.0):
        spect = self.upsample(spect)
        # trim conv artifacts. maybe pad spec to kernel multiple
        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
        spect = spect[:, :, :-time_cutoff]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        if spect.type() == 'torch.cuda.HalfTensor':
            audio = torch.cuda.HalfTensor(spect.size(0),
                                          self.n_remaining_channels,
                                          spect.size(2)).normal_()
        else:
            audio = torch.cuda.FloatTensor(spect.size(0),
                                           self.n_remaining_channels,
                                           spect.size(2)).normal_()

        audio = torch.autograd.Variable(sigma*audio)

        for k in reversed(range(self.n_flows)):
            n_half = int(audio.size(1)/2)
            if k%2 == 0:
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]
            else:
                audio_1 = audio[:, :n_half, :]
                audio_0 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b)/torch.exp(s)
            if k%2 == 0:
                audio = torch.cat([audio[:, :n_half, :], audio_1], 1)
            else:
                audio = torch.cat([audio_1, audio[:, n_half:, :]], 1)

            audio = self.convinv[k](audio, reverse=True)

            if k%4 == 0 and k > 0:
                if spect.type() == 'torch.cuda.HalfTensor':
                    z = torch.cuda.HalfTensor(spect.size(0),
                                              self.n_early_size,
                                              spect.size(2)).normal_()
                else:
                    z = torch.cuda.FloatTensor(spect.size(0),
                                               self.n_early_size,
                                               spect.size(2)).normal_()
                audio = torch.cat((sigma*z, audio), 1)

        return audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data

    @staticmethod
    def remove_weightnorm(model):
        waveglow = model
        for WN in waveglow.WN:
            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
            WN.in_layers = remove(WN.in_layers)
            WN.cond_layers = remove(WN.cond_layers)
            WN.res_skip_layers = remove(WN.res_skip_layers)
        return waveglow
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
from scipy.io.wavfile import write
import torch
from mel2samp import files_to_list, MAX_WAV_VALUE
from denoiser import Denoiser

def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--filelist_path", required=True)
    parser.add_argument('-w', '--waveglow_path',
                        help='Path to waveglow decoder checkpoint with model')
    parser.add_argument('-o', "--output_dir", required=True)
    parser.add_argument("-s", "--sigma", default=1.0, type=float)
    parser.add_argument("--sampling_rate", default=22050, type=int)
    parser.add_argument("--is_fp16", action="store_true")
    parser.add_argument("-d", "--denoiser_strength", default=0.0, type=float,
                        help='Removes model bias. Start with 0.1 and adjust')
    args = parser.parse_args()

    main(args.filelist_path, args.waveglow_path, args.sigma, args.output_dir,
         args.sampling_rate, args.is_fp16, args.denoiser_strength)
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
import random
import argparse
import json
import torch
import torch.utils.data
import sys
from scipy.io.wavfile import read

# We're using the audio processing from TacoTron2 to make sure it matches
sys.path.insert(0, 'tacotron2')
from tacotron2.layers import TacotronSTFT

MAX_WAV_VALUE = 32768.0

def files_to_list(filename):
    """
    Takes a text file of filenames and makes a list of filenames
    """
    with open(filename, encoding='utf-8') as f:
        files = f.readlines()

    files = [f.rstrip() for f in files]
    return files

def load_wav_to_torch(full_path):
    """
    Loads wavdata into torch array
    """
    sampling_rate, data = read(full_path)
    return torch.from_numpy(data).float(), sampling_rate

class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start+self.segment_length]
        else:
            audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)

# ===================================================================
# Takes directory of clean audio and makes directory of spectrograms
# Useful for making test sets
# ===================================================================
if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--filelist_path", required=True)
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-o', '--output_dir', type=str,
                        help='Output directory')
    args = parser.parse_args()

    with open(args.config) as f:
        data = f.read()
    data_config = json.loads(data)["data_config"]
    mel2samp = Mel2Samp(**data_config)

    filepaths = files_to_list(args.filelist_path)

    # Make directory if it doesn't exist
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
        os.chmod(args.output_dir, 0o775)

    for filepath in filepaths:
        audio, sr = load_wav_to_torch(filepath)
        melspectrogram = mel2samp.get_mel(audio)
        filename = os.path.basename(filepath)
        new_filepath = args.output_dir + '/' + filename + '.pt'
        print(new_filepath)
        torch.save(melspectrogram, new_filepath)
matplotlib==3.9.0
tensorflow==2.17.0
numpy==1.23.5
inflect==0.2.5
librosa==0.8.1
scipy==1.13.1
tensorboardX==1.1
Unidecode==1.0.22
pillow
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import argparse
import json
import os
import torch

#=====START: ADDED FOR DISTRIBUTED======
from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor
from torch.utils.data.distributed import DistributedSampler
#=====END: ADDED FOR DISTRIBUTED======

from torch.utils.data import DataLoader
from glow import WaveGlow, WaveGlowLoss
from mel2samp import Mel2Samp

def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    print("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, iteration

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)

def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-r', '--rank', type=int, default=0,
                        help='rank of process for distributed')
    parser.add_argument('-g', '--group_name', type=str, default='',
                        help='name of group for distributed')
    args = parser.parse_args()

    # Parse configs. Globals nicer in this case
    with open(args.config) as f:
        data = f.read()
    config = json.loads(data)
    train_config = config["train_config"]
    global data_config
    data_config = config["data_config"]
    global dist_config
    dist_config = config["dist_config"]
    global waveglow_config
    waveglow_config = config["waveglow_config"]

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        if args.group_name == '':
            print("WARNING: Multiple GPUs detected but no distributed group set")
            print("Only running 1 GPU. Use distributed.py for multiple GPUs")
            num_gpus = 1

    if num_gpus == 1 and args.rank != 0:
        raise Exception("Doing single GPU training on rank > 0")

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    train(num_gpus, args.rank, args.group_name, **train_config)