[submodule "tacotron2"]
path = tacotron2
url = https://gh.api.99988866.xyz/http://github.com/NVIDIA/tacotron2
BSD 3-Clause License
Copyright (c) 2018, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# WaveGlow
## Code Download
```bash
git clone http://developer.hpccube.com/codes/xinghao/waveglow.git
cd waveglow
git submodule init
git submodule update
```
## Environment Setup
### Docker
```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
# -v mounts the working directory into the container
docker run -it --shm-size 80g --network=host --name=waveglow --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /path/to/host/workdir/:/path/to/container/workdir/ image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10 /bin/bash
```
### Install dependencies missing from the Docker image
```bash
# Install the dependencies listed in requirements.txt
pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# Install dependencies not covered by requirements.txt
pip install apex -i https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install --upgrade pip setuptools wheel
```
## Data Preparation
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar -xvjf LJSpeech-1.1.tar.bz2
ls LJSpeech-1.1/wavs/*.wav | tail -n+11 > train_files.txt
ls LJSpeech-1.1/wavs/*.wav | head -n10 > test_files.txt
```
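Optionally, sanity-check the split with a short Python sketch (a hypothetical helper, assuming the commands above were run in the current directory):
```python
from scipy.io.wavfile import read

def load_list(path):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

train, test = load_list("train_files.txt"), load_list("test_files.txt")
assert not set(train) & set(test), "train/test overlap"

# The config expects 22050 Hz audio; spot-check the first test file.
rate, _ = read(test[0])
print(len(train), "training files,", len(test), "test files at", rate, "Hz")
```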
## Generate Speech with a Pretrained Model
1. Download the pretrained model
```bash
https://drive.google.com/file/d/1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF/view
```
2. Download the mel-spectrograms
```
https://drive.google.com/file/d/1g_VXK2lpP9J25dQFhQwx7doWl_p20fXA/view 
```
3. Generate audio
```bash
python3 inference.py -f <(ls mel_spectrograms/*.pt) -w waveglow_256channels.pt -o . --is_fp16 -s 0.6
```
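If your shell does not support process substitution (`<(...)`), you can write a plain file list first; a small equivalent sketch (the `mel_spectrograms/` directory name is assumed to be where the downloaded tensors were extracted):
```python
import glob

# Collect the downloaded mel tensors into a list file accepted by
# inference.py's -f/--filelist_path argument (one path per line).
with open("mel_files.txt", "w") as f:
    for path in sorted(glob.glob("mel_spectrograms/*.pt")):
        f.write(path + "\n")
```
Then pass `-f mel_files.txt` to `inference.py` instead of the process substitution.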
## Model Training
### Single DCU
```bash
mkdir checkpoints
python train.py -c config.json
```
### Multiple DCUs
```bash
mkdir checkpoints
python distributed.py -c config.json
```
Set `"fp16_run"` to `true` in `config.json` to enable mixed-precision training.
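For example, the flag can be flipped programmatically (a minimal sketch that simply rewrites `config.json` in place):
```python
import json

# Enable mixed-precision training by setting train_config.fp16_run to true.
with open("config.json") as f:
    config = json.load(f)
config["train_config"]["fp16_run"] = True
with open("config.json", "w") as f:
    json.dump(config, f, indent=4)
```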
## Model Inference
### Generate mel-spectrograms for the test set
```bash
python mel2samp.py -f test_files.txt -o . -c config.json
```
### Run inference with your own trained model
```bash
ls *.pt > mel_files.txt
python3 inference.py -f mel_files.txt -w checkpoints/waveglow_10000 -o . --is_fp16 -s 0.6
```
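For reference, the command above boils down to the following condensed sketch of what `inference.py` does (fp16 and denoising omitted; the checkpoint name and sigma value are the ones used above):
```python
import torch
from scipy.io.wavfile import write
from mel2samp import files_to_list, MAX_WAV_VALUE

# Load the trained checkpoint, strip weight norm, and synthesize one wav per mel file.
waveglow = torch.load("checkpoints/waveglow_10000")["model"]
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.cuda().eval()

for path in files_to_list("mel_files.txt"):
    mel = torch.load(path).unsqueeze(0).cuda()   # add batch dimension
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.6) * MAX_WAV_VALUE
    write(path + "_synthesis.wav", 22050, audio.squeeze().cpu().numpy().astype("int16"))
```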
{
    "train_config": {
        "fp16_run": true,
        "output_directory": "checkpoints",
        "epochs": 100000,
        "learning_rate": 1e-4,
        "sigma": 1.0,
        "iters_per_checkpoint": 2000,
        "batch_size": 12,
        "seed": 1234,
        "checkpoint_path": "",
        "with_tensorboard": false
    },
    "data_config": {
        "training_files": "train_files.txt",
        "segment_length": 16000,
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0
    },
    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321"
    },
    "waveglow_config": {
        "n_mel_channels": 80,
        "n_flows": 12,
        "n_group": 8,
        "n_early_every": 4,
        "n_early_size": 2,
        "WN_config": {
            "n_layers": 8,
            "n_channels": 256,
            "kernel_size": 3
        }
    }
}
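Two quick consistency checks when editing `data_config` (a hedged sketch, not part of the repo): `hop_length` must stay at 256 because the upsampling stride in `glow.py` is hard-coded to 256, and each training segment yields roughly `segment_length / hop_length` mel frames.
```python
# Hypothetical sanity check for the config values shown above.
segment_length = 16000
hop_length = 256   # must match the ConvTranspose1d stride (256) in glow.py
n_group = 8

print(segment_length // hop_length)  # ~62 mel frames per training segment
print(segment_length // n_group)     # 2000 groups of 8 audio samples per segment
```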
import sys
import copy
import torch

def _check_model_old_version(model):
    if hasattr(model.WN[0], 'res_layers') or hasattr(model.WN[0], 'cond_layers'):
        return True
    else:
        return False

def _update_model_res_skip(old_model, new_model):
    for idx in range(0, len(new_model.WN)):
        wavenet = new_model.WN[idx]
        n_channels = wavenet.n_channels
        n_layers = wavenet.n_layers
        wavenet.res_skip_layers = torch.nn.ModuleList()
        for i in range(0, n_layers):
            if i < n_layers - 1:
                res_skip_channels = 2*n_channels
            else:
                res_skip_channels = n_channels
            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
            skip_layer = torch.nn.utils.remove_weight_norm(wavenet.skip_layers[i])
            if i < n_layers - 1:
                res_layer = torch.nn.utils.remove_weight_norm(wavenet.res_layers[i])
                res_skip_layer.weight = torch.nn.Parameter(torch.cat([res_layer.weight, skip_layer.weight]))
                res_skip_layer.bias = torch.nn.Parameter(torch.cat([res_layer.bias, skip_layer.bias]))
            else:
                res_skip_layer.weight = torch.nn.Parameter(skip_layer.weight)
                res_skip_layer.bias = torch.nn.Parameter(skip_layer.bias)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            wavenet.res_skip_layers.append(res_skip_layer)
        del wavenet.res_layers
        del wavenet.skip_layers

def _update_model_cond(old_model, new_model):
    for idx in range(0, len(new_model.WN)):
        wavenet = new_model.WN[idx]
        n_channels = wavenet.n_channels
        n_layers = wavenet.n_layers
        n_mel_channels = wavenet.cond_layers[0].weight.shape[1]
        cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
        cond_layer_weight = []
        cond_layer_bias = []
        for i in range(0, n_layers):
            _cond_layer = torch.nn.utils.remove_weight_norm(wavenet.cond_layers[i])
            cond_layer_weight.append(_cond_layer.weight)
            cond_layer_bias.append(_cond_layer.bias)
        cond_layer.weight = torch.nn.Parameter(torch.cat(cond_layer_weight))
        cond_layer.bias = torch.nn.Parameter(torch.cat(cond_layer_bias))
        cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
        wavenet.cond_layer = cond_layer
        del wavenet.cond_layers

def update_model(old_model):
    if not _check_model_old_version(old_model):
        return old_model
    new_model = copy.deepcopy(old_model)
    if hasattr(old_model.WN[0], 'res_layers'):
        _update_model_res_skip(old_model, new_model)
    if hasattr(old_model.WN[0], 'cond_layers'):
        _update_model_cond(old_model, new_model)
    for m in new_model.modules():
        if 'Conv' in str(type(m)) and not hasattr(m, 'padding_mode'):
            setattr(m, 'padding_mode', 'zeros')
    return new_model

if __name__ == '__main__':
    old_model_path = sys.argv[1]
    new_model_path = sys.argv[2]
    model = torch.load(old_model_path, map_location='cpu')
    model['model'] = update_model(model['model'])
    torch.save(model, new_model_path)
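# Usage of the conversion script above (placeholder paths):
#   python convert_model.py old_checkpoint.pt new_checkpoint.pt
# It rewrites checkpoints saved in the old glow_old.py layout (separate
# res/skip convolutions and per-layer cond convolutions) into the fused
# layout expected by glow.py.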
import sys
sys.path.append('tacotron2')
import torch
from layers import STFT

class Denoiser(torch.nn.Module):
    """ Removes model bias from audio produced with waveglow """

    def __init__(self, waveglow, filter_length=1024, n_overlap=4,
                 win_length=1024, mode='zeros'):
        super(Denoiser, self).__init__()
        self.stft = STFT(filter_length=filter_length,
                         hop_length=int(filter_length/n_overlap),
                         win_length=win_length).cuda()
        if mode == 'zeros':
            mel_input = torch.zeros(
                (1, 80, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        elif mode == 'normal':
            mel_input = torch.randn(
                (1, 80, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        else:
            raise Exception("Mode {} is not supported".format(mode))

        with torch.no_grad():
            bias_audio = waveglow.infer(mel_input, sigma=0.0).float()
            bias_spec, _ = self.stft.transform(bias_audio)

        self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])

    def forward(self, audio, strength=0.1):
        audio_spec, audio_angles = self.stft.transform(audio.cuda().float())
        audio_spec_denoised = audio_spec - self.bias_spec * strength
        audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
        audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)
        return audio_denoised
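# How the denoiser works: running WaveGlow on an all-zero (or random) mel
# input with sigma=0.0 exposes the model's constant output bias, whose
# magnitude spectrum is stored in bias_spec. forward() then subtracts
# strength * bias_spec from the magnitude spectrogram of generated audio and
# resynthesizes it with the original phases.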
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
import sys
import time
import subprocess
import argparse
import torch
import torch.distributed as dist
from torch.autograd import Variable

def reduce_tensor(tensor, num_gpus):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= num_gpus
    return rt

def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")
    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())
    # Initialize distributed communication
    dist.init_process_group(dist_backend, init_method=dist_url,
                            world_size=num_gpus, rank=rank,
                            group_name=group_name)

def _flatten_dense_tensors(tensors):
    """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
    same dense type.
    Since inputs are dense, the resulting tensor will be a concatenated 1D
    buffer. Element-wise operation on this buffer will be equivalent to
    operating individually.
    Arguments:
        tensors (Iterable[Tensor]): dense tensors to flatten.
    Returns:
        A contiguous 1D buffer containing input tensors.
    """
    if len(tensors) == 1:
        return tensors[0].contiguous().view(-1)
    flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
    return flat

def _unflatten_dense_tensors(flat, tensors):
    """View a flat buffer using the sizes of tensors. Assume that tensors are of
    same dense type, and that flat is given by _flatten_dense_tensors.
    Arguments:
        flat (Tensor): flattened dense tensors to unflatten.
        tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
            unflatten flat.
    Returns:
        Unflattened dense tensors with sizes same as tensors and values from
        flat.
    """
    outputs = []
    offset = 0
    for tensor in tensors:
        numel = tensor.numel()
        outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
        offset += numel
    return tuple(outputs)

def apply_gradient_allreduce(module):
    """
    Modifies existing model to do gradient allreduce, but doesn't change class
    so you don't need "module"
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if(module.needs_reduction):
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          " PyTorch built from top of tree master.")
                    module.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
            dir(param)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module

def main(config, stdout_dir, args_str):
    args_list = ['train.py']
    args_list += args_str.split(' ') if len(args_str) > 0 else []
    args_list.append('--config={}'.format(config))
    num_gpus = torch.cuda.device_count()
    args_list.append('--num_gpus={}'.format(num_gpus))
    args_list.append("--group_name=group_{}".format(time.strftime("%Y_%m_%d-%H%M%S")))

    if not os.path.isdir(stdout_dir):
        os.makedirs(stdout_dir)
        os.chmod(stdout_dir, 0o775)

    workers = []
    for i in range(num_gpus):
        args_list[-2] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(
            os.path.join(stdout_dir, "GPU_{}.log".format(i)), "w")
        print(args_list)
        p = subprocess.Popen([str(sys.executable)]+args_list, stdout=stdout)
        workers.append(p)

    for p in workers:
        p.wait()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help='JSON file for configuration')
    parser.add_argument('-s', '--stdout_dir', type=str, default=".",
                        help='directory to save stdout logs')
    parser.add_argument(
        '-a', '--args_str', type=str, default='',
        help='double quoted string with space separated key value pairs')
    args = parser.parse_args()
    main(args.config, args.stdout_dir, args.args_str)
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import copy
import torch
from torch.autograd import Variable
import torch.nn.functional as F

@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a+input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts

class WaveGlowLoss(torch.nn.Module):
    def __init__(self, sigma=1.0):
        super(WaveGlowLoss, self).__init__()
        self.sigma = sigma

    def forward(self, model_output):
        z, log_s_list, log_det_W_list = model_output
        for i, log_s in enumerate(log_s_list):
            if i == 0:
                log_s_total = torch.sum(log_s)
                log_det_W_total = log_det_W_list[i]
            else:
                log_s_total = log_s_total + torch.sum(log_s)
                log_det_W_total += log_det_W_list[i]
        loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
        return loss/(z.size(0)*z.size(1)*z.size(2))
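# Note: the loss above is the negative log-likelihood of the flow under a
# zero-mean Gaussian prior with standard deviation sigma:
#   loss = sum(z^2)/(2*sigma^2) - sum(log_s) - sum(log|det W|),
# normalized by the number of elements in z. The two log terms are the
# change-of-variables contributions of the affine couplings and the
# invertible 1x1 convolutions.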
class Invertible1x1Conv(torch.nn.Module):
    """
    The layer outputs both the convolution, and the log determinant
    of its weight matrix. If reverse=True it does convolution with
    inverse
    """
    def __init__(self, c):
        super(Invertible1x1Conv, self).__init__()
        self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
                                    bias=False)

        # Sample a random orthonormal matrix to initialize weights
        W = torch.qr(torch.FloatTensor(c, c).normal_())[0]

        # Ensure determinant is 1.0 not -1.0
        if torch.det(W) < 0:
            W[:, 0] = -1*W[:, 0]
        W = W.contiguous().view(c, c, 1)
        self.conv.weight.data = W

    def forward(self, z, reverse=False):
        # shape
        batch_size, group_size, n_of_groups = z.size()
        W = self.conv.weight.squeeze()

        if reverse:
            if not hasattr(self, 'W_inverse'):
                # Reverse computation
                W_inverse = W.float().inverse()
                W_inverse = Variable(W_inverse[..., None])
                if z.type() == 'torch.cuda.HalfTensor':
                    W_inverse = W_inverse.half()
                self.W_inverse = W_inverse
            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
            return z
        else:
            # Forward computation
            log_det_W = batch_size * n_of_groups * torch.logdet(W)
            z = self.conv(z)
            return z, log_det_W
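# Note: QR decomposition of a random Gaussian matrix yields an orthonormal W
# with det(W) = +/-1; the sign flip above forces det(W) = 1, so log|det W|
# starts at zero. During inference the inverse weight is computed once on the
# first reverse call and cached in self.W_inverse.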
class WN(torch.nn.Module):
    """
    This is the WaveNet like layer for the affine coupling. The primary difference
    from WaveNet is the convolutions need not be causal. There is also no dilation
    size reset. The dilation only doubles on each layer
    """
    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
                 kernel_size):
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        assert(n_channels % 2 == 0)
        self.n_layers = n_layers
        self.n_channels = n_channels
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()

        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
        start = torch.nn.utils.weight_norm(start, name='weight')
        self.start = start

        # Initializing last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps with training stability
        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
        end.weight.data.zero_()
        end.bias.data.zero_()
        self.end = end

        cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
        self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = 2 ** i
            padding = int((kernel_size*dilation - dilation)/2)
            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2*n_channels
            else:
                res_skip_channels = n_channels
            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, forward_input):
        audio, spect = forward_input
        audio = self.start(audio)
        output = torch.zeros_like(audio)
        n_channels_tensor = torch.IntTensor([self.n_channels])

        spect = self.cond_layer(spect)

        for i in range(self.n_layers):
            spect_offset = i*2*self.n_channels
            acts = fused_add_tanh_sigmoid_multiply(
                self.in_layers[i](audio),
                spect[:, spect_offset:spect_offset+2*self.n_channels, :],
                n_channels_tensor)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                audio = audio + res_skip_acts[:, :self.n_channels, :]
                output = output + res_skip_acts[:, self.n_channels:, :]
            else:
                output = output + res_skip_acts
        return self.end(output)
class WaveGlow(torch.nn.Module):
    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
                 n_early_size, WN_config):
        super(WaveGlow, self).__init__()
        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
                                                 n_mel_channels,
                                                 1024, stride=256)
        assert(n_group % 2 == 0)
        self.n_flows = n_flows
        self.n_group = n_group
        self.n_early_every = n_early_every
        self.n_early_size = n_early_size
        self.WN = torch.nn.ModuleList()
        self.convinv = torch.nn.ModuleList()

        n_half = int(n_group/2)

        # Set up layers with the right sizes based on how many dimensions
        # have been output already
        n_remaining_channels = n_group
        for k in range(n_flows):
            if k % self.n_early_every == 0 and k > 0:
                n_half = n_half - int(self.n_early_size/2)
                n_remaining_channels = n_remaining_channels - self.n_early_size
            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
        self.n_remaining_channels = n_remaining_channels  # Useful during inference

    def forward(self, forward_input):
        """
        forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
        forward_input[1] = audio: batch x time
        """
        spect, audio = forward_input

        # Upsample spectrogram to size of audio
        spect = self.upsample(spect)
        assert(spect.size(2) >= audio.size(1))
        if spect.size(2) > audio.size(1):
            spect = spect[:, :, :audio.size(1)]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
        output_audio = []
        log_s_list = []
        log_det_W_list = []

        for k in range(self.n_flows):
            if k % self.n_early_every == 0 and k > 0:
                output_audio.append(audio[:, :self.n_early_size, :])
                audio = audio[:, self.n_early_size:, :]

            audio, log_det_W = self.convinv[k](audio)
            log_det_W_list.append(log_det_W)

            n_half = int(audio.size(1)/2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            log_s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = torch.exp(log_s)*audio_1 + b
            log_s_list.append(log_s)

            audio = torch.cat([audio_0, audio_1], 1)

        output_audio.append(audio)
        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
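    # Note: every n_early_every flows (4 with the config above), n_early_size
    # channels are split off to the output early, so later flows operate on
    # fewer channels. infer() below inverts this, re-injecting sigma-scaled
    # Gaussian noise at the same points while running the flows in reverse.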
    def infer(self, spect, sigma=1.0):
        spect = self.upsample(spect)
        # trim conv artifacts. maybe pad spec to kernel multiple
        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
        spect = spect[:, :, :-time_cutoff]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        if spect.type() == 'torch.cuda.HalfTensor':
            audio = torch.cuda.HalfTensor(spect.size(0),
                                          self.n_remaining_channels,
                                          spect.size(2)).normal_()
        else:
            audio = torch.cuda.FloatTensor(spect.size(0),
                                           self.n_remaining_channels,
                                           spect.size(2)).normal_()

        audio = torch.autograd.Variable(sigma*audio)

        for k in reversed(range(self.n_flows)):
            n_half = int(audio.size(1)/2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b)/torch.exp(s)
            audio = torch.cat([audio_0, audio_1], 1)

            audio = self.convinv[k](audio, reverse=True)

            if k % self.n_early_every == 0 and k > 0:
                if spect.type() == 'torch.cuda.HalfTensor':
                    z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
                else:
                    z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
                audio = torch.cat((sigma*z, audio), 1)

        audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
        return audio

    @staticmethod
    def remove_weightnorm(model):
        waveglow = model
        for WN in waveglow.WN:
            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
            WN.in_layers = remove(WN.in_layers)
            WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer)
            WN.res_skip_layers = remove(WN.res_skip_layers)
        return waveglow

def remove(conv_list):
    new_conv_list = torch.nn.ModuleList()
    for old_conv in conv_list:
        old_conv = torch.nn.utils.remove_weight_norm(old_conv)
        new_conv_list.append(old_conv)
    return new_conv_list
import copy
import torch
from glow import Invertible1x1Conv, remove

@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a+input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts

class WN(torch.nn.Module):
    """
    This is the WaveNet like layer for the affine coupling. The primary difference
    from WaveNet is the convolutions need not be causal. There is also no dilation
    size reset. The dilation only doubles on each layer
    """
    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
                 kernel_size):
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        assert(n_channels % 2 == 0)
        self.n_layers = n_layers
        self.n_channels = n_channels
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.cond_layers = torch.nn.ModuleList()

        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
        start = torch.nn.utils.weight_norm(start, name='weight')
        self.start = start

        # Initializing last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps with training stability
        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
        end.weight.data.zero_()
        end.bias.data.zero_()
        self.end = end

        for i in range(n_layers):
            dilation = 2 ** i
            padding = int((kernel_size*dilation - dilation)/2)
            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
            cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
            self.cond_layers.append(cond_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2*n_channels
            else:
                res_skip_channels = n_channels
            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, forward_input):
        audio, spect = forward_input
        audio = self.start(audio)

        for i in range(self.n_layers):
            acts = fused_add_tanh_sigmoid_multiply(
                self.in_layers[i](audio),
                self.cond_layers[i](spect),
                torch.IntTensor([self.n_channels]))

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                audio = res_skip_acts[:, :self.n_channels, :] + audio
                skip_acts = res_skip_acts[:, self.n_channels:, :]
            else:
                skip_acts = res_skip_acts

            if i == 0:
                output = skip_acts
            else:
                output = skip_acts + output
        return self.end(output)

class WaveGlow(torch.nn.Module):
    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
                 n_early_size, WN_config):
        super(WaveGlow, self).__init__()
        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
                                                 n_mel_channels,
                                                 1024, stride=256)
        assert(n_group % 2 == 0)
        self.n_flows = n_flows
        self.n_group = n_group
        self.n_early_every = n_early_every
        self.n_early_size = n_early_size
        self.WN = torch.nn.ModuleList()
        self.convinv = torch.nn.ModuleList()

        n_half = int(n_group/2)

        # Set up layers with the right sizes based on how many dimensions
        # have been output already
        n_remaining_channels = n_group
        for k in range(n_flows):
            if k % self.n_early_every == 0 and k > 0:
                n_half = n_half - int(self.n_early_size/2)
                n_remaining_channels = n_remaining_channels - self.n_early_size
            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
        self.n_remaining_channels = n_remaining_channels  # Useful during inference

    def forward(self, forward_input):
        return None
        """
        forward_input[0] = audio: batch x time
        forward_input[1] = upsamp_spectrogram: batch x n_cond_channels x time
        """
        """
        spect, audio = forward_input

        # Upsample spectrogram to size of audio
        spect = self.upsample(spect)
        assert(spect.size(2) >= audio.size(1))
        if spect.size(2) > audio.size(1):
            spect = spect[:, :, :audio.size(1)]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
        output_audio = []
        s_list = []
        s_conv_list = []

        for k in range(self.n_flows):
            if k%4 == 0 and k > 0:
                output_audio.append(audio[:,:self.n_multi,:])
                audio = audio[:,self.n_multi:,:]

            # project to new basis
            audio, s = self.convinv[k](audio)
            s_conv_list.append(s)

            n_half = int(audio.size(1)/2)
            if k%2 == 0:
                audio_0 = audio[:,:n_half,:]
                audio_1 = audio[:,n_half:,:]
            else:
                audio_1 = audio[:,:n_half,:]
                audio_0 = audio[:,n_half:,:]

            output = self.nn[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = torch.exp(s)*audio_1 + b
            s_list.append(s)

            if k%2 == 0:
                audio = torch.cat([audio[:,:n_half,:], audio_1],1)
            else:
                audio = torch.cat([audio_1, audio[:,n_half:,:]], 1)
        output_audio.append(audio)
        return torch.cat(output_audio,1), s_list, s_conv_list
        """

    def infer(self, spect, sigma=1.0):
        spect = self.upsample(spect)
        # trim conv artifacts. maybe pad spec to kernel multiple
        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
        spect = spect[:, :, :-time_cutoff]

        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

        if spect.type() == 'torch.cuda.HalfTensor':
            audio = torch.cuda.HalfTensor(spect.size(0),
                                          self.n_remaining_channels,
                                          spect.size(2)).normal_()
        else:
            audio = torch.cuda.FloatTensor(spect.size(0),
                                           self.n_remaining_channels,
                                           spect.size(2)).normal_()

        audio = torch.autograd.Variable(sigma*audio)

        for k in reversed(range(self.n_flows)):
            n_half = int(audio.size(1)/2)
            if k%2 == 0:
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]
            else:
                audio_1 = audio[:, :n_half, :]
                audio_0 = audio[:, n_half:, :]

            output = self.WN[k]((audio_0, spect))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b)/torch.exp(s)
            if k%2 == 0:
                audio = torch.cat([audio[:, :n_half, :], audio_1], 1)
            else:
                audio = torch.cat([audio_1, audio[:, n_half:, :]], 1)

            audio = self.convinv[k](audio, reverse=True)

            if k%4 == 0 and k > 0:
                if spect.type() == 'torch.cuda.HalfTensor':
                    z = torch.cuda.HalfTensor(spect.size(0),
                                              self.n_early_size,
                                              spect.size(2)).normal_()
                else:
                    z = torch.cuda.FloatTensor(spect.size(0),
                                               self.n_early_size,
                                               spect.size(2)).normal_()
                audio = torch.cat((sigma*z, audio), 1)

        return audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data

    @staticmethod
    def remove_weightnorm(model):
        waveglow = model
        for WN in waveglow.WN:
            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
            WN.in_layers = remove(WN.in_layers)
            WN.cond_layers = remove(WN.cond_layers)
            WN.res_skip_layers = remove(WN.res_skip_layers)
        return waveglow
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
from scipy.io.wavfile import write
import torch
from mel2samp import files_to_list, MAX_WAV_VALUE
from denoiser import Denoiser

def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--filelist_path", required=True)
    parser.add_argument('-w', '--waveglow_path',
                        help='Path to waveglow decoder checkpoint with model')
    parser.add_argument('-o', "--output_dir", required=True)
    parser.add_argument("-s", "--sigma", default=1.0, type=float)
    parser.add_argument("--sampling_rate", default=22050, type=int)
    parser.add_argument("--is_fp16", action="store_true")
    parser.add_argument("-d", "--denoiser_strength", default=0.0, type=float,
                        help='Removes model bias. Start with 0.1 and adjust')
    args = parser.parse_args()

    main(args.filelist_path, args.waveglow_path, args.sigma, args.output_dir,
         args.sampling_rate, args.is_fp16, args.denoiser_strength)
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import os
import random
import argparse
import json
import torch
import torch.utils.data
import sys
from scipy.io.wavfile import read

# We're using the audio processing from TacoTron2 to make sure it matches
sys.path.insert(0, 'tacotron2')
from tacotron2.layers import TacotronSTFT

MAX_WAV_VALUE = 32768.0

def files_to_list(filename):
    """
    Takes a text file of filenames and makes a list of filenames
    """
    with open(filename, encoding='utf-8') as f:
        files = f.readlines()

    files = [f.rstrip() for f in files]
    return files

def load_wav_to_torch(full_path):
    """
    Loads wavdata into torch array
    """
    sampling_rate, data = read(full_path)
    return torch.from_numpy(data).float(), sampling_rate

class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start+self.segment_length]
        else:
            audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)

# ===================================================================
# Takes directory of clean audio and makes directory of spectrograms
# Useful for making test sets
# ===================================================================
if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--filelist_path", required=True)
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-o', '--output_dir', type=str,
                        help='Output directory')
    args = parser.parse_args()

    with open(args.config) as f:
        data = f.read()
    data_config = json.loads(data)["data_config"]
    mel2samp = Mel2Samp(**data_config)

    filepaths = files_to_list(args.filelist_path)

    # Make directory if it doesn't exist
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
        os.chmod(args.output_dir, 0o775)

    for filepath in filepaths:
        audio, sr = load_wav_to_torch(filepath)
        melspectrogram = mel2samp.get_mel(audio)
        filename = os.path.basename(filepath)
        new_filepath = args.output_dir + '/' + filename + '.pt'
        print(new_filepath)
        torch.save(melspectrogram, new_filepath)
matplotlib==3.9.0
tensorflow==2.17.0
numpy==1.23.5
inflect==0.2.5
librosa==0.8.1
scipy==1.13.1
tensorboardX==1.1
Unidecode==1.0.22
pillow
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import argparse
import json
import os
import torch

#=====START: ADDED FOR DISTRIBUTED======
from distributed import init_distributed, apply_gradient_allreduce, reduce_tensor
from torch.utils.data.distributed import DistributedSampler
#=====END: ADDED FOR DISTRIBUTED======

from torch.utils.data import DataLoader
from glow import WaveGlow, WaveGlowLoss
from mel2samp import Mel2Samp

def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    optimizer.load_state_dict(checkpoint_dict['optimizer'])
    model_for_loading = checkpoint_dict['model']
    model.load_state_dict(model_for_loading.state_dict())
    print("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, iteration

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)

def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-r', '--rank', type=int, default=0,
                        help='rank of process for distributed')
    parser.add_argument('-g', '--group_name', type=str, default='',
                        help='name of group for distributed')
    args = parser.parse_args()

    # Parse configs. Globals nicer in this case
    with open(args.config) as f:
        data = f.read()
    config = json.loads(data)
    train_config = config["train_config"]
    global data_config
    data_config = config["data_config"]
    global dist_config
    dist_config = config["dist_config"]
    global waveglow_config
    waveglow_config = config["waveglow_config"]

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        if args.group_name == '':
            print("WARNING: Multiple GPUs detected but no distributed group set")
            print("Only running 1 GPU. Use distributed.py for multiple GPUs")
            num_gpus = 1

    if num_gpus == 1 and args.rank != 0:
        raise Exception("Doing single GPU training on rank > 0")

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    train(num_gpus, args.rank, args.group_name, **train_config)