# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
from __future__ import absolute_import, division, print_function, unicode_literals
import glob
import os
import numpy as np
import argparse
import json
import torch
from scipy.io.wavfile import write
from env import AttrDict
from meldataset import MAX_WAV_VALUE
from bigvgan import BigVGAN as Generator
h = None
device = None
torch.backends.cudnn.benchmark = False
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print(f"Loading '{filepath}'")
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + "*")
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return ""
return sorted(cp_list)[-1]
def inference(a, h):
generator = Generator(h, use_cuda_kernel=a.use_cuda_kernel).to(device)
state_dict_g = load_checkpoint(a.checkpoint_file, device)
generator.load_state_dict(state_dict_g["generator"])
filelist = os.listdir(a.input_mels_dir)
os.makedirs(a.output_dir, exist_ok=True)
generator.eval()
generator.remove_weight_norm()
with torch.no_grad():
for i, filname in enumerate(filelist):
# Load the mel spectrogram in .npy format
x = np.load(os.path.join(a.input_mels_dir, filname))
x = torch.FloatTensor(x).to(device)
if len(x.shape) == 2:
x = x.unsqueeze(0)
y_g_hat = generator(x)
audio = y_g_hat.squeeze()
audio = audio * MAX_WAV_VALUE
audio = audio.cpu().numpy().astype("int16")
output_file = os.path.join(
a.output_dir, os.path.splitext(filname)[0] + "_generated_e2e.wav"
)
write(output_file, h.sampling_rate, audio)
print(output_file)
def main():
print("Initializing Inference Process..")
parser = argparse.ArgumentParser()
parser.add_argument("--input_mels_dir", default="test_mel_files")
parser.add_argument("--output_dir", default="generated_files_from_mel")
parser.add_argument("--checkpoint_file", required=True)
parser.add_argument("--use_cuda_kernel", action="store_true", default=False)
a = parser.parse_args()
config_file = os.path.join(os.path.split(a.checkpoint_file)[0], "config.json")
with open(config_file) as f:
data = f.read()
global h
json_config = json.loads(data)
h = AttrDict(json_config)
torch.manual_seed(h.seed)
global device
if torch.cuda.is_available():
torch.cuda.manual_seed(h.seed)
device = torch.device("cuda")
else:
device = torch.device("cpu")
inference(a, h)
if __name__ == "__main__":
main()
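# Example invocation (hypothetical paths; assuming this script is saved as
# inference_e2e.py and the checkpoint directory contains a matching config.json):
#
#   python inference_e2e.py \
#       --checkpoint_file exp/bigvgan/g_05000000 \
#       --input_mels_dir test_mel_files \
#       --output_dir generated_files_from_mel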
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import torch
import torch.nn.functional as F
import torch.nn as nn
from librosa.filters import mel as librosa_mel_fn
from scipy import signal
import typing
from typing import Optional, List, Union, Dict, Tuple
from collections import namedtuple
import math
import functools
# Adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py under the MIT license.
# LICENSE is in incl_licenses directory.
class MultiScaleMelSpectrogramLoss(nn.Module):
"""Compute distance between mel spectrograms. Can be used
in a multi-scale way.
Parameters
----------
n_mels : List[int]
Number of mels per STFT, by default [5, 10, 20, 40, 80, 160, 320],
window_lengths : List[int], optional
Length of each window of each STFT, by default [32, 64, 128, 256, 512, 1024, 2048]
loss_fn : typing.Callable, optional
How to compare each loss, by default nn.L1Loss()
clamp_eps : float, optional
Lower bound used to clamp the magnitude before taking the log, by default 1e-5
mag_weight : float, optional
Weight of raw magnitude portion of loss, by default 0.0 (no amplification of the magnitude part)
log_weight : float, optional
Weight of log magnitude portion of loss, by default 1.0
pow : float, optional
Power to raise magnitude to before taking log, by default 1.0
weight : float, optional
Weight of this loss, by default 1.0
match_stride : bool, optional
Whether to match the stride of convolutional layers, by default False
Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
Additional code copied and modified from https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py
"""
def __init__(
self,
sampling_rate: int,
n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320],
window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048],
loss_fn: typing.Callable = nn.L1Loss(),
clamp_eps: float = 1e-5,
mag_weight: float = 0.0,
log_weight: float = 1.0,
pow: float = 1.0,
weight: float = 1.0,
match_stride: bool = False,
mel_fmin: List[float] = [0, 0, 0, 0, 0, 0, 0],
mel_fmax: List[float] = [None, None, None, None, None, None, None],
window_type: str = "hann",
):
super().__init__()
self.sampling_rate = sampling_rate
STFTParams = namedtuple(
"STFTParams",
["window_length", "hop_length", "window_type", "match_stride"],
)
self.stft_params = [
STFTParams(
window_length=w,
hop_length=w // 4,
match_stride=match_stride,
window_type=window_type,
)
for w in window_lengths
]
self.n_mels = n_mels
self.loss_fn = loss_fn
self.clamp_eps = clamp_eps
self.log_weight = log_weight
self.mag_weight = mag_weight
self.weight = weight
self.mel_fmin = mel_fmin
self.mel_fmax = mel_fmax
self.pow = pow
@staticmethod
@functools.lru_cache(None)
def get_window(
window_type,
window_length,
):
return signal.get_window(window_type, window_length)
@staticmethod
@functools.lru_cache(None)
def get_mel_filters(sr, n_fft, n_mels, fmin, fmax):
return librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
def mel_spectrogram(
self,
wav,
n_mels,
fmin,
fmax,
window_length,
hop_length,
match_stride,
window_type,
):
"""
Mirrors AudioSignal.mel_spectrogram used by BigVGAN-v2 training from:
https://github.com/descriptinc/audiotools/blob/master/audiotools/core/audio_signal.py
"""
B, C, T = wav.shape
if match_stride:
assert (
hop_length == window_length // 4
), "For match_stride, hop must equal n_fft // 4"
right_pad = math.ceil(T / hop_length) * hop_length - T
pad = (window_length - hop_length) // 2
else:
right_pad = 0
pad = 0
wav = torch.nn.functional.pad(wav, (pad, pad + right_pad), mode="reflect")
window = self.get_window(window_type, window_length)
window = torch.from_numpy(window).to(wav.device).float()
stft = torch.stft(
wav.reshape(-1, wav.shape[-1]),  # Use the (possibly padded) length; reshape(-1, T) fails when match_stride pads the signal
n_fft=window_length,
hop_length=hop_length,
window=window,
return_complex=True,
center=True,
)
_, nf, nt = stft.shape
stft = stft.reshape(B, C, nf, nt)
if match_stride:
"""
Drop the first two and last two frames, which are added because of padding. Now num_frames * hop_length = num_samples.
"""
stft = stft[..., 2:-2]
magnitude = torch.abs(stft)
nf = magnitude.shape[2]
mel_basis = self.get_mel_filters(
self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax
)
mel_basis = torch.from_numpy(mel_basis).to(wav.device)
mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T
mel_spectrogram = mel_spectrogram.transpose(-1, 2)
return mel_spectrogram
def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
"""Computes mel loss between an estimate and a reference
signal.
Parameters
----------
x : torch.Tensor
Estimate signal
y : torch.Tensor
Reference signal
Returns
-------
torch.Tensor
Mel loss.
"""
loss = 0.0
for n_mels, fmin, fmax, s in zip(
self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params
):
kwargs = {
"n_mels": n_mels,
"fmin": fmin,
"fmax": fmax,
"window_length": s.window_length,
"hop_length": s.hop_length,
"match_stride": s.match_stride,
"window_type": s.window_type,
}
x_mels = self.mel_spectrogram(x, **kwargs)
y_mels = self.mel_spectrogram(y, **kwargs)
x_logmels = torch.log(
x_mels.clamp(min=self.clamp_eps).pow(self.pow)
) / torch.log(torch.tensor(10.0))
y_logmels = torch.log(
y_mels.clamp(min=self.clamp_eps).pow(self.pow)
) / torch.log(torch.tensor(10.0))
loss += self.log_weight * self.loss_fn(x_logmels, y_logmels)
loss += self.mag_weight * self.loss_fn(x_mels, y_mels)  # Raw-magnitude term (weighted 0.0 by default)
return loss
# Loss functions
def feature_loss(
fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]]
) -> torch.Tensor:
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
loss += torch.mean(torch.abs(rl - gl))
return loss * 2 # This equates to lambda=2.0 for the feature matching loss
def discriminator_loss(
disc_real_outputs: List[torch.Tensor], disc_generated_outputs: List[torch.Tensor]
) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
r_loss = torch.mean((1 - dr) ** 2)
g_loss = torch.mean(dg**2)
loss += r_loss + g_loss
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())
return loss, r_losses, g_losses
def generator_loss(
disc_outputs: List[torch.Tensor],
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
loss = 0
gen_losses = []
for dg in disc_outputs:
l = torch.mean((1 - dg) ** 2)
gen_losses.append(l)
loss += l
return loss, gen_losses
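# A minimal usage sketch (not part of the original file): MultiScaleMelSpectrogramLoss
# takes raw waveforms of shape [B, 1, T] for both the generated and the reference
# signal. The 22050 Hz sampling rate and random tensors below are illustrative
# assumptions, not values from a released BigVGAN config.
if __name__ == "__main__":
    reference_wav = torch.randn(2, 1, 22050).clamp(-1.0, 1.0)  # Fake reference batch
    generated_wav = torch.randn(2, 1, 22050).clamp(-1.0, 1.0)  # Fake generated batch
    mel_loss_fn = MultiScaleMelSpectrogramLoss(sampling_rate=22050)
    print(mel_loss_fn(generated_wav, reference_wav))  # Scalar multi-scale log-mel L1 loss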
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import math
import os
import random
import torch
import torch.utils.data
import numpy as np
import librosa
from librosa.filters import mel as librosa_mel_fn
import pathlib
from tqdm import tqdm
from typing import List, Tuple, Optional
from env import AttrDict
MAX_WAV_VALUE = 32767.0  # NOTE: 32768.0 - 1 to prevent int16 overflow (which results in popping sounds in corner cases)
def dynamic_range_compression(x, C=1, clip_val=1e-5):
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
def dynamic_range_decompression(x, C=1):
return np.exp(x) / C
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression_torch(x, C=1):
return torch.exp(x) / C
def spectral_normalize_torch(magnitudes):
return dynamic_range_compression_torch(magnitudes)
def spectral_de_normalize_torch(magnitudes):
return dynamic_range_decompression_torch(magnitudes)
mel_basis_cache = {}
hann_window_cache = {}
def mel_spectrogram(
y: torch.Tensor,
n_fft: int,
num_mels: int,
sampling_rate: int,
hop_size: int,
win_size: int,
fmin: int,
fmax: int = None,
center: bool = False,
) -> torch.Tensor:
"""
Calculate the mel spectrogram of an input signal.
This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft).
Args:
y (torch.Tensor): Input signal.
n_fft (int): FFT size.
num_mels (int): Number of mel bins.
sampling_rate (int): Sampling rate of the input signal.
hop_size (int): Hop size for STFT.
win_size (int): Window size for STFT.
fmin (int): Minimum frequency for mel filterbank.
fmax (int): Maximum frequency for mel filterbank. If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn
center (bool): Whether to pad the input to center the frames. Default is False.
Returns:
torch.Tensor: Mel spectrogram.
"""
if torch.min(y) < -1.0:
print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}")
if torch.max(y) > 1.0:
print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}")
device = y.device
key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}"
if key not in mel_basis_cache:
mel = librosa_mel_fn(
sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
)
mel_basis_cache[key] = torch.from_numpy(mel).float().to(device)
hann_window_cache[key] = torch.hann_window(win_size).to(device)
mel_basis = mel_basis_cache[key]
hann_window = hann_window_cache[key]
padding = (n_fft - hop_size) // 2
y = torch.nn.functional.pad(
y.unsqueeze(1), (padding, padding), mode="reflect"
).squeeze(1)
spec = torch.stft(
y,
n_fft,
hop_length=hop_size,
win_length=win_size,
window=hann_window,
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=True,
)
spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
mel_spec = torch.matmul(mel_basis, spec)
mel_spec = spectral_normalize_torch(mel_spec)
return mel_spec
def get_mel_spectrogram(wav, h):
"""
Generate mel spectrogram from a waveform using given hyperparameters.
Args:
wav (torch.Tensor): Input waveform.
h: Hyperparameters object with attributes n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax.
Returns:
torch.Tensor: Mel spectrogram.
"""
return mel_spectrogram(
wav,
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax,
)
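# A minimal sketch (not part of the original file) showing how the helpers above are
# typically used. The 22050 Hz / 80-band / 1024-FFT settings are illustrative
# assumptions, not values taken from a specific BigVGAN config.
def example_mel_from_wav(wav_path: str) -> torch.Tensor:
    wav, _ = librosa.load(wav_path, sr=22050, mono=True)  # Float waveform in [-1, 1]
    wav = torch.FloatTensor(wav).unsqueeze(0)  # [1, T_time]
    return mel_spectrogram(
        wav,
        n_fft=1024,
        num_mels=80,
        sampling_rate=22050,
        hop_size=256,
        win_size=1024,
        fmin=0,
        fmax=8000,
    )  # [1, 80, T_frames]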
def get_dataset_filelist(a):
training_files = []
validation_files = []
list_unseen_validation_files = []
with open(a.input_training_file, "r", encoding="utf-8") as fi:
training_files = [
os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
for x in fi.read().split("\n")
if len(x) > 0
]
print(f"first training file: {training_files[0]}")
with open(a.input_validation_file, "r", encoding="utf-8") as fi:
validation_files = [
os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
for x in fi.read().split("\n")
if len(x) > 0
]
print(f"first validation file: {validation_files[0]}")
for i in range(len(a.list_input_unseen_validation_file)):
with open(a.list_input_unseen_validation_file[i], "r", encoding="utf-8") as fi:
unseen_validation_files = [
os.path.join(a.list_input_unseen_wavs_dir[i], x.split("|")[0] + ".wav")
for x in fi.read().split("\n")
if len(x) > 0
]
print(
f"first unseen {i}th validation fileset: {unseen_validation_files[0]}"
)
list_unseen_validation_files.append(unseen_validation_files)
return training_files, validation_files, list_unseen_validation_files
class MelDataset(torch.utils.data.Dataset):
def __init__(
self,
training_files: List[str],
hparams: AttrDict,
segment_size: int,
n_fft: int,
num_mels: int,
hop_size: int,
win_size: int,
sampling_rate: int,
fmin: int,
fmax: Optional[int],
split: bool = True,
shuffle: bool = True,
device: str = None,
fmax_loss: Optional[int] = None,
fine_tuning: bool = False,
base_mels_path: str = None,
is_seen: bool = True,
):
self.audio_files = training_files
random.seed(1234)
if shuffle:
random.shuffle(self.audio_files)
self.hparams = hparams
self.is_seen = is_seen
if self.is_seen:
self.name = pathlib.Path(self.audio_files[0]).parts[0]
else:
self.name = "-".join(pathlib.Path(self.audio_files[0]).parts[:2]).strip("/")
self.segment_size = segment_size
self.sampling_rate = sampling_rate
self.split = split
self.n_fft = n_fft
self.num_mels = num_mels
self.hop_size = hop_size
self.win_size = win_size
self.fmin = fmin
self.fmax = fmax
self.fmax_loss = fmax_loss
self.device = device
self.fine_tuning = fine_tuning
self.base_mels_path = base_mels_path
print("[INFO] checking dataset integrity...")
for i in tqdm(range(len(self.audio_files))):
assert os.path.exists(
self.audio_files[i]
), f"{self.audio_files[i]} not found"
def __getitem__(
self, index: int
) -> Tuple[torch.Tensor, torch.Tensor, str, torch.Tensor]:
try:
filename = self.audio_files[index]
# Use librosa.load that ensures loading waveform into mono with [-1, 1] float values
# Audio is ndarray with shape [T_time]. Disable auto-resampling here to minimize overhead
# The on-the-fly resampling during training will be done only for the obtained random chunk
audio, source_sampling_rate = librosa.load(filename, sr=None, mono=True)
# Main logic that uses <mel, audio> pair for training BigVGAN
if not self.fine_tuning:
if self.split: # Training step
# Obtain randomized audio chunk
if source_sampling_rate != self.sampling_rate:
# Adjust segment size to crop if the source sr is different
target_segment_size = math.ceil(
self.segment_size
* (source_sampling_rate / self.sampling_rate)
)
else:
target_segment_size = self.segment_size
# Compute upper bound index for the random chunk
random_chunk_upper_bound = max(
0, audio.shape[0] - target_segment_size
)
# Crop or pad audio to obtain random chunk with target_segment_size
if audio.shape[0] >= target_segment_size:
audio_start = random.randint(0, random_chunk_upper_bound)
audio = audio[audio_start : audio_start + target_segment_size]
else:
audio = np.pad(
audio,
(0, target_segment_size - audio.shape[0]),
mode="constant",
)
# Resample audio chunk to self.sampling_rate
if source_sampling_rate != self.sampling_rate:
audio = librosa.resample(
audio,
orig_sr=source_sampling_rate,
target_sr=self.sampling_rate,
)
if audio.shape[0] > self.segment_size:
# trim last elements to match self.segment_size (e.g., 16385 for 44khz downsampled to 24khz -> 16384)
audio = audio[: self.segment_size]
else: # Validation step
# Resample full audio clip to target sampling rate
if source_sampling_rate != self.sampling_rate:
audio = librosa.resample(
audio,
orig_sr=source_sampling_rate,
target_sr=self.sampling_rate,
)
# Trim last elements to match audio length to self.hop_size * n for evaluation
if (audio.shape[0] % self.hop_size) != 0:
audio = audio[: -(audio.shape[0] % self.hop_size)]
# BigVGAN is trained using volume-normalized waveform
audio = librosa.util.normalize(audio) * 0.95
# Cast ndarray to torch tensor
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0) # [B(1), self.segment_size]
# Compute mel spectrogram corresponding to audio
mel = mel_spectrogram(
audio,
self.n_fft,
self.num_mels,
self.sampling_rate,
self.hop_size,
self.win_size,
self.fmin,
self.fmax,
center=False,
) # [B(1), self.num_mels, self.segment_size // self.hop_size]
# Fine-tuning logic that uses pre-computed mel. Example: Using TTS model-generated mel as input
else:
# For fine-tuning, assert that the waveform is in the defined sampling_rate
# To be fool-proof, fine-tuning does not support on-the-fly resampling (the dataset should have been prepared properly)
assert (
source_sampling_rate == self.sampling_rate
), f"For fine_tuning, waveform must be in the spcified sampling rate {self.sampling_rate}, got {source_sampling_rate}"
# Cast ndarray to torch tensor
audio = torch.FloatTensor(audio)
audio = audio.unsqueeze(0) # [B(1), T_time]
# Load pre-computed mel from disk
mel = np.load(
os.path.join(
self.base_mels_path,
os.path.splitext(os.path.split(filename)[-1])[0] + ".npy",
)
)
mel = torch.from_numpy(mel)
if len(mel.shape) < 3:
mel = mel.unsqueeze(0) # ensure [B, C, T]
if self.split:
frames_per_seg = math.ceil(self.segment_size / self.hop_size)
if audio.size(1) >= self.segment_size:
mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
mel = mel[:, :, mel_start : mel_start + frames_per_seg]
audio = audio[
:,
mel_start
* self.hop_size : (mel_start + frames_per_seg)
* self.hop_size,
]
# Pad pre-computed mel and audio to matching lengths to ensure fine-tuning runs without error.
# NOTE: this may introduce a single-frame misalignment of the <pre-computed mel, audio> pair.
# To avoid possible misalignment, it is recommended to prepare the <pre-computed mel, audio> pair where the audio length is an integer multiple of self.hop_size
mel = torch.nn.functional.pad(
mel, (0, frames_per_seg - mel.size(2)), "constant"
)
audio = torch.nn.functional.pad(
audio, (0, self.segment_size - audio.size(1)), "constant"
)
# Compute mel_loss used by spectral regression objective. Uses self.fmax_loss instead (usually None)
mel_loss = mel_spectrogram(
audio,
self.n_fft,
self.num_mels,
self.sampling_rate,
self.hop_size,
self.win_size,
self.fmin,
self.fmax_loss,
center=False,
) # [B(1), self.num_mels, self.segment_size // self.hop_size]
# Shape sanity checks
assert (
audio.shape[1] == mel.shape[2] * self.hop_size
and audio.shape[1] == mel_loss.shape[2] * self.hop_size
), f"Audio length must be mel frame length * hop_size. Got audio shape {audio.shape} mel shape {mel.shape} mel_loss shape {mel_loss.shape}"
return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
# If an error is encountered while loading the data, skip this sample and load another random sample into the batch
except Exception as e:
if self.fine_tuning:
raise e # Terminate training if it is fine-tuning. The dataset should have been prepared properly.
else:
print(
f"[WARNING] Failed to load waveform, skipping! filename: {filename} Error: {e}"
)
return self[random.randrange(len(self))]
def __len__(self):
return len(self.audio_files)
| Field | Response |
| :--------------------------------------------------------------------------------------------------------- | :--------------------------------------------------- |
| Participation considerations from adversely impacted groups and protected classes in model design and testing: | None |
| Measures taken to mitigate against unwanted bias: | No measures taken to mitigate against unwanted bias. |
| Field | Response |
| :---------------------------------------------------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Intended Application & Domain: | Generating waveform from mel spectrogram. |
| Model Type: | Convolutional Neural Network (CNN) |
| Intended Users: | This model is intended for developers to synthesize and generate waveforms from the AI-generated mel spectrograms. |
| Output: | Audio Waveform |
| Describe how the model works: | Model generates audio waveform corresponding to the input mel spectrogram. |
| Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable |
| Technical Limitations: | This may not perform well on synthetically-generated mel spectrograms that deviate significantly from the profile of mel spectrograms on which this was trained. |
| Verified to have met prescribed NVIDIA quality standards: | Yes |
| Performance Metrics: | Perceptual Evaluation of Speech Quality (PESQ), Virtual Speech Quality Objective Listener (VISQOL), Multi-resolution STFT (MRSTFT), Mel cepstral distortion (MCD), Periodicity RMSE, Voiced/Unvoiced F1 Score (V/UV F1) |
| Potential Known Risks: | This model may generate low-quality or distorted soundwaves. |
| Licensing: | https://github.com/NVIDIA/BigVGAN/blob/main/LICENSE |
# Model Overview
## Description:
BigVGAN is a generative AI model specialized in synthesizing audio waveforms from mel spectrogram inputs.
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>
BigVGAN is a fully convolutional architecture with several upsampling blocks using transposed convolution followed by multiple residual dilated convolution layers.
BigVGAN includes a novel module, called anti-aliased multi-periodicity composition (AMP), which is specifically designed for generating waveforms. AMP specializes in synthesizing high-frequency and periodic soundwaves, drawing inspiration from audio signal processing principles.
It applies a periodic activation function, called Snake, which provides an inductive bias toward generating periodic soundwaves. It also applies anti-aliasing filters to reduce undesired artifacts in the generated waveforms. <br>
This model is ready for commercial use.<br>
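For intuition, the Snake activation used inside AMP can be written as `x + (1/α)·sin²(αx)` with a learnable per-channel frequency `α`. The sketch below is a simplified, hypothetical illustration of that formula only; the repository's actual implementation additionally wraps the activation with anti-aliased up/downsampling filters.

```python
import torch
import torch.nn as nn

class SnakeSketch(nn.Module):
    """Simplified Snake activation: x + (1/alpha) * sin^2(alpha * x)."""

    def __init__(self, channels: int):
        super().__init__()
        # Learnable per-channel frequency, initialized to 1
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [batch, channels, time]
        return x + (1.0 / (self.alpha + 1e-9)) * torch.sin(self.alpha * x) ** 2
```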
## Reference(s):
- [BigVGAN: A Universal Neural Vocoder with Large-Scale Training](https://arxiv.org/abs/2206.04658) <br>
- [Project Page](https://research.nvidia.com/labs/adlr/projects/bigvgan/) <br>
- [Audio Demo](https://bigvgan-demo.github.io/) <br>
## Model Architecture:
**Architecture Type:** Convolutional Neural Network (CNN) <br>
**Network Architecture:** Details of the model are available at https://github.com/NVIDIA/BigVGAN, and the related paper can be found at https://arxiv.org/abs/2206.04658 <br>
**Model Version:** 2.0 <br>
## Input:
**Input Type:** Audio <br>
**Input Format:** Mel Spectrogram <br>
**Input Parameters:** None <br>
**Other Properties Related to Input:** The input mel spectrogram has shape `[batch, channels, frames]`, where `channels` refers to the number of mel bands defined by the model and `frames` refers to the temporal length. The model supports arbitrarily long `frames` that fit into GPU memory.
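As an illustration, a batched input mel spectrogram can be prepared with the `get_mel_spectrogram` helper from `meldataset.py`; the config path below is a placeholder, and the hyperparameters come from whatever `config.json` ships with the checkpoint.

```python
import json
import torch
from env import AttrDict
from meldataset import get_mel_spectrogram

# Load the hyperparameters shipped next to the checkpoint (path is an assumption)
with open("exp/bigvgan/config.json") as f:
    h = AttrDict(json.loads(f.read()))

wav = torch.randn(1, h.sampling_rate).clamp(-1.0, 1.0)  # [batch, time], float in [-1, 1]
mel = get_mel_spectrogram(wav, h)  # [batch, h.num_mels, frames]
```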
## Output:
**Output Type:** Audio <br>
**Output Format:** Audio Waveform <br>
**Output Parameters:** None <br>
**Other Properties Related to Output:** The output audio waveform has shape `[batch, 1, time]`, where `1` refers to the mono audio channel and `time` refers to the temporal length. `time` is a fixed integer multiple of the input `frames`, determined by the model's upsampling ratio (`time = upsampling ratio * frames`). The output audio waveform consists of float values in the range `[-1, 1]`.
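Continuing the sketch above (with `generator` a `BigVGAN` instance loaded as in the inference script, and `mel`/`h` taken from the previous snippet), the float output is typically scaled to 16-bit PCM before writing a WAV file:

```python
import torch
from scipy.io.wavfile import write
from meldataset import MAX_WAV_VALUE

with torch.no_grad():
    audio = generator(mel)  # [batch, 1, time], float values in [-1, 1]
audio = audio.squeeze() * MAX_WAV_VALUE  # Scale to the int16 range
write("output_generated.wav", h.sampling_rate, audio.cpu().numpy().astype("int16"))
```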
## Software Integration:
**Runtime Engine(s):** PyTorch
**Supported Hardware Microarchitecture Compatibility:** NVIDIA Ampere, NVIDIA Hopper, NVIDIA Lovelace, NVIDIA Turing, NVIDIA Volta <br>
## Preferred/Supported Operating System(s):
Linux
## Model Version(s):
v2.0
## Training, Testing, and Evaluation Datasets:
### Training Dataset:
The dataset contains diverse audio types, including speech in multiple languages, environmental sounds, and instruments.
**Links:**
- [AAM: Artificial Audio Multitracks Dataset](https://zenodo.org/records/5794629)
- [AudioCaps](https://audiocaps.github.io/)
- [AudioSet](https://research.google.com/audioset/index.html)
- [common-accent](https://huggingface.co/datasets/DTU54DL/common-accent)
- [Crowd Sourced Emotional Multimodal Actors Dataset (CREMA-D)](https://ieeexplore.ieee.org/document/6849440)
- [DCASE2017 Challenge, Task 4: Large-scale weakly supervised sound event detection for smart cars](https://dcase.community/challenge2017/task-large-scale-sound-event-detection)
- [FSDnoisy18k](https://zenodo.org/records/2529934)
- [Free Universal Sound Separation Dataset](https://zenodo.org/records/3694384)
- [Greatest Hits dataset](https://andrewowens.com/vis/)
- [GTZAN](https://ieeexplore.ieee.org/document/1021072)
- [JL corpus](https://www.kaggle.com/datasets/tli725/jl-corpus)
- [Medley-solos-DB: a cross-collection dataset for musical instrument recognition](https://zenodo.org/records/3464194)
- [MUSAN: A Music, Speech, and Noise Corpus](https://www.openslr.org/17/)
- [MusicBench](https://huggingface.co/datasets/amaai-lab/MusicBench)
- [MusicCaps](https://www.kaggle.com/datasets/googleai/musiccaps)
- [MusicNet](https://www.kaggle.com/datasets/imsparsh/musicnet-dataset)
- [NSynth](https://magenta.tensorflow.org/datasets/nsynth)
- [OnAir-Music-Dataset](https://github.com/sevagh/OnAir-Music-Dataset)
- [Audio Piano Triads Dataset](https://zenodo.org/records/4740877)
- [Pitch Audio Dataset (Surge synthesizer)](https://zenodo.org/records/4677097)
- [SONYC Urban Sound Tagging (SONYC-UST): a multilabel dataset from an urban acoustic sensor network](https://zenodo.org/records/3966543)
- [VocalSound: A Dataset for Improving Human Vocal Sounds Recognition](https://arxiv.org/abs/2205.03433)
- [WavText5K](https://github.com/microsoft/WavText5K)
- [CSS10: A Collection of Single Speaker Speech Datasets for 10 Languages](https://github.com/Kyubyong/css10)
- [Hi-Fi Multi-Speaker English TTS Dataset (Hi-Fi TTS)](https://www.openslr.org/109/)
- [IIIT-H Indic Speech Databases](http://festvox.org/databases/iiit_voices/)
- [Libri-Light: A Benchmark for ASR with Limited or No Supervision](https://arxiv.org/abs/1912.07875)
- [LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech](https://www.openslr.org/60)
- [LibriTTS-R: A Restored Multi-Speaker Text-to-Speech Corpus](https://www.openslr.org/141/)
- [The SIWIS French Speech Synthesis Database](https://datashare.ed.ac.uk/handle/10283/2353)
- [Crowdsourced high-quality Colombian Spanish speech data set](https://openslr.org/72/)
- [TTS-Portuguese Corpus](https://github.com/Edresson/TTS-Portuguese-Corpus)
- [CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit](https://datashare.ed.ac.uk/handle/10283/3443)
**Data Collection Method by dataset** <br>
- Human <br>
**Labeling Method by dataset (for those with labels)** <br>
- Hybrid: Automated, Human, Unknown <br>
### Evaluation Dataset:
Properties: The audio generation quality of BigVGAN is evaluated using the `dev` splits of the [LibriTTS dataset](https://www.openslr.org/60/) and the [Hi-Fi TTS dataset](https://www.openslr.org/109/). The datasets include English speech with an equal balance of genders.
**Data Collection Method by dataset** <br>
- Human <br>
**Labeling Method by dataset** <br>
- Automated <br>
## Inference:
**Engine:** PyTorch <br>
**Test Hardware:** NVIDIA A100 GPU <br>
## Ethical Considerations:
NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. For more detailed information on ethical considerations for this model, please see the Model Card++ Explainability, Bias, Safety & Security, and Privacy Subcards. Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
| Field | Response |
| :------------------------------------------------------------------------------------------------------------------------------------- | :--------------------------------------------- |
| Generatable or reverse engineerable personal information? | None |
| Protected class data used to create this model? | None |
| Was consent obtained for any personal data used? | Not Applicable (No Personal Data) |
| How often is dataset reviewed? | Before Release |
| Is a mechanism in place to honor data subject right of access or deletion of personal data? | Not Applicable |
| If personal data was collected for the development of the model, was it collected directly by NVIDIA? | Not Applicable |
| If personal data was collected for the development of the model by NVIDIA, do you maintain or have access to disclosures made to data subjects? | Not Applicable |
| If personal data was collected for the development of this AI model, was it minimized to only what was required? | Not Applicable |
| Is data in dataset traceable? | Yes |
| Is there provenance for all datasets used in training? | Yes |
| Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
| Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
| Field | Response |
| :---------------------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Model Application(s): | Synthetic Audio Generation |
| Describe the life critical impact (if present). | Not Applicable |
| Use Case Restrictions: | None |
| Model and dataset restrictions: | The Principle of Least Privilege (PoLP) is applied, limiting access for dataset generation and model development. Access to the dataset is restricted during training, and dataset license constraints are adhered to. |
torch
numpy
librosa>=0.8.1
scipy
tensorboard
soundfile
matplotlib
pesq
auraloss
tqdm
nnAudio
ninja
huggingface_hub>=0.23.4
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
import itertools
import os
import time
import argparse
import json
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DistributedSampler, DataLoader
import torch.multiprocessing as mp
from torch.distributed import init_process_group
from torch.nn.parallel import DistributedDataParallel
from env import AttrDict, build_env
from meldataset import MelDataset, mel_spectrogram, get_dataset_filelist, MAX_WAV_VALUE
from bigvgan import BigVGAN
from discriminators import (
MultiPeriodDiscriminator,
MultiResolutionDiscriminator,
MultiBandDiscriminator,
MultiScaleSubbandCQTDiscriminator,
)
from loss import (
feature_loss,
generator_loss,
discriminator_loss,
MultiScaleMelSpectrogramLoss,
)
from utils import (
plot_spectrogram,
plot_spectrogram_clipped,
scan_checkpoint,
load_checkpoint,
save_checkpoint,
save_audio,
)
import torchaudio as ta
from pesq import pesq
from tqdm import tqdm
import auraloss
torch.backends.cudnn.benchmark = False
def train(rank, a, h):
if h.num_gpus > 1:
# initialize distributed
init_process_group(
backend=h.dist_config["dist_backend"],
init_method=h.dist_config["dist_url"],
world_size=h.dist_config["world_size"] * h.num_gpus,
rank=rank,
)
# Set seed and device
torch.cuda.manual_seed(h.seed)
torch.cuda.set_device(rank)
device = torch.device(f"cuda:{rank:d}")
# Define BigVGAN generator
generator = BigVGAN(h).to(device)
# Define discriminators. MPD is used by default
mpd = MultiPeriodDiscriminator(h).to(device)
# Define additional discriminators. BigVGAN-v1 uses UnivNet's MRD as default
# New in BigVGAN-v2: option to switch to new discriminators: MultiBandDiscriminator / MultiScaleSubbandCQTDiscriminator
if h.get("use_mbd_instead_of_mrd", False): # Switch to MBD
print(
"[INFO] using MultiBandDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator"
)
# Variable name is kept as "mrd" for backward compatibility & minimal code change
mrd = MultiBandDiscriminator(h).to(device)
elif h.get("use_cqtd_instead_of_mrd", False): # Switch to CQTD
print(
"[INFO] using MultiScaleSubbandCQTDiscriminator of BigVGAN-v2 instead of MultiResolutionDiscriminator"
)
mrd = MultiScaleSubbandCQTDiscriminator(h).to(device)
else: # Fallback to original MRD in BigVGAN-v1
mrd = MultiResolutionDiscriminator(h).to(device)
# New in BigVGAN-v2: option to switch to multi-scale L1 mel loss
if h.get("use_multiscale_melloss", False):
print(
"[INFO] using multi-scale Mel l1 loss of BigVGAN-v2 instead of the original single-scale loss"
)
fn_mel_loss_multiscale = MultiScaleMelSpectrogramLoss(
sampling_rate=h.sampling_rate
) # NOTE: accepts waveform as input
else:
fn_mel_loss_singlescale = F.l1_loss
# Print the model & number of parameters, and create or scan the latest checkpoint from checkpoints directory
if rank == 0:
print(generator)
print(mpd)
print(mrd)
print(f"Generator params: {sum(p.numel() for p in generator.parameters())}")
print(f"Discriminator mpd params: {sum(p.numel() for p in mpd.parameters())}")
print(f"Discriminator mrd params: {sum(p.numel() for p in mrd.parameters())}")
os.makedirs(a.checkpoint_path, exist_ok=True)
print(f"Checkpoints directory: {a.checkpoint_path}")
if os.path.isdir(a.checkpoint_path):
# New in v2.1: If the step prefix pattern-based checkpoints are not found, also check for renamed files in Hugging Face Hub to resume training
cp_g = scan_checkpoint(
a.checkpoint_path, prefix="g_", renamed_file="bigvgan_generator.pt"
)
cp_do = scan_checkpoint(
a.checkpoint_path,
prefix="do_",
renamed_file="bigvgan_discriminator_optimizer.pt",
)
# Load the latest checkpoint if exists
steps = 0
if cp_g is None or cp_do is None:
state_dict_do = None
last_epoch = -1
else:
state_dict_g = load_checkpoint(cp_g, device)
state_dict_do = load_checkpoint(cp_do, device)
generator.load_state_dict(state_dict_g["generator"])
mpd.load_state_dict(state_dict_do["mpd"])
mrd.load_state_dict(state_dict_do["mrd"])
steps = state_dict_do["steps"] + 1
last_epoch = state_dict_do["epoch"]
# Initialize DDP, optimizers, and schedulers
if h.num_gpus > 1:
generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
mrd = DistributedDataParallel(mrd, device_ids=[rank]).to(device)
optim_g = torch.optim.AdamW(
generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2]
)
optim_d = torch.optim.AdamW(
itertools.chain(mrd.parameters(), mpd.parameters()),
h.learning_rate,
betas=[h.adam_b1, h.adam_b2],
)
if state_dict_do is not None:
optim_g.load_state_dict(state_dict_do["optim_g"])
optim_d.load_state_dict(state_dict_do["optim_d"])
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=h.lr_decay, last_epoch=last_epoch
)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
optim_d, gamma=h.lr_decay, last_epoch=last_epoch
)
# Define training and validation datasets
"""
unseen_validation_filelist will contain sample filepaths outside the seen training & validation dataset
Example: trained on LibriTTS, validate on VCTK
"""
training_filelist, validation_filelist, list_unseen_validation_filelist = (
get_dataset_filelist(a)
)
trainset = MelDataset(
training_filelist,
h,
h.segment_size,
h.n_fft,
h.num_mels,
h.hop_size,
h.win_size,
h.sampling_rate,
h.fmin,
h.fmax,
shuffle=False if h.num_gpus > 1 else True,
fmax_loss=h.fmax_for_loss,
device=device,
fine_tuning=a.fine_tuning,
base_mels_path=a.input_mels_dir,
is_seen=True,
)
train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None
train_loader = DataLoader(
trainset,
num_workers=h.num_workers,
shuffle=False,
sampler=train_sampler,
batch_size=h.batch_size,
pin_memory=True,
drop_last=True,
)
if rank == 0:
validset = MelDataset(
validation_filelist,
h,
h.segment_size,
h.n_fft,
h.num_mels,
h.hop_size,
h.win_size,
h.sampling_rate,
h.fmin,
h.fmax,
False,
False,
fmax_loss=h.fmax_for_loss,
device=device,
fine_tuning=a.fine_tuning,
base_mels_path=a.input_mels_dir,
is_seen=True,
)
validation_loader = DataLoader(
validset,
num_workers=1,
shuffle=False,
sampler=None,
batch_size=1,
pin_memory=True,
drop_last=True,
)
list_unseen_validset = []
list_unseen_validation_loader = []
for i in range(len(list_unseen_validation_filelist)):
unseen_validset = MelDataset(
list_unseen_validation_filelist[i],
h,
h.segment_size,
h.n_fft,
h.num_mels,
h.hop_size,
h.win_size,
h.sampling_rate,
h.fmin,
h.fmax,
False,
False,
fmax_loss=h.fmax_for_loss,
device=device,
fine_tuning=a.fine_tuning,
base_mels_path=a.input_mels_dir,
is_seen=False,
)
unseen_validation_loader = DataLoader(
unseen_validset,
num_workers=1,
shuffle=False,
sampler=None,
batch_size=1,
pin_memory=True,
drop_last=True,
)
list_unseen_validset.append(unseen_validset)
list_unseen_validation_loader.append(unseen_validation_loader)
# Tensorboard logger
sw = SummaryWriter(os.path.join(a.checkpoint_path, "logs"))
if a.save_audio: # Also save audio to disk if --save_audio is set to True
os.makedirs(os.path.join(a.checkpoint_path, "samples"), exist_ok=True)
"""
Validation loop, "mode" parameter is automatically defined as (seen or unseen)_(name of the dataset).
If the name of the dataset contains "nonspeech", it skips PESQ calculation to prevent errors
"""
def validate(rank, a, h, loader, mode="seen"):
assert rank == 0, "validate should only run on rank=0"
generator.eval()
torch.cuda.empty_cache()
val_err_tot = 0
val_pesq_tot = 0
val_mrstft_tot = 0
# Modules for evaluation metrics
pesq_resampler = ta.transforms.Resample(h.sampling_rate, 16000).cuda()
loss_mrstft = auraloss.freq.MultiResolutionSTFTLoss(device="cuda")
if a.save_audio: # Also save audio to disk if --save_audio is set to True
os.makedirs(
os.path.join(a.checkpoint_path, "samples", f"gt_{mode}"),
exist_ok=True,
)
os.makedirs(
os.path.join(a.checkpoint_path, "samples", f"{mode}_{steps:08d}"),
exist_ok=True,
)
with torch.no_grad():
print(f"step {steps} {mode} speaker validation...")
# Loop over validation set and compute metrics
for j, batch in enumerate(tqdm(loader)):
x, y, _, y_mel = batch
y = y.to(device)
if hasattr(generator, "module"):
y_g_hat = generator.module(x.to(device))
else:
y_g_hat = generator(x.to(device))
y_mel = y_mel.to(device, non_blocking=True)
y_g_hat_mel = mel_spectrogram(
y_g_hat.squeeze(1),
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax_for_loss,
)
min_t = min(y_mel.size(-1), y_g_hat_mel.size(-1))
val_err_tot += F.l1_loss(y_mel[...,:min_t], y_g_hat_mel[...,:min_t]).item()
# PESQ calculation. only evaluate PESQ if it's speech signal (nonspeech PESQ will error out)
if (
not "nonspeech" in mode
): # Skips if the name of dataset (in mode string) contains "nonspeech"
# Resample to 16000 for pesq
y_16k = pesq_resampler(y)
y_g_hat_16k = pesq_resampler(y_g_hat.squeeze(1))
y_int_16k = (y_16k[0] * MAX_WAV_VALUE).short().cpu().numpy()
y_g_hat_int_16k = (
(y_g_hat_16k[0] * MAX_WAV_VALUE).short().cpu().numpy()
)
val_pesq_tot += pesq(16000, y_int_16k, y_g_hat_int_16k, "wb")
# MRSTFT calculation
min_t = min(y.size(-1), y_g_hat.size(-1))
val_mrstft_tot += loss_mrstft(y_g_hat[...,:min_t], y[...,:min_t]).item()
# Log audio and figures to Tensorboard
if j % a.eval_subsample == 0: # Subsample every nth from validation set
if steps >= 0:
sw.add_audio(f"gt_{mode}/y_{j}", y[0], steps, h.sampling_rate)
if (
a.save_audio
): # Also save audio to disk if --save_audio is set to True
save_audio(
y[0],
os.path.join(
a.checkpoint_path,
"samples",
f"gt_{mode}",
f"{j:04d}.wav",
),
h.sampling_rate,
)
sw.add_figure(
f"gt_{mode}/y_spec_{j}",
plot_spectrogram(x[0]),
steps,
)
sw.add_audio(
f"generated_{mode}/y_hat_{j}",
y_g_hat[0],
steps,
h.sampling_rate,
)
if (
a.save_audio
): # Also save audio to disk if --save_audio is set to True
save_audio(
y_g_hat[0, 0],
os.path.join(
a.checkpoint_path,
"samples",
f"{mode}_{steps:08d}",
f"{j:04d}.wav",
),
h.sampling_rate,
)
# Spectrogram of synthesized audio
y_hat_spec = mel_spectrogram(
y_g_hat.squeeze(1),
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax,
)
sw.add_figure(
f"generated_{mode}/y_hat_spec_{j}",
plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()),
steps,
)
"""
Visualization of spectrogram difference between GT and synthesized audio, difference higher than 1 is clipped for better visualization.
"""
spec_delta = torch.clamp(
torch.abs(x[0] - y_hat_spec.squeeze(0).cpu()),
min=1e-6,
max=1.0,
)
sw.add_figure(
f"delta_dclip1_{mode}/spec_{j}",
plot_spectrogram_clipped(spec_delta.numpy(), clip_max=1.0),
steps,
)
val_err = val_err_tot / (j + 1)
val_pesq = val_pesq_tot / (j + 1)
val_mrstft = val_mrstft_tot / (j + 1)
# Log evaluation metrics to Tensorboard
sw.add_scalar(f"validation_{mode}/mel_spec_error", val_err, steps)
sw.add_scalar(f"validation_{mode}/pesq", val_pesq, steps)
sw.add_scalar(f"validation_{mode}/mrstft", val_mrstft, steps)
generator.train()
# If the checkpoint is loaded, start with validation loop
if steps != 0 and rank == 0 and not a.debug:
if not a.skip_seen:
validate(
rank,
a,
h,
validation_loader,
mode=f"seen_{train_loader.dataset.name}",
)
for i in range(len(list_unseen_validation_loader)):
validate(
rank,
a,
h,
list_unseen_validation_loader[i],
mode=f"unseen_{list_unseen_validation_loader[i].dataset.name}",
)
# Exit the script if --evaluate is set to True
if a.evaluate:
exit()
# Main training loop
generator.train()
mpd.train()
mrd.train()
for epoch in range(max(0, last_epoch), a.training_epochs):
if rank == 0:
start = time.time()
print(f"Epoch: {epoch + 1}")
if h.num_gpus > 1:
train_sampler.set_epoch(epoch)
for i, batch in enumerate(train_loader):
if rank == 0:
start_b = time.time()
x, y, _, y_mel = batch
x = x.to(device, non_blocking=True)
y = y.to(device, non_blocking=True)
y_mel = y_mel.to(device, non_blocking=True)
y = y.unsqueeze(1)
y_g_hat = generator(x)
y_g_hat_mel = mel_spectrogram(
y_g_hat.squeeze(1),
h.n_fft,
h.num_mels,
h.sampling_rate,
h.hop_size,
h.win_size,
h.fmin,
h.fmax_for_loss,
)
optim_d.zero_grad()
# MPD
y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(
y_df_hat_r, y_df_hat_g
)
# MRD
y_ds_hat_r, y_ds_hat_g, _, _ = mrd(y, y_g_hat.detach())
loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(
y_ds_hat_r, y_ds_hat_g
)
loss_disc_all = loss_disc_s + loss_disc_f
# Set clip_grad_norm value
clip_grad_norm = h.get("clip_grad_norm", 1000.0) # Default to 1000
# Whether to freeze D for initial training steps
if steps >= a.freeze_step:
loss_disc_all.backward()
grad_norm_mpd = torch.nn.utils.clip_grad_norm_(
mpd.parameters(), clip_grad_norm
)
grad_norm_mrd = torch.nn.utils.clip_grad_norm_(
mrd.parameters(), clip_grad_norm
)
optim_d.step()
else:
print(
f"[WARNING] skipping D training for the first {a.freeze_step} steps"
)
grad_norm_mpd = 0.0
grad_norm_mrd = 0.0
# Generator
optim_g.zero_grad()
# L1 Mel-Spectrogram Loss
lambda_melloss = h.get(
"lambda_melloss", 45.0
) # Defaults to 45 in BigVGAN-v1 if not set
if h.get("use_multiscale_melloss", False): # uses wav <y, y_g_hat> for loss
loss_mel = fn_mel_loss_multiscale(y, y_g_hat) * lambda_melloss
else: # Uses mel <y_mel, y_g_hat_mel> for loss
loss_mel = fn_mel_loss_singlescale(y_mel, y_g_hat_mel) * lambda_melloss
# MPD loss
y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
# MRD loss
y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = mrd(y, y_g_hat)
loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
if steps >= a.freeze_step:
loss_gen_all = (
loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel
)
else:
print(
f"[WARNING] using regression loss only for G for the first {a.freeze_step} steps"
)
loss_gen_all = loss_mel
loss_gen_all.backward()
grad_norm_g = torch.nn.utils.clip_grad_norm_(
generator.parameters(), clip_grad_norm
)
optim_g.step()
if rank == 0:
# STDOUT logging
if steps % a.stdout_interval == 0:
mel_error = (
loss_mel.item() / lambda_melloss
) # Log training mel regression loss to stdout
print(
f"Steps: {steps:d}, "
f"Gen Loss Total: {loss_gen_all:4.3f}, "
f"Mel Error: {mel_error:4.3f}, "
f"s/b: {time.time() - start_b:4.3f} "
f"lr: {optim_g.param_groups[0]['lr']:4.7f} "
f"grad_norm_g: {grad_norm_g:4.3f}"
)
# Checkpointing
if steps % a.checkpoint_interval == 0 and steps != 0:
checkpoint_path = f"{a.checkpoint_path}/g_{steps:08d}"
save_checkpoint(
checkpoint_path,
{
"generator": (
generator.module if h.num_gpus > 1 else generator
).state_dict()
},
)
checkpoint_path = f"{a.checkpoint_path}/do_{steps:08d}"
save_checkpoint(
checkpoint_path,
{
"mpd": (mpd.module if h.num_gpus > 1 else mpd).state_dict(),
"mrd": (mrd.module if h.num_gpus > 1 else mrd).state_dict(),
"optim_g": optim_g.state_dict(),
"optim_d": optim_d.state_dict(),
"steps": steps,
"epoch": epoch,
},
)
# Tensorboard summary logging
if steps % a.summary_interval == 0:
mel_error = (
loss_mel.item() / lambda_melloss
) # Log training mel regression loss to tensorboard
sw.add_scalar("training/gen_loss_total", loss_gen_all.item(), steps)
sw.add_scalar("training/mel_spec_error", mel_error, steps)
sw.add_scalar("training/fm_loss_mpd", loss_fm_f.item(), steps)
sw.add_scalar("training/gen_loss_mpd", loss_gen_f.item(), steps)
sw.add_scalar("training/disc_loss_mpd", loss_disc_f.item(), steps)
sw.add_scalar("training/grad_norm_mpd", grad_norm_mpd, steps)
sw.add_scalar("training/fm_loss_mrd", loss_fm_s.item(), steps)
sw.add_scalar("training/gen_loss_mrd", loss_gen_s.item(), steps)
sw.add_scalar("training/disc_loss_mrd", loss_disc_s.item(), steps)
sw.add_scalar("training/grad_norm_mrd", grad_norm_mrd, steps)
sw.add_scalar("training/grad_norm_g", grad_norm_g, steps)
sw.add_scalar(
"training/learning_rate_d", scheduler_d.get_last_lr()[0], steps
)
sw.add_scalar(
"training/learning_rate_g", scheduler_g.get_last_lr()[0], steps
)
sw.add_scalar("training/epoch", epoch + 1, steps)
# Validation
if steps % a.validation_interval == 0:
# Plot training input x so far used
for i_x in range(x.shape[0]):
sw.add_figure(
f"training_input/x_{i_x}",
plot_spectrogram(x[i_x].cpu()),
steps,
)
sw.add_audio(
f"training_input/y_{i_x}",
y[i_x][0],
steps,
h.sampling_rate,
)
# Seen and unseen speakers validation loops
if not a.debug and steps != 0:
validate(
rank,
a,
h,
validation_loader,
mode=f"seen_{train_loader.dataset.name}",
)
for i in range(len(list_unseen_validation_loader)):
validate(
rank,
a,
h,
list_unseen_validation_loader[i],
mode=f"unseen_{list_unseen_validation_loader[i].dataset.name}",
)
steps += 1
# BigVGAN-v2 learning rate scheduler is changed from epoch-level to step-level
scheduler_g.step()
scheduler_d.step()
if rank == 0:
print(
f"Time taken for epoch {epoch + 1} is {int(time.time() - start)} sec\n"
)
def main():
print("Initializing Training Process..")
parser = argparse.ArgumentParser()
parser.add_argument("--group_name", default=None)
parser.add_argument("--input_wavs_dir", default="LibriTTS")
parser.add_argument("--input_mels_dir", default="ft_dataset")
parser.add_argument(
"--input_training_file", default="tests/LibriTTS/train-full.txt"
)
parser.add_argument(
"--input_validation_file", default="tests/LibriTTS/val-full.txt"
)
parser.add_argument(
"--list_input_unseen_wavs_dir",
nargs="+",
default=["tests/LibriTTS", "tests/LibriTTS"],
)
parser.add_argument(
"--list_input_unseen_validation_file",
nargs="+",
default=["tests/LibriTTS/dev-clean.txt", "tests/LibriTTS/dev-other.txt"],
)
parser.add_argument("--checkpoint_path", default="exp/bigvgan")
parser.add_argument("--config", default="")
parser.add_argument("--training_epochs", default=100000, type=int)
parser.add_argument("--stdout_interval", default=5, type=int)
parser.add_argument("--checkpoint_interval", default=50000, type=int)
parser.add_argument("--summary_interval", default=100, type=int)
parser.add_argument("--validation_interval", default=50000, type=int)
parser.add_argument(
"--freeze_step",
default=0,
type=int,
help="freeze D for the first specified steps. G only uses regression loss for these steps.",
)
parser.add_argument("--fine_tuning", default=False, type=bool)
parser.add_argument(
"--debug",
default=False,
type=bool,
help="debug mode. skips validation loop throughout training",
)
parser.add_argument(
"--evaluate",
default=False,
type=bool,
help="only run evaluation from checkpoint and exit",
)
parser.add_argument(
"--eval_subsample",
default=5,
type=int,
help="subsampling during evaluation loop",
)
parser.add_argument(
"--skip_seen",
default=False,
type=bool,
help="skip seen dataset. useful for test set inference",
)
parser.add_argument(
"--save_audio",
default=False,
type=bool,
help="save audio of test set inference to disk",
)
a = parser.parse_args()
with open(a.config) as f:
data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
build_env(a.config, "config.json", a.checkpoint_path)
torch.manual_seed(h.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(h.seed)
h.num_gpus = torch.cuda.device_count()
h.batch_size = int(h.batch_size / h.num_gpus)
print(f"Batch size per GPU: {h.batch_size}")
else:
pass
if h.num_gpus > 1:
mp.spawn(
train,
nprocs=h.num_gpus,
args=(
a,
h,
),
)
else:
train(0, a, h)
if __name__ == "__main__":
main()
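# Example single-GPU / single-node launch (hypothetical paths and config name;
# assuming this script is saved as train.py in the repository root):
#
#   python train.py \
#       --config configs/bigvgan_v2_24khz_100band_256x.json \
#       --input_wavs_dir LibriTTS \
#       --input_training_file tests/LibriTTS/train-full.txt \
#       --input_validation_file tests/LibriTTS/val-full.txt \
#       --checkpoint_path exp/bigvgan_v2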
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
matplotlib.use("Agg")
import matplotlib.pylab as plt
from meldataset import MAX_WAV_VALUE
from scipy.io.wavfile import write
def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def plot_spectrogram_clipped(spectrogram, clip_max=2.0):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(
spectrogram,
aspect="auto",
origin="lower",
interpolation="none",
vmin=1e-6,
vmax=clip_max,
)
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
weight_norm(m)
def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print(f"Loading '{filepath}'")
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def save_checkpoint(filepath, obj):
print(f"Saving checkpoint to {filepath}")
torch.save(obj, filepath)
print("Complete.")
def scan_checkpoint(cp_dir, prefix, renamed_file=None):
# Fallback to original scanning logic first
pattern = os.path.join(cp_dir, prefix + "????????")
cp_list = glob.glob(pattern)
if len(cp_list) > 0:
last_checkpoint_path = sorted(cp_list)[-1]
print(f"[INFO] Resuming from checkpoint: '{last_checkpoint_path}'")
return last_checkpoint_path
# If no pattern-based checkpoints are found, check for renamed file
if renamed_file:
renamed_path = os.path.join(cp_dir, renamed_file)
if os.path.isfile(renamed_path):
print(f"[INFO] Resuming from renamed checkpoint: '{renamed_file}'")
return renamed_path
return None
def save_audio(audio, path, sr):
# wav: torch with 1d shape
audio = audio * MAX_WAV_VALUE
audio = audio.cpu().numpy().astype("int16")
write(path, sr, audio)