Commit 0112b0f0 authored by chenzk

v1.0
# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
# 2022 Ximalaya Inc (Yuguang Yang)
# 2024 Alibaba Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
# NeMo(https://github.com/NVIDIA/NeMo)
from typing import Union
import math
import warnings
import torch
from torch.optim.lr_scheduler import _LRScheduler
class WarmupLR(_LRScheduler):
"""The WarmupLR scheduler
This scheduler is almost same as NoamLR Scheduler except for following
difference:
NoamLR:
lr = optimizer.lr * model_size ** -0.5
* min(step ** -0.5, step * warmup_step ** -1.5)
WarmupLR:
lr = optimizer.lr * warmup_step ** 0.5
* min(step ** -0.5, step * warmup_step ** -1.5)
Note that the maximum lr equals to optimizer.lr in this scheduler.
"""
def __init__(
self,
optimizer: torch.optim.Optimizer,
warmup_steps: Union[int, float] = 25000,
last_epoch: int = -1,
):
self.warmup_steps = warmup_steps
# assign warmup_steps before super().__init__(), because step()
# is invoked inside __init__() and reads this field
super().__init__(optimizer, last_epoch)
def __repr__(self):
return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})"
def get_lr(self):
step_num = self.last_epoch + 1
if self.warmup_steps == 0:
return [lr * step_num**-0.5 for lr in self.base_lrs]
else:
return [
lr * self.warmup_steps**0.5 *
min(step_num**-0.5, step_num * self.warmup_steps**-1.5)
for lr in self.base_lrs
]
def set_step(self, step: int):
self.last_epoch = step
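# Hypothetical usage sketch (illustrative values, not from the original
# recipes): with a base lr of 1e-3 and warmup_steps=100, WarmupLR ramps
# linearly up to the base lr at step 100, then decays with step**-0.5.
def _demo_warmup_lr():
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = WarmupLR(optimizer, warmup_steps=100)
    lrs = []
    for _ in range(300):
        optimizer.step()
        scheduler.step()
        lrs.append(scheduler.get_last_lr()[0])
    return lrs  # peaks at ~1e-3 around step 100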
class WarmupPolicy(_LRScheduler):
"""Adds warmup kwargs and warmup logic to lr policy.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
"""
def __init__(self,
optimizer,
*,
warmup_steps=None,
warmup_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1):
assert not (warmup_steps is not None and warmup_ratio is not None),\
"Either use a particular number of steps or a ratio"
assert warmup_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
if step <= self.warmup_steps and self.warmup_steps > 0:
return self._get_warmup_lr(step)
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
def _get_warmup_lr(self, step):
lr_val = (step + 1) / (self.warmup_steps + 1)
return [initial_lr * lr_val for initial_lr in self.base_lrs]
def _get_lr(self, step):
"""Simple const lr policy"""
return self.base_lrs
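# Hypothetical sketch (illustrative numbers): warmup can be given either as
# an absolute step count or as a ratio of max_steps; both resolve to the same
# self.warmup_steps, and warmup_ratio requires max_steps to be set.
def _demo_warmup_policy():
    model = torch.nn.Linear(4, 4)
    opt_a = torch.optim.SGD(model.parameters(), lr=1e-3)
    by_steps = WarmupPolicy(opt_a, warmup_steps=1000, max_steps=10000)
    opt_b = torch.optim.SGD(model.parameters(), lr=1e-3)
    by_ratio = WarmupPolicy(opt_b, warmup_ratio=0.1, max_steps=10000)
    assert by_steps.warmup_steps == by_ratio.warmup_steps == 1000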
class SquareRootConstantPolicy(_LRScheduler):
"""Adds warmup kwargs and warmup logic to lr policy.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
"""
def __init__(self,
optimizer,
*,
constant_steps=None,
constant_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1):
assert not (constant_steps is not None
and constant_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert constant_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if constant_steps is not None:
self.constant_steps = constant_steps
elif constant_ratio is not None:
self.constant_steps = int(constant_ratio * max_steps)
else:
self.constant_steps = 0
# use the resolved self.constant_steps and guard against a zero count
self.constant_lr = 1 / (self.constant_steps**0.5) if self.constant_steps > 0 else 0.0
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
if step <= self.constant_steps:
return [self.constant_lr for _ in self.base_lrs]
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
def _get_lr(self, step):
"""Simple const lr policy"""
return self.base_lrs
class WarmupHoldPolicy(WarmupPolicy):
"""Variant of WarmupPolicy which maintains high
learning rate for a defined number of steps.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
hold_steps: Number of training steps to
hold the learning rate after warm up
hold_ratio: Ratio of hold steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
"""
def __init__(
self,
optimizer,
*,
warmup_steps=None,
warmup_ratio=None,
hold_steps=None,
hold_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1,
):
assert not (hold_steps is not None and hold_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert hold_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
self.min_lr = min_lr
self._last_warmup_lr = 0.0
# Necessary to duplicate as class attributes are hidden in inner class
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
if hold_steps is not None:
self.hold_steps = hold_steps + self.warmup_steps
elif hold_ratio is not None:
self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps
else:
self.hold_steps = 0
super().__init__(
optimizer,
warmup_steps=warmup_steps,
warmup_ratio=warmup_ratio,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed by the scheduler,"
" "
"please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
# Warmup phase
if step <= self.warmup_steps and self.warmup_steps > 0:
return self._get_warmup_lr(step)
# Hold phase
if (step >= self.warmup_steps) and (step < self.hold_steps):
return self.base_lrs
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
class WarmupAnnealHoldPolicy(_LRScheduler):
"""Adds warmup kwargs and warmup logic to lr policy.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
min_lr: Minimum lr to hold the learning rate after decay at.
constant_steps: Number of steps to keep lr constant at.
constant_ratio: Ratio of steps to keep lr constant.
"""
def __init__(
self,
optimizer,
*,
warmup_steps=None,
warmup_ratio=None,
constant_steps=None,
constant_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1,
):
assert not (warmup_steps is not None
and warmup_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert not (constant_steps is not None
and constant_ratio is not None), \
"Either use constant_steps or constant_ratio"
assert warmup_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
if constant_steps is not None:
self.constant_steps = constant_steps
elif constant_ratio is not None:
self.constant_steps = int(constant_ratio * max_steps)
else:
self.constant_steps = 0
self.decay_steps = max_steps - (self.constant_steps +
self.warmup_steps)
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
# Warmup steps
if self.warmup_steps > 0 and step <= self.warmup_steps:
return self._get_warmup_lr(step)
# Constant steps after warmup and decay
if self.constant_steps > 0 and (
self.warmup_steps + self.decay_steps) < step <= self.max_steps:
return self._get_constant_lr(step)
# Min lr after max steps of updates
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
def _get_warmup_lr(self, step):
lr_val = (step + 1) / (self.warmup_steps + 1)
return [initial_lr * lr_val for initial_lr in self.base_lrs]
def _get_constant_lr(self, step):
return [self.min_lr for _ in self.base_lrs]
def _get_lr(self, step):
"""Simple const lr policy"""
return self.base_lrs
def _squareroot_annealing(initial_lr, step, max_steps, min_lr):
mult = ((max_steps - step) / max_steps)**0.5
out_lr = initial_lr * mult
out_lr = max(out_lr, min_lr)
return out_lr
def _square_annealing(initial_lr, step, max_steps, min_lr):
mult = ((max_steps - step) / max_steps)**2
out_lr = initial_lr * mult
out_lr = max(out_lr, min_lr)
return out_lr
def _cosine_annealing(initial_lr, step, max_steps, min_lr):
mult = 0.5 * (1 + math.cos(math.pi * step / max_steps))
out_lr = (initial_lr - min_lr) * mult + min_lr
return out_lr
def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step,
decay_steps, min_lr):
assert max_lr > min_lr
# Use linear warmup for the initial part.
if warmup_steps > 0 and step <= warmup_steps:
return max_lr * float(step) / float(warmup_steps)
# For any steps larger than `decay_steps`, use `min_lr`.
if step > warmup_steps + decay_steps:
return min_lr
# If we are done with the warmup period, use the decay style.
num_steps_ = step - warmup_steps
decay_steps_ = decay_steps
decay_ratio = float(num_steps_) / float(decay_steps_)
assert decay_ratio >= 0.0
assert decay_ratio <= 1.0
delta_lr = max_lr - min_lr
coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
return min_lr + coeff * delta_lr
def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle):
if cycle:
multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps)
decay_steps *= multiplier
else:
step = min(step, decay_steps)
p = step / decay_steps
lr = (initial_lr - min_lr) * math.pow(1.0 - p, power)
lr += min_lr
return lr
def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps,
decay_rate, min_lr):
# hold_steps = total number of steps
# to hold the LR, not the warmup + hold steps.
T_warmup_decay = max(1, warmup_steps**decay_rate)
T_hold_decay = max(1, (step - hold_steps)**decay_rate)
lr = (initial_lr * T_warmup_decay) / T_hold_decay
lr = max(lr, min_lr)
return lr
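# Hypothetical numeric sketch of the decay helpers above (illustrative
# values): halfway through decay, cosine annealing sits exactly halfway
# between initial_lr and min_lr, and _poly_decay with power=1.0 reduces to
# plain linear decay.
def _demo_annealing_helpers():
    mid = _cosine_annealing(initial_lr=1e-3, step=500, max_steps=1000,
                            min_lr=1e-5)
    # cos(pi * 0.5) == 0, so mid == (1e-3 - 1e-5) * 0.5 + 1e-5 == 5.05e-4
    lin = _poly_decay(1e-3, step=250, decay_steps=1000, power=1.0,
                      min_lr=0.0, cycle=False)
    # (1 - 0.25) * 1e-3 == 7.5e-4
    return mid, lin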
class SquareAnnealing(WarmupPolicy):
def __init__(self,
optimizer,
*,
max_steps,
min_lr=1e-5,
last_epoch=-1,
**kwargs):
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
new_lrs = [
_square_annealing(
initial_lr=initial_lr,
step=step - self.warmup_steps,
max_steps=self.max_steps - self.warmup_steps,
min_lr=self.min_lr,
) for initial_lr in self.base_lrs
]
return new_lrs
class SquareRootAnnealing(WarmupPolicy):
def __init__(self,
optimizer,
*,
max_steps,
min_lr=0,
last_epoch=-1,
**kwargs):
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
new_lrs = [
_squareroot_annealing(initial_lr=initial_lr,
step=step,
max_steps=self.max_steps,
min_lr=self.min_lr)
for initial_lr in self.base_lrs
]
return new_lrs
class CosineAnnealing(WarmupAnnealHoldPolicy):
def __init__(self,
optimizer,
*,
max_steps,
min_lr=0,
last_epoch=-1,
**kwargs):
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
for initial_lr in self.base_lrs:
if initial_lr < self.min_lr:
raise ValueError(
f"{self} received an initial learning rate "
f"that was lower than the minimum learning rate.")
if self.constant_steps is None or self.constant_steps == 0:
new_lrs = [
_cosine_annealing(
initial_lr=initial_lr,
step=step - self.warmup_steps,
max_steps=self.max_steps - self.warmup_steps,
min_lr=self.min_lr,
) for initial_lr in self.base_lrs
]
else:
new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step)
return new_lrs
def _get_warmup_lr(self, step):
if self.constant_steps is None or self.constant_steps == 0:
return super()._get_warmup_lr(step)
else:
# Use linear warmup for the initial part.
return self._get_linear_warmup_with_cosine_annealing_lr(step)
def _get_constant_lr(self, step):
# Only called when `constant_steps` > 0.
return self._get_linear_warmup_with_cosine_annealing_lr(step)
def _get_linear_warmup_with_cosine_annealing_lr(self, step):
# Cosine Schedule for Megatron LM,
# slightly different warmup schedule + constant LR at the end.
new_lrs = [
_linear_warmup_with_cosine_annealing(
max_lr=self.base_lrs[0],
warmup_steps=self.warmup_steps,
step=step,
decay_steps=self.decay_steps,
min_lr=self.min_lr,
) for _ in self.base_lrs
]
return new_lrs
class NoamAnnealing(_LRScheduler):
def __init__(self,
optimizer,
*,
d_model,
warmup_steps=None,
warmup_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1):
self._normalize = d_model**(-0.5)
assert not (warmup_steps is not None and warmup_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert warmup_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = max(1, self.last_epoch)
for initial_lr in self.base_lrs:
if initial_lr < self.min_lr:
raise ValueError(
f"{self} received an initial learning rate "
f"that was lower than the minimum learning rate.")
new_lrs = [
self._noam_annealing(initial_lr=initial_lr, step=step)
for initial_lr in self.base_lrs
]
return new_lrs
def _noam_annealing(self, initial_lr, step):
if self.warmup_steps > 0:
mult = self._normalize * min(step**(-0.5),
step * (self.warmup_steps**(-1.5)))
else:
mult = self._normalize * step**(-0.5)
out_lr = initial_lr * mult
if step > self.warmup_steps:
out_lr = max(out_lr, self.min_lr)
return out_lr
class NoamHoldAnnealing(WarmupHoldPolicy):
def __init__(self,
optimizer,
*,
max_steps,
decay_rate=0.5,
min_lr=0.0,
last_epoch=-1,
**kwargs):
"""
From NeMo:
Implementation of the Noam Hold Annealing policy
from the Squeezeformer paper.
Unlike NoamAnnealing, the peak learning rate
can be explicitly set for this scheduler.
The schedule first performs linear warmup,
then holds the peak LR, then decays with some schedule for
the remainder of the steps.
Therefore the min_lr is still dependent
on the hyperparameters selected.
Its schedule is determined by three factors:
Warmup Steps: Initial stage, where linear warmup
occurs until the peak LR is reached. Unlike NoamAnnealing,
the peak LR is explicitly stated here instead of a scaling factor.
Hold Steps: Intermediate stage, where the peak LR
is maintained for some number of steps. In this region,
the high peak LR allows the model to converge faster
if training is stable. However, the high LR
may also cause instability during training.
Should usually be a significant fraction of training
steps (around 30-40% of all training steps).
Decay Steps: Final stage, where the LR rapidly decays
with some scaling rate (set by decay rate).
To attain Noam decay, use 0.5,
for Squeezeformer recommended decay, use 1.0.
The fast decay after prolonged high LR during
hold phase allows for rapid convergence.
References:
- [Squeezeformer:
An Efficient Transformer for Automatic Speech Recognition]
(https://arxiv.org/abs/2206.00888)
Args:
optimizer: Pytorch compatible Optimizer object.
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
hold_steps: Number of training steps to
hold the learning rate after warm up
hold_ratio: Ratio of hold steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
decay_rate: Float value describing the polynomial decay
after the hold period. Default value
of 0.5 corresponds to Noam decay.
min_lr: Minimum learning rate.
"""
self.decay_rate = decay_rate
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
if self.warmup_steps is None or self.warmup_steps == 0:
raise ValueError(
"Noam scheduler cannot be used without warmup steps")
if self.hold_steps > 0:
hold_steps = self.hold_steps - self.warmup_steps
else:
hold_steps = 0
new_lrs = [
_noam_hold_annealing(
initial_lr,
step=step,
warmup_steps=self.warmup_steps,
hold_steps=hold_steps,
decay_rate=self.decay_rate,
min_lr=self.min_lr,
) for initial_lr in self.base_lrs
]
return new_lrs
def set_step(self, step: int):
self.last_epoch = step
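# Hypothetical usage sketch (illustrative values): 10% of training as linear
# warmup to the peak lr, hold at the peak for the next 30%, then Noam-style
# decay (decay_rate=0.5) toward min_lr for the remaining steps.
def _demo_noam_hold_annealing():
    model = torch.nn.Linear(4, 4)
    opt = torch.optim.AdamW(model.parameters(), lr=2e-3)
    sched = NoamHoldAnnealing(opt, max_steps=10000, warmup_ratio=0.1,
                              hold_ratio=0.3, decay_rate=0.5, min_lr=1e-5)
    for _ in range(10000):
        opt.step()
        sched.step()
    return sched.get_last_lr()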
class ConstantLR(_LRScheduler):
"""The ConstantLR scheduler
This scheduler keeps a constant lr
"""
def __init__(
self,
optimizer: torch.optim.Optimizer,
):
super().__init__(optimizer)
def get_lr(self):
return self.base_lrs
def set_step(self, step: int):
self.last_epoch = step
import glob
import json
import os
import random
import sys
import time
import warnings
import matplotlib
import numpy as np
import torch
import yaml
from torch import distributed as dist
from torch.nn.utils import weight_norm
from torch.utils.tensorboard import SummaryWriter
matplotlib.use("Agg")
import matplotlib.pylab as plt
import re
import pathlib
def seed_everything(seed, cudnn_deterministic=False):
"""
Function that sets seed for pseudo-random number generators in:
pytorch, numpy, python.random
Args:
seed: the integer value seed for global random state
"""
if seed is not None:
# print(f"Global seed set to {seed}")
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# if cudnn_deterministic:
# torch.backends.cudnn.deterministic = True
# warnings.warn('You have chosen to seed training. '
# 'This will turn on the CUDNN deterministic setting, '
# 'which can slow down your training considerably! '
# 'You may see unexpected behavior when restarting '
# 'from checkpoints.')
def is_primary():
return get_rank() == 0
def get_rank():
if not dist.is_available():
return 0
if not dist.is_initialized():
return 0
return dist.get_rank()
def load_yaml_config(path):
with open(path) as f:
config = yaml.full_load(f)
return config
def save_config_to_yaml(config, path):
assert path.endswith('.yaml')
with open(path, 'w') as f:
f.write(yaml.dump(config))
f.close()
def save_dict_to_json(d, path, indent=None):
json.dump(d, open(path, 'w'), indent=indent)
def load_dict_from_json(path):
return json.load(open(path, 'r'))
def write_args(args, path):
args_dict = dict((name, getattr(args, name)) for name in dir(args)
if not name.startswith('_'))
with open(path, 'a') as args_file:
args_file.write('==> torch version: {}\n'.format(torch.__version__))
args_file.write(
'==> cudnn version: {}\n'.format(torch.backends.cudnn.version()))
args_file.write('==> Cmd:\n')
args_file.write(str(sys.argv))
args_file.write('\n==> args:\n')
for k, v in sorted(args_dict.items()):
args_file.write(' %s: %s\n' % (str(k), str(v)))
args_file.close()
class Logger(object):
def __init__(self, args):
self.args = args
self.save_dir = args.save_dir
self.is_primary = is_primary()
if self.is_primary:
os.makedirs(self.save_dir, exist_ok=True)
# save the args and config
self.config_dir = os.path.join(self.save_dir, 'configs')
os.makedirs(self.config_dir, exist_ok=True)
file_name = os.path.join(self.config_dir, 'args.txt')
write_args(args, file_name)
log_dir = os.path.join(self.save_dir, 'logs')
if not os.path.exists(log_dir):
os.makedirs(log_dir, exist_ok=True)
self.text_writer = open(os.path.join(log_dir, 'log.txt'), 'a')
if args.tensorboard:
self.log_info('using tensorboard')
self.tb_writer = SummaryWriter(log_dir=log_dir)
else:
self.tb_writer = None
def save_config(self, config):
if self.is_primary:
save_config_to_yaml(config,
os.path.join(self.config_dir, 'config.yaml'))
def log_info(self, info, check_primary=True):
if self.is_primary or (not check_primary):
print(info)
if self.is_primary:
info = str(info)
time_str = time.strftime('%Y-%m-%d-%H-%M')
info = '{}: {}'.format(time_str, info)
if not info.endswith('\n'):
info += '\n'
self.text_writer.write(info)
self.text_writer.flush()
def add_scalar(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_scalar(**kargs)
def add_scalars(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_scalars(**kargs)
def add_image(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_image(**kargs)
def add_images(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_images(**kargs)
def close(self):
if self.is_primary:
self.text_writer.close()
self.tb_writer.close()
def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(
spectrogram, aspect="auto", origin="lower", interpolation='none')
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
weight_norm(m)
def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def save_checkpoint(filepath, obj, num_ckpt_keep=5):
name = re.match(r'(do|g)_\d+', pathlib.Path(filepath).name).group(1)
ckpts = sorted(pathlib.Path(filepath).parent.glob(f'{name}_*'))
if len(ckpts) > num_ckpt_keep:
[os.remove(c) for c in ckpts[:-num_ckpt_keep]]
print("Saving checkpoint to {}".format(filepath))
torch.save(obj, filepath)
print("Complete.")
def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + '????????')
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return None
return sorted(cp_list)[-1]
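# Hypothetical usage sketch (illustrative paths): checkpoints follow the
# naming convention '<prefix>_<8-digit step>' (e.g. 'g_00000042'), which is
# what the regex in save_checkpoint and the '????????' glob in
# scan_checkpoint both assume.
def _demo_checkpoint_rotation(cp_dir='exp/ckpts', step=42):
    os.makedirs(cp_dir, exist_ok=True)
    g_path = os.path.join(cp_dir, 'g_{:08d}'.format(step))
    save_checkpoint(g_path, {'step': step}, num_ckpt_keep=5)
    return scan_checkpoint(cp_dir, 'g_')  # latest generator checkpoint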
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
# 2023 Horizon Inc. (authors: Xingchen Song)
# 2024 Alibaba Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import nullcontext
import logging
import os
import torch
import json
import re
import datetime
import yaml
import deepspeed
import torch.optim as optim
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm_
from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
from inspiremusic.dataset.dataset import Dataset
from inspiremusic.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR
def init_distributed(args):
world_size = int(os.environ.get('WORLD_SIZE', 1))
local_rank = int(os.environ.get('LOCAL_RANK', 0))
rank = int(os.environ.get('RANK', 0))
logging.info('training on multiple gpus, this gpu {}'.format(local_rank) +
', rank {}, world_size {}'.format(rank, world_size))
if args.train_engine == 'torch_ddp':
torch.cuda.set_device(local_rank)
dist.init_process_group(args.dist_backend)
else:
deepspeed.init_distributed(dist_backend=args.dist_backend)
return world_size, local_rank, rank
def init_dataset_and_dataloader(args, configs):
gan = False
data_pipeline = configs['data_pipeline_gan'] if gan else configs['data_pipeline']
train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', shuffle=True, partition=True)
cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', shuffle=False, partition=False)
# do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
train_data_loader = DataLoader(train_dataset,
batch_size=None,
pin_memory=args.pin_memory,
num_workers=args.num_workers,
prefetch_factor=args.prefetch,
timeout=60)
cv_data_loader = DataLoader(cv_dataset,
batch_size=None,
pin_memory=args.pin_memory,
num_workers=args.num_workers,
prefetch_factor=args.prefetch,
timeout=60)
return train_dataset, cv_dataset, train_data_loader, cv_data_loader
def check_modify_and_save_config(args, configs):
if args.train_engine == "torch_ddp":
configs['train_conf']["dtype"] = 'fp32'
else:
with open(args.deepspeed_config, 'r') as fin:
ds_configs = json.load(fin)
if "fp16" in ds_configs and ds_configs["fp16"]["enabled"]:
configs['train_conf']["dtype"] = "fp16"
elif "bf16" in ds_configs and ds_configs["bf16"]["enabled"]:
configs['train_conf']["dtype"] = "bf16"
else:
configs['train_conf']["dtype"] = "fp32"
assert ds_configs["train_micro_batch_size_per_gpu"] == 1
# if use deepspeed, override ddp config
configs['train_conf']['save_per_step'] = int(configs['train_conf']['save_per_step'] *
configs['train_conf']['accum_grad'] / ds_configs["gradient_accumulation_steps"])
configs['train_conf']['accum_grad'] = ds_configs["gradient_accumulation_steps"]
configs['train_conf']['grad_clip'] = ds_configs["gradient_clipping"]
configs['train_conf']['log_interval'] = ds_configs["steps_per_print"]
return configs
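# Illustrative sketch (values are hypothetical) of the subset of the
# DeepSpeed JSON keys that check_modify_and_save_config reads and maps onto
# the ddp-style train_conf settings above:
_EXAMPLE_DS_CONFIG = {
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 4,
    "gradient_clipping": 5.0,
    "steps_per_print": 100,
    "bf16": {"enabled": True},
}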
def wrap_cuda_model(args, model):
local_world_size = int(os.environ.get('LOCAL_WORLD_SIZE', 1))
world_size = int(os.environ.get('WORLD_SIZE', 1))
if args.train_engine == "torch_ddp": # native pytorch ddp
assert (torch.cuda.is_available())
model.cuda()
model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
else:
if int(os.environ.get('RANK', 0)) == 0:
logging.info("Estimating model states memory needs (zero2)...")
estimate_zero2_model_states_mem_needs_all_live(
model,
num_gpus_per_node=local_world_size,
num_nodes=world_size // local_world_size)
return model
def init_optimizer_and_scheduler(args, configs, model):
if configs['train_conf']['optim'] == 'adam':
optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf'])
elif configs['train_conf']['optim'] == 'adamw':
optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf'])
else:
raise ValueError("unknown optimizer: " + configs['train_conf'])
if configs['train_conf']['scheduler'] == 'warmuplr':
scheduler_type = WarmupLR
scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf'])
elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
scheduler_type = NoamHoldAnnealing
scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
elif configs['train_conf']['scheduler'] == 'constantlr':
scheduler_type = ConstantLR
scheduler = ConstantLR(optimizer)
else:
raise ValueError("unknown scheduler: " + configs['train_conf'])
# with deepspeed, pass a scheduler factory so that deepspeed.initialize
# can construct the scheduler around its own optimizer
if args.train_engine == "deepspeed":
def scheduler(opt):
return scheduler_type(opt, **configs['train_conf']['scheduler_conf'])
model, optimizer, _, scheduler = deepspeed.initialize(
args=args,
model=model,
optimizer=None,
lr_scheduler=scheduler,
model_parameters=model.parameters())
return model, optimizer, scheduler
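# Illustrative sketch of the `configs['train_conf']` layout this function
# expects (hypothetical values, not from the original recipes):
_EXAMPLE_TRAIN_CONF = {
    'optim': 'adamw',
    'optim_conf': {'lr': 1e-4},
    'scheduler': 'warmuplr',
    'scheduler_conf': {'warmup_steps': 25000},
    'accum_grad': 2,
    'grad_clip': 5.0,
    'log_interval': 100,
    'save_per_step': 1000,
}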
def init_summarywriter(args):
writer = None
if int(os.environ.get('RANK', 0)) == 0:
os.makedirs(args.model_dir, exist_ok=True)
writer = SummaryWriter(args.tensorboard_dir)
return writer
def save_model(model, model_name, info_dict):
rank = int(os.environ.get('RANK', 0))
model_dir = info_dict["model_dir"]
save_model_path = os.path.join(model_dir, '{}.pt'.format(model_name))
if info_dict["train_engine"] == "torch_ddp":
if rank == 0:
torch.save(model.module.state_dict(), save_model_path)
else:
with torch.no_grad():
model.save_checkpoint(save_dir=model_dir,
tag=model_name,
client_state=info_dict)
if rank == 0:
info_path = re.sub('.pt$', '.yaml', save_model_path)
info_dict['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')
with open(info_path, 'w') as fout:
data = yaml.dump(info_dict)
fout.write(data)
logging.info('[Rank {}] Checkpoint: save to checkpoint {}'.format(rank, save_model_path))
def inspiremusic_join(group_join, info_dict):
world_size = int(os.environ.get('WORLD_SIZE', 1))
local_rank = int(os.environ.get('LOCAL_RANK', 0))
rank = int(os.environ.get('RANK', 0))
if info_dict["batch_idx"] != 0:
# try to join all ranks in both ddp and deepspeed modes, in case different ranks see uneven workloads
try:
dist.monitored_barrier(group=group_join,
timeout=group_join.options._timeout)
return False
except RuntimeError as e:
logging.info("Detected uneven workload distribution: {}\n".format(e) +
"Break current worker to manually join all workers, " +
"world_size {}, current rank {}, current local_rank {}\n".
format(world_size, rank, local_rank))
return True
else:
return False
def batch_forward(model, batch, info_dict, scaler):
device = int(os.environ.get('LOCAL_RANK', 0))
dtype = info_dict["dtype"]
if dtype == "fp16":
dtype = torch.float16
elif dtype == "bf16":
dtype = torch.bfloat16
else: # fp32
dtype = torch.float32
if info_dict['train_engine'] == 'torch_ddp':
autocast = torch.cuda.amp.autocast(enabled=scaler is not None)
else:
autocast = torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False)
with autocast:
info_dict['loss_dict'] = model(batch, device)
return info_dict
def batch_backward(model, info_dict, scaler):
if info_dict["train_engine"] == "deepspeed":
scaled_loss = model.backward(info_dict['loss_dict']['loss'])
else:
scaled_loss = info_dict['loss_dict']['loss'] / info_dict['accum_grad']
if scaler is not None:
scaler.scale(scaled_loss).backward()
else:
scaled_loss.backward()
info_dict['loss_dict']['loss'] = scaled_loss
return info_dict
def update_parameter_and_lr(model, optimizer, scheduler, info_dict, scaler=None):
grad_norm = 0.0
if info_dict['train_engine'] == "deepspeed":
info_dict["is_gradient_accumulation_boundary"] = model.is_gradient_accumulation_boundary()
model.step()
grad_norm = model.get_global_grad_norm()
elif (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0:
if scaler is not None:
scaler.unscale_(optimizer) # Unscale gradients before clipping
grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
scaler.step(optimizer)
scaler.update()
else:
grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
if torch.isfinite(grad_norm):
optimizer.step()
optimizer.zero_grad()
scheduler.step()
info_dict["lr"] = optimizer.param_groups[0]['lr']
info_dict["grad_norm"] = grad_norm
return info_dict
def log_per_step(writer, info_dict):
tag = info_dict["tag"]
epoch = info_dict.get('epoch', 0)
step = info_dict["step"]
batch_idx = info_dict["batch_idx"]
loss_dict = info_dict['loss_dict']
rank = int(os.environ.get('RANK', 0))
# only rank 0 writes to tensorboard to avoid multi-process writes
if writer is not None:
if (info_dict['train_engine'] == 'deepspeed' and info_dict['is_gradient_accumulation_boundary'] is True) or \
(info_dict['train_engine'] == 'torch_ddp' and (info_dict['batch_idx'] + 1) % info_dict['accum_grad'] == 0):
for k in ['epoch', 'lr', 'grad_norm']:
writer.add_scalar('{}/{}'.format(tag, k), info_dict[k], step + 1)
for k, v in loss_dict.items():
writer.add_scalar('{}/{}'.format(tag, k), v, step + 1)
# TRAIN & CV, Shell log (stdout)
if (info_dict['batch_idx'] + 1) % info_dict['log_interval'] == 0:
log_str = '{} Batch {}/{} '.format(tag, epoch, batch_idx + 1)
for name, value in loss_dict.items():
log_str += '{} {:.6f} '.format(name, value.item())
if tag == "TRAIN":
log_str += 'lr {:.8f} grad_norm {:.6f}'.format(
info_dict["lr"], info_dict['grad_norm'])
log_str += ' rank {}'.format(rank)
logging.debug(log_str)
def log_per_save(writer, info_dict):
tag = info_dict["tag"]
epoch = info_dict["epoch"]
step = info_dict["step"]
loss_dict = info_dict["loss_dict"]
lr = info_dict['lr']
rank = int(os.environ.get('RANK', 0))
logging.info(
'Epoch {} Step {} CV info lr {} {} rank {}'.format(
epoch, step + 1, lr, ' '.join(['{}_{}'.format(k, v) for k, v in loss_dict.items()]), rank))
if writer is not None:
for k in ['epoch', 'lr']:
writer.add_scalar('{}/{}'.format(tag, k), info_dict[k], step + 1)
for k, v in loss_dict.items():
writer.add_scalar('{}/{}'.format(tag, k), v, step + 1)
import os
import sys
def align_trans_scp_file(trans, scp):
trans_dict = {}
with open(trans, 'r') as f:
for line in f:
sec = line.strip().split("\t")
trans_dict[sec[0]] = sec[1]
scp_dict = {}
with open(scp, 'r') as f:
for line in f:
sec = line.strip().split(" ")
scp_dict[sec[0]] = sec[1]
with open("text", "w") as f:
for k, v in scp_dict.items():
f.write("%s\t%s\n"%(k,trans_dict[k]))
if __name__ == '__main__':
trans = sys.argv[1]
scp = sys.argv[2]
align_trans_scp_file(trans, scp)
v0.1
from dataclasses import dataclass
import numpy as np
import torch
import torchaudio
from pytorch_lightning import LightningDataModule
from torch.utils.data import Dataset, DataLoader
import soundfile
# import librosa
import random
torch.set_num_threads(1)
@dataclass
class DataConfig:
filelist_path: str
sampling_rate: int
num_samples: int
batch_size: int
num_workers: int
def collate_fn(batch):
batch = [item for item in batch if item is not None]
return torch.stack(batch, dim=0)
class VocosDataModule(LightningDataModule):
def __init__(self, train_params: DataConfig, val_params: DataConfig):
super().__init__()
self.train_config = train_params
self.val_config = val_params
def _get_dataloder(self, cfg: DataConfig, train: bool):
dataset = VocosDataset(cfg, train=train)
dataloader = DataLoader(
dataset, batch_size=cfg.batch_size, num_workers=cfg.num_workers, shuffle=train, pin_memory=True, collate_fn=collate_fn
)
return dataloader
def train_dataloader(self) -> DataLoader:
return self._get_dataloder(self.train_config, train=True)
def val_dataloader(self) -> DataLoader:
return self._get_dataloder(self.val_config, train=False)
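# Hypothetical usage sketch (illustrative file paths): each DataConfig points
# at a plain-text filelist with one audio path per line, and the loaders then
# yield batches of shape (batch_size, num_samples).
def _demo_datamodule():
    train_cfg = DataConfig(filelist_path="filelists/train.txt",
                           sampling_rate=24000, num_samples=24000,
                           batch_size=16, num_workers=4)
    val_cfg = DataConfig(filelist_path="filelists/val.txt",
                         sampling_rate=24000, num_samples=24000,
                         batch_size=16, num_workers=4)
    return VocosDataModule(train_params=train_cfg, val_params=val_cfg)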
class VocosDataset(Dataset):
def __init__(self, cfg: DataConfig, train: bool):
with open(cfg.filelist_path) as f:
self.filelist = f.read().splitlines()
self.sampling_rate = cfg.sampling_rate
self.num_samples = cfg.num_samples
self.train = train
def __len__(self) -> int:
return len(self.filelist)
def __getitem__(self, index: int) -> torch.Tensor:
audio_path = self.filelist[index]
try:
y1, sr = soundfile.read(audio_path)
# y1, sr = librosa.load(audio_path,sr=None)
y = torch.tensor(y1).float().unsqueeze(0)
# if y.size(0) > 1:
# # mix to mono
# y = y.mean(dim=0, keepdim=True)
if y.ndim > 2:
# multi-channel file: keep one randomly chosen channel instead of mixing down to mono
random_channel = random.randint(0, y.size(-1) - 1)
y = y[:, :, random_channel]
gain = np.random.uniform(-1, -6) if self.train else -3
y, _ = torchaudio.sox_effects.apply_effects_tensor(y, sr, [["norm", f"{gain:.2f}"]])
if sr != self.sampling_rate:
y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=self.sampling_rate)
if y.size(-1) < self.num_samples:
pad_length = self.num_samples - y.size(-1)
padding_tensor = y.repeat(1, 1 + pad_length // y.size(-1))
y = torch.cat((y, padding_tensor[:, :pad_length]), dim=1)
elif self.train:
start = np.random.randint(low=0, high=y.size(-1) - self.num_samples + 1)
y = y[:, start : start + self.num_samples]
else:
# During validation, take always the first segment for determinism
y = y[:, : self.num_samples]
return y[0]
except Exception as e:
print(f"Error processing file {audio_path} at index {index}: {e}")
# Either re-raise the exception here, or return None to mark the sample as invalid
return None
# def __getitem__(self, index: int) -> torch.Tensor:
# audio_path = self.filelist[index]
# try:
# y, sr = torchaudio.load(audio_path)
# if y.size(0) > 1:
# # randomly pick one channel
# random_channel = random.randint(0, y.size(0) - 1)
# y = y[random_channel, :].unsqueeze(0)  # keep the return shape (1, T)
# # gain = np.random.uniform(-1, -6) if self.train else -3
# # y, _ = torchaudio.sox_effects.apply_effects_tensor(y, sr, [["norm", f"{gain:.2f}"]])
# if sr != self.sampling_rate:
# y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=self.sampling_rate)
# if y.size(-1) < self.num_samples:
# pad_length = self.num_samples - y.size(-1)
# padding_tensor = y.repeat(1, 1 + pad_length // y.size(-1))
# y = torch.cat((y, padding_tensor[:, :pad_length]), dim=1)
# elif self.train:
# start = np.random.randint(low=0, high=y.size(-1) - self.num_samples + 1)
# y = y[:, start: start + self.num_samples]
# else:
# # During validation, take always the first segment for determinism
# y = y[:, :self.num_samples]
# return y[0]
# except Exception as e:
# print(f"Error processing file {audio_path} at index {index}: {e}")
# # Either re-raise the exception here, or return None to mark the sample as invalid
# return None
import torch
import torch.nn as nn
import torch.nn.functional as F
# from audiotools import AudioSignal
# from audiotools import ml
# from audiotools import STFTParams
from einops import rearrange
from torch.nn.utils import weight_norm
from collections import namedtuple
STFTParams = namedtuple(
"STFTParams",
["window_length", "hop_length", "window_type", "match_stride", "padding_type"],
)
STFTParams.__new__.__defaults__ = (None, None, None, None, None)
def WNConv1d(*args, **kwargs):
act = kwargs.pop("act", True)
conv = weight_norm(nn.Conv1d(*args, **kwargs))
if not act:
return conv
return nn.Sequential(conv, nn.LeakyReLU(0.1))
def WNConv2d(*args, **kwargs):
act = kwargs.pop("act", True)
conv = weight_norm(nn.Conv2d(*args, **kwargs))
if not act:
return conv
return nn.Sequential(conv, nn.LeakyReLU(0.1))
class MPD(nn.Module):
def __init__(self, period):
super().__init__()
self.period = period
self.convs = nn.ModuleList(
[
WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)),
]
)
self.conv_post = WNConv2d(
1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False
)
def pad_to_period(self, x):
t = x.shape[-1]
x = F.pad(x, (0, self.period - t % self.period), mode="reflect")
return x
def forward(self, x):
fmap = []
x = self.pad_to_period(x)
x = rearrange(x, "b c (l p) -> b c l p", p=self.period)
for layer in self.convs:
x = layer(x)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
return fmap
class MSD(nn.Module):
def __init__(self, rate: int = 1, sample_rate: int = 48000):
super().__init__()
self.convs = nn.ModuleList(
[
WNConv1d(1, 16, 15, 1, padding=7),
WNConv1d(16, 64, 41, 4, groups=4, padding=20),
WNConv1d(64, 256, 41, 4, groups=16, padding=20),
WNConv1d(256, 1024, 41, 4, groups=64, padding=20),
WNConv1d(1024, 1024, 41, 4, groups=256, padding=20),
WNConv1d(1024, 1024, 5, 1, padding=2),
]
)
self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False)
self.sample_rate = sample_rate
self.rate = rate
def forward(self, x):
# x = AudioSignal(x, self.sample_rate)
# x.resample(self.sample_rate // self.rate)
# x = x.audio_data
fmap = []
for l in self.convs:
x = l(x)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
return fmap
BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]
class MRD(nn.Module):
def __init__(
self,
window_length: int,
hop_factor: float = 0.25,
sample_rate: int = 24000,
bands: list = BANDS,
):
"""Complex multi-band spectrogram discriminator.
Parameters
----------
window_length : int
Window length of STFT.
hop_factor : float, optional
Hop factor of the STFT (hop_length = int(window_length * hop_factor)), by default 0.25
sample_rate : int, optional
Sampling rate of audio in Hz, by default 24000
bands : list, optional
Bands to run discriminator over.
"""
super().__init__()
self.window_length = window_length
self.hop_factor = hop_factor
self.sample_rate = sample_rate
self.stft_params = STFTParams(
window_length=window_length,
hop_length=int(window_length * hop_factor),
match_stride=True,
)
n_fft = window_length // 2 + 1
bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
self.bands = bands
self.n_fft = window_length
ch = 32
convs = lambda: nn.ModuleList(
[
WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)),
WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)),
]
)
self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False)
def spectrogram(self, x):
# original audiotools implementation:
# x = AudioSignal(x, self.sample_rate, stft_params=self.stft_params)
# x = torch.view_as_real(x.stft())
if x.size(0) == 1:
x = torch.view_as_real(x.squeeze(0).stft(n_fft=self.n_fft, return_complex=True).unsqueeze(0))
else:
x = torch.view_as_real(x.squeeze(1).stft(n_fft=self.n_fft, return_complex=True).unsqueeze(1))
x = rearrange(x, "b 1 f t c -> (b 1) c t f")
# Split into bands
x_bands = [x[..., b[0] : b[1]] for b in self.bands]
return x_bands
def forward(self, x):
x_bands = self.spectrogram(x)
fmap = []
x = []
for band, stack in zip(x_bands, self.band_convs):
for layer in stack:
band = layer(band)
fmap.append(band)
x.append(band)
x = torch.cat(x, dim=-1)
x = self.conv_post(x)
fmap.append(x)
return fmap
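# Hypothetical shape sketch (illustrative input): with window_length=1024 the
# STFT has window_length // 2 + 1 == 513 frequency bins, and BANDS slices
# them into five sub-bands that are convolved independently before conv_post.
def _demo_mrd_bands():
    mrd = MRD(window_length=1024)
    x = torch.zeros(1, 1, 24000)
    bands = mrd.spectrogram(x)
    return [b.shape for b in bands]  # five tensors of shape (1, 2, T, width)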
# class DACDiscriminator(ml.BaseModel):
class DACDiscriminator(nn.Module):
def __init__(
self,
rates: list = [],
periods: list = [2, 3, 5, 7, 11],
fft_sizes: list = [2048, 1024, 512],
sample_rate: int = 24000,
bands: list = BANDS,
):
"""Discriminator that combines multiple discriminators.
Parameters
----------
rates : list, optional
sampling rates (in Hz) to run MSD at, by default []
If empty, MSD is not used.
periods : list, optional
periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11]
fft_sizes : list, optional
Window sizes of the FFT to run MRD at, by default [2048, 1024, 512]
sample_rate : int, optional
Sampling rate of audio in Hz, by default 24000
bands : list, optional
Bands to run MRD at, by default `BANDS`
"""
super().__init__()
discs = []
discs += [MPD(p) for p in periods]
discs += [MSD(r, sample_rate=sample_rate) for r in rates]
discs += [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes]
self.discriminators = nn.ModuleList(discs)
def preprocess(self, y):
# Remove DC offset
y = y - y.mean(dim=-1, keepdims=True)
# Peak normalize the volume of input audio
y = 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
return y
def forward(self, x):
x = self.preprocess(x)
fmaps = [d(x) for d in self.discriminators]
return fmaps
if __name__ == "__main__":
disc = DACDiscriminator()
x = torch.zeros(1, 1, 24000)
results = disc(x)
for i, result in enumerate(results):
print(f"disc{i}")
for i, r in enumerate(result):
print(r.shape, r.mean(), r.min(), r.max())
print("00")
from typing import Tuple, List
import torch
from torch import nn
from torch.nn import Conv2d
from torch.nn.utils import weight_norm
class MultiPeriodDiscriminator(nn.Module):
"""
Multi-Period Discriminator module adapted from https://github.com/jik876/hifi-gan.
Additionally, it allows incorporating conditional information with a learned embeddings table.
Args:
periods (tuple[int]): Tuple of periods for each discriminator.
num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
Defaults to None.
"""
def __init__(self, periods: Tuple[int] = (2, 3, 5, 7, 11), num_embeddings: int = None):
super().__init__()
self.discriminators = nn.ModuleList([DiscriminatorP(period=p, num_embeddings=num_embeddings) for p in periods])
def forward(
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for d in self.discriminators:
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorP(nn.Module):
def __init__(
self,
period: int,
in_channels: int = 1,
kernel_size: int = 5,
stride: int = 3,
lrelu_slope: float = 0.1,
num_embeddings: int = None,
):
super().__init__()
self.period = period
self.convs = nn.ModuleList(
[
weight_norm(Conv2d(in_channels, 32, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(1024, 1024, (kernel_size, 1), (1, 1), padding=(kernel_size // 2, 0))),
]
)
if num_embeddings is not None:
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=1024)
torch.nn.init.zeros_(self.emb.weight)
self.conv_post = weight_norm(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
self.lrelu_slope = lrelu_slope
def forward(
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
x = x.unsqueeze(1)
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for i, l in enumerate(self.convs):
x = l(x)
x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
if i > 0:
fmap.append(x)
if cond_embedding_id is not None:
emb = self.emb(cond_embedding_id)
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
else:
h = 0
x = self.conv_post(x)
fmap.append(x)
x += h
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiResolutionDiscriminator(nn.Module):
def __init__(
self,
resolutions: Tuple[Tuple[int, int, int]] = ((1024, 256, 1024), (2048, 512, 2048), (512, 128, 512)),
num_embeddings: int = None,
):
"""
Multi-Resolution Discriminator module adapted from https://github.com/mindslab-ai/univnet.
Additionally, it allows incorporating conditional information with a learned embeddings table.
Args:
resolutions (tuple[tuple[int, int, int]]): Tuple of resolutions for each discriminator.
Each resolution should be a tuple of (n_fft, hop_length, win_length).
num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
Defaults to None.
"""
super().__init__()
self.discriminators = nn.ModuleList(
[DiscriminatorR(resolution=r, num_embeddings=num_embeddings) for r in resolutions]
)
def forward(
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for d in self.discriminators:
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorR(nn.Module):
def __init__(
self,
resolution: Tuple[int, int, int],
channels: int = 64,
in_channels: int = 1,
num_embeddings: int = None,
lrelu_slope: float = 0.1,
):
super().__init__()
self.resolution = resolution
self.in_channels = in_channels
self.lrelu_slope = lrelu_slope
self.convs = nn.ModuleList(
[
weight_norm(nn.Conv2d(in_channels, channels, kernel_size=(7, 5), stride=(2, 2), padding=(3, 2))),
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 2), padding=(2, 1))),
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 1), padding=1)),
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 2), padding=1)),
]
)
if num_embeddings is not None:
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
torch.nn.init.zeros_(self.emb.weight)
self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), padding=(1, 1)))
def forward(
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
fmap = []
x = self.spectrogram(x)
x = x.unsqueeze(1)
for l in self.convs:
x = l(x)
x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
fmap.append(x)
if cond_embedding_id is not None:
emb = self.emb(cond_embedding_id)
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
else:
h = 0
x = self.conv_post(x)
fmap.append(x)
x += h
x = torch.flatten(x, 1, -1)
return x, fmap
def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
n_fft, hop_length, win_length = self.resolution
magnitude_spectrogram = torch.stft(
x,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=None, # interestingly rectangular window kind of works here
center=True,
return_complex=True,
).abs()
return magnitude_spectrogram
import math
import numpy as np
import pytorch_lightning as pl
import torch
import torchaudio
import transformers
import yaml
from decoder.discriminator_dac import DACDiscriminator
from decoder.discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator
from decoder.feature_extractors import FeatureExtractor
from decoder.heads import FourierHead
from decoder.helpers import plot_spectrogram_to_numpy
from decoder.loss import DiscriminatorLoss, GeneratorLoss, FeatureMatchingLoss, MelSpecReconstructionLoss, DACGANLoss
from decoder.models import Backbone
from decoder.modules import safe_log
from decoder.pretrained_model import instantiate_class
class VocosExp(pl.LightningModule):
# noinspection PyUnusedLocal
def __init__(
self,
feature_extractor: FeatureExtractor,
backbone: Backbone,
head: FourierHead,
resume_config: str,
resume_model: str,
sample_rate: int = 24000,
initial_learning_rate: float = 2e-4,
num_warmup_steps: int = 0,
mel_loss_coeff: float = 45,
mrd_loss_coeff: float = 1.0,
pretrain_mel_steps: int = 0,
decay_mel_coeff: bool = False,
evaluate_utmos: bool = False,
evaluate_pesq: bool = False,
evaluate_periodicty: bool = False,
resume: bool = False,
):
"""
Args:
feature_extractor (FeatureExtractor): An instance of FeatureExtractor to extract features from audio signals.
backbone (Backbone): An instance of Backbone model.
head (FourierHead): An instance of Fourier head to generate spectral coefficients and reconstruct a waveform.
sample_rate (int): Sampling rate of the audio signals.
initial_learning_rate (float): Initial learning rate for the optimizer.
num_warmup_steps (int): Number of steps for the warmup phase of learning rate scheduler. Default is 0.
mel_loss_coeff (float, optional): Coefficient for Mel-spectrogram loss in the loss function. Default is 45.
mrd_loss_coeff (float, optional): Coefficient for Multi Resolution Discriminator loss. Default is 1.0.
pretrain_mel_steps (int, optional): Number of steps to pre-train the model without the GAN objective. Default is 0.
decay_mel_coeff (bool, optional): If True, the Mel-spectrogram loss coefficient is decayed during training. Default is False.
evaluate_utmos (bool, optional): If True, UTMOS scores are computed for each validation run.
evaluate_pesq (bool, optional): If True, PESQ scores are computed for each validation run.
evaluate_periodicty (bool, optional): If True, periodicity scores are computed for each validation run.
"""
super().__init__()
self.save_hyperparameters(ignore=["feature_extractor", "backbone", "head"])
self.feature_extractor = feature_extractor
self.backbone = backbone
self.head = head
self.resume_config = resume_config
self.resume_model = resume_model
self.resume = resume
self.multiperioddisc = MultiPeriodDiscriminator()
self.multiresddisc = MultiResolutionDiscriminator()
self.dac = DACDiscriminator()
self.dacdiscriminator = DACGANLoss(self.dac)
self.disc_loss = DiscriminatorLoss()
self.gen_loss = GeneratorLoss()
self.feat_matching_loss = FeatureMatchingLoss()
self.melspec_loss = MelSpecReconstructionLoss(sample_rate=sample_rate)
self.train_discriminator = False
self.base_mel_coeff = self.mel_loss_coeff = mel_loss_coeff
def configure_optimizers(self):
disc_params = [
{"params": self.multiperioddisc.parameters()},
{"params": self.multiresddisc.parameters()},
{"params": self.dac.parameters()},
]
gen_params = [
{"params": self.feature_extractor.parameters()},
{"params": self.backbone.parameters()},
{"params": self.head.parameters()},
]
opt_disc = torch.optim.AdamW(disc_params, lr=self.hparams.initial_learning_rate)
opt_gen = torch.optim.AdamW(gen_params, lr=self.hparams.initial_learning_rate)
max_steps = self.trainer.max_steps // 2 # Max steps per optimizer
scheduler_disc = transformers.get_cosine_schedule_with_warmup(
opt_disc, num_warmup_steps=self.hparams.num_warmup_steps, num_training_steps=max_steps,
)
scheduler_gen = transformers.get_cosine_schedule_with_warmup(
opt_gen, num_warmup_steps=self.hparams.num_warmup_steps, num_training_steps=max_steps,
)
return (
[opt_disc, opt_gen],
[{"scheduler": scheduler_disc, "interval": "step"}, {"scheduler": scheduler_gen, "interval": "step"}],
)
def forward(self, audio_input, **kwargs):
features, _, commit_loss = self.feature_extractor(audio_input, **kwargs)
# print('1111', self.feature_extractor.state_dict()['encodec.decoder.model.3.convtr.convtr.weight_g'])
x = self.backbone(features, **kwargs)
audio_output = self.head(x)
return audio_output, commit_loss
def training_step(self, batch, batch_idx, optimizer_idx, **kwargs):
audio_input = batch
# train discriminator
if optimizer_idx == 0 and self.train_discriminator:
with torch.no_grad():
audio_hat, _ = self(audio_input, **kwargs)
loss_dac = self.dacdiscriminator.discriminator_loss(audio_hat.unsqueeze(1), audio_input.unsqueeze(1))
real_score_mp, gen_score_mp, _, _ = self.multiperioddisc(y=audio_input, y_hat=audio_hat, **kwargs,)
real_score_mrd, gen_score_mrd, _, _ = self.multiresddisc(y=audio_input, y_hat=audio_hat, **kwargs,)
loss_mp, loss_mp_real, _ = self.disc_loss(
disc_real_outputs=real_score_mp, disc_generated_outputs=gen_score_mp
)
loss_mrd, loss_mrd_real, _ = self.disc_loss(
disc_real_outputs=real_score_mrd, disc_generated_outputs=gen_score_mrd
)
loss_mp /= len(loss_mp_real)
loss_mrd /= len(loss_mrd_real)
loss = loss_mp + self.hparams.mrd_loss_coeff * loss_mrd + loss_dac
self.log("discriminator/total", loss, prog_bar=True)
self.log("discriminator/multi_period_loss", loss_mp)
self.log("discriminator/multi_res_loss", loss_mrd)
self.log("discriminator/dac", loss_dac)
return loss
# train generator
if optimizer_idx == 1:
audio_hat, commit_loss = self(audio_input, **kwargs)
if self.train_discriminator:
loss_dac_1, loss_dac_2 = self.dacdiscriminator.generator_loss(audio_hat.unsqueeze(1), audio_input.unsqueeze(1))
_, gen_score_mp, fmap_rs_mp, fmap_gs_mp = self.multiperioddisc(
y=audio_input, y_hat=audio_hat, **kwargs,
)
_, gen_score_mrd, fmap_rs_mrd, fmap_gs_mrd = self.multiresddisc(
y=audio_input, y_hat=audio_hat, **kwargs,
)
loss_gen_mp, list_loss_gen_mp = self.gen_loss(disc_outputs=gen_score_mp)
loss_gen_mrd, list_loss_gen_mrd = self.gen_loss(disc_outputs=gen_score_mrd)
loss_gen_mp = loss_gen_mp / len(list_loss_gen_mp)
loss_gen_mrd = loss_gen_mrd / len(list_loss_gen_mrd)
loss_fm_mp = self.feat_matching_loss(fmap_r=fmap_rs_mp, fmap_g=fmap_gs_mp) / len(fmap_rs_mp)
loss_fm_mrd = self.feat_matching_loss(fmap_r=fmap_rs_mrd, fmap_g=fmap_gs_mrd) / len(fmap_rs_mrd)
self.log("generator/multi_period_loss", loss_gen_mp)
self.log("generator/multi_res_loss", loss_gen_mrd)
self.log("generator/feature_matching_mp", loss_fm_mp)
self.log("generator/feature_matching_mrd", loss_fm_mrd)
self.log("generator/loss_dac_1", loss_dac_1)
self.log("generator/loss_dac_2", loss_dac_2)
else:
loss_gen_mp = loss_gen_mrd = loss_fm_mp = loss_fm_mrd = loss_dac_1 = loss_dac_2 = 0  # zero the DAC terms too, so the loss sum below never raises a NameError
mel_loss = self.melspec_loss(audio_hat, audio_input)
loss = (
loss_gen_mp
+ self.hparams.mrd_loss_coeff * loss_gen_mrd
+ loss_fm_mp
+ self.hparams.mrd_loss_coeff * loss_fm_mrd
+ self.mel_loss_coeff * mel_loss
+ 1000 * commit_loss
+ loss_dac_1
+ loss_dac_2
)
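# Loss weighting: the MRD terms share mrd_loss_coeff, the mel term uses the (possibly decayed)
# mel_loss_coeff, and the RVQ commitment penalty is up-weighted by 1000, presumably to keep the
# codebooks tracking the encoder output.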
self.log("generator/total_loss", loss, prog_bar=True)
self.log("mel_loss_coeff", self.mel_loss_coeff)
self.log("generator/mel_loss", mel_loss)
self.log("commit_loss", commit_loss)
if self.global_step % 1000 == 0 and self.global_rank == 0:
self.logger.experiment.add_audio(
"train/audio_in", audio_input[0].data.cpu(), self.global_step, self.hparams.sample_rate
)
self.logger.experiment.add_audio(
"train/audio_pred", audio_hat[0].data.cpu(), self.global_step, self.hparams.sample_rate
)
with torch.no_grad():
mel = safe_log(self.melspec_loss.mel_spec(audio_input[0]))
mel_hat = safe_log(self.melspec_loss.mel_spec(audio_hat[0]))
self.logger.experiment.add_image(
"train/mel_target",
plot_spectrogram_to_numpy(mel.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
self.logger.experiment.add_image(
"train/mel_pred",
plot_spectrogram_to_numpy(mel_hat.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
return loss
def on_validation_epoch_start(self):
if self.hparams.evaluate_utmos:
from metrics.UTMOS import UTMOSScore
if not hasattr(self, "utmos_model"):
self.utmos_model = UTMOSScore(device=self.device)
def validation_step(self, batch, batch_idx, **kwargs):
audio_input = batch
audio_hat, commit_loss = self(audio_input, **kwargs)
audio_16_khz = torchaudio.functional.resample(audio_input, orig_freq=self.hparams.sample_rate, new_freq=16000)
audio_hat_16khz = torchaudio.functional.resample(audio_hat, orig_freq=self.hparams.sample_rate, new_freq=16000)
if self.hparams.evaluate_periodicty:
from metrics.periodicity import calculate_periodicity_metrics
periodicity_loss, pitch_loss, f1_score = calculate_periodicity_metrics(audio_16_khz, audio_hat_16khz)
else:
periodicity_loss = pitch_loss = f1_score = 0
if self.hparams.evaluate_utmos:
utmos_score = self.utmos_model.score(audio_hat_16khz.unsqueeze(1)).mean()
else:
utmos_score = torch.zeros(1, device=self.device)
if self.hparams.evaluate_pesq:
from pesq import pesq
pesq_score = 0
for ref, deg in zip(audio_16_khz.cpu().numpy(), audio_hat_16khz.cpu().numpy()):
pesq_score += pesq(16000, ref, deg, "wb", on_error=1)
pesq_score /= len(audio_16_khz)
pesq_score = torch.tensor(pesq_score)
else:
pesq_score = torch.zeros(1, device=self.device)
mel_loss = self.melspec_loss(audio_hat.unsqueeze(1), audio_input.unsqueeze(1))
total_loss = mel_loss + (5 - utmos_score) + (5 - pesq_score) + 1000 * commit_loss
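# UTMOS and PESQ are quality scores (higher is better) on roughly 1-5 scales,
# so (5 - score) folds them into the minimised loss.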
return {
"val_loss": total_loss,
"mel_loss": mel_loss,
"utmos_score": utmos_score,
"pesq_score": pesq_score,
"periodicity_loss": periodicity_loss,
"pitch_loss": pitch_loss,
"f1_score": f1_score,
"audio_input": audio_input[0],
"audio_pred": audio_hat[0],
}
def validation_epoch_end(self, outputs):
if self.global_rank == 0:
*_, audio_in, audio_pred = outputs[0].values()
self.logger.experiment.add_audio(
"val_in", audio_in.data.cpu().numpy(), self.global_step, self.hparams.sample_rate
)
self.logger.experiment.add_audio(
"val_pred", audio_pred.data.cpu().numpy(), self.global_step, self.hparams.sample_rate
)
mel_target = safe_log(self.melspec_loss.mel_spec(audio_in))
mel_hat = safe_log(self.melspec_loss.mel_spec(audio_pred))
self.logger.experiment.add_image(
"val_mel_target",
plot_spectrogram_to_numpy(mel_target.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
self.logger.experiment.add_image(
"val_mel_hat",
plot_spectrogram_to_numpy(mel_hat.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
mel_loss = torch.stack([x["mel_loss"] for x in outputs]).mean()
utmos_score = torch.stack([x["utmos_score"] for x in outputs]).mean()
pesq_score = torch.stack([x["pesq_score"] for x in outputs]).mean()
periodicity_loss = np.array([x["periodicity_loss"] for x in outputs]).mean()
pitch_loss = np.array([x["pitch_loss"] for x in outputs]).mean()
f1_score = np.array([x["f1_score"] for x in outputs]).mean()
self.log("val_loss", avg_loss, sync_dist=True)
self.log("val/mel_loss", mel_loss, sync_dist=True)
self.log("val/utmos_score", utmos_score, sync_dist=True)
self.log("val/pesq_score", pesq_score, sync_dist=True)
self.log("val/periodicity_loss", periodicity_loss, sync_dist=True)
self.log("val/pitch_loss", pitch_loss, sync_dist=True)
self.log("val/f1_score", f1_score, sync_dist=True)
@property
def global_step(self):
"""
Override global_step so that it returns the total number of batches processed
"""
return self.trainer.fit_loop.epoch_loop.total_batch_idx
def on_train_batch_start(self, *args):
if self.global_step >= self.hparams.pretrain_mel_steps:
self.train_discriminator = True
else:
self.train_discriminator = False
def on_train_batch_end(self, *args):
def mel_loss_coeff_decay(current_step, num_cycles=0.5):
max_steps = self.trainer.max_steps // 2
if current_step < self.hparams.num_warmup_steps:
return 1.0
progress = float(current_step - self.hparams.num_warmup_steps) / float(
max(1, max_steps - self.hparams.num_warmup_steps)
)
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
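# Same half-cosine shape as transformers.get_cosine_schedule_with_warmup, so the mel
# coefficient decays in step with the LR schedulers configured above.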
if self.hparams.decay_mel_coeff:
self.mel_loss_coeff = self.base_mel_coeff * mel_loss_coeff_decay(self.global_step + 1)
class WavTokenizer(VocosExp):
"""
WavTokenizer is a subclass of VocosExp that overrides the parent experiment to function as a conditional GAN.
It manages an additional `bandwidth_id` attribute, which denotes a learnable embedding corresponding to
a specific bandwidth value of EnCodec. During training, a random bandwidth_id is generated for each step,
while during validation, a fixed bandwidth_id is used.
"""
def __init__(
self,
feature_extractor: FeatureExtractor,
backbone: Backbone,
head: FourierHead,
resume_config: str,
resume_model: str,
sample_rate: int = 24000,
initial_learning_rate: float = 2e-4,
num_warmup_steps: int = 0,
mel_loss_coeff: float = 45,
mrd_loss_coeff: float = 1.0,
pretrain_mel_steps: int = 0,
decay_mel_coeff: bool = False,
evaluate_utmos: bool = False,
evaluate_pesq: bool = False,
evaluate_periodicty: bool = False,
resume: bool = False,
):
super().__init__(
feature_extractor,
backbone,
head,
resume_config,
resume_model,
sample_rate,
initial_learning_rate,
num_warmup_steps,
mel_loss_coeff,
mrd_loss_coeff,
pretrain_mel_steps,
decay_mel_coeff,
evaluate_utmos,
evaluate_pesq,
evaluate_periodicty,
resume
)
# Override with conditional discriminators
self.multiperioddisc = MultiPeriodDiscriminator(num_embeddings=len(self.feature_extractor.bandwidths))
self.multiresddisc = MultiResolutionDiscriminator(num_embeddings=len(self.feature_extractor.bandwidths))
self.dac = DACDiscriminator()
if self.resume:
print('Loading pretrained model:', self.resume_model)
# with open(self.resume_config, "r") as f:
# config = yaml.safe_load(f)
# feature_extractor = instantiate_class(args=(), init=config['model']['init_args']["feature_extractor"])
# backbone = instantiate_class(args=(), init=config['model']['init_args']["backbone"])
# head = instantiate_class(args=(), init=config['model']['init_args']["head"])
# Selectively load checkpoint weights; part of the quantizer is skipped (only RVQ layers 0-7 are kept below)
state_dict_raw = torch.load(self.resume_model, map_location=self.device)['state_dict']
state_dict_fa_qa = dict()
state_dict_fa_en = dict()
state_dict_fa_de = dict()
state_dict_bb = dict()
state_dict_hd = dict()
state_dict_mp = dict()
state_dict_mr = dict()
state_dict_dac = dict()
for k, v in state_dict_raw.items():
# Quantizer keys look like 'feature_extractor.encodec.quantizer.vq.layers.<i>...';
# keep only RVQ layers 0-7 and strip the 36-char module prefix.
if k.startswith('feature_extractor.encodec.quantizer'):
ss = k[46:48]  # layer-index slice: '<d>.' for single digits, '<dd>' otherwise
if ss[-1] == '.':  # single-digit layer index
num = int(ss[0])
if num <= 7:
state_dict_fa_qa[k[36:]] = v
if k.startswith('feature_extractor.encodec.encoder'):
state_dict_fa_en[k[34:]] = v  # strip 'feature_extractor.encodec.encoder.'
if k.startswith('feature_extractor.encodec.decoder'):
state_dict_fa_de[k[34:]] = v  # strip 'feature_extractor.encodec.decoder.'
if k.startswith('backbone.'):
state_dict_bb[k[9:]] = v
if k.startswith('head.'):
state_dict_hd[k[5:]] = v
if k.startswith('multiperioddisc.'):
state_dict_mp[k[16:]] = v
if k.startswith('multiresddisc.'):
state_dict_mr[k[14:]] = v
if k.startswith('dac.'):
state_dict_dac[k[4:]] = v
feature_extractor.encodec.encoder.load_state_dict(state_dict_fa_en, strict=True)
feature_extractor.encodec.decoder.load_state_dict(state_dict_fa_de, strict=True)
feature_extractor.encodec.quantizer.load_state_dict(state_dict_fa_qa, strict=True)
backbone.load_state_dict(state_dict_bb, strict=True)
head.load_state_dict(state_dict_hd, strict=True)
self.feature_extractor = feature_extractor.to(self.device)
self.backbone = backbone.to(self.device)
self.head = head.to(self.device)
self.multiperioddisc.load_state_dict(state_dict_mp, strict=True)
self.multiresddisc.load_state_dict(state_dict_mr, strict=True)
self.dac.load_state_dict(state_dict_dac, strict=True)
def training_step(self, *args):
bandwidth_id = torch.randint(low=0, high=len(self.feature_extractor.bandwidths), size=(1,), device=self.device,)
output = super().training_step(*args, bandwidth_id=bandwidth_id)
return output
def validation_step(self, *args):
bandwidth_id = torch.tensor([0], device=self.device)
output = super().validation_step(*args, bandwidth_id=bandwidth_id)
return output
def validation_epoch_end(self, outputs):
if self.global_rank == 0:
*_, audio_in, _ = outputs[0].values()
# Resynthesis with encodec for reference
self.feature_extractor.encodec.set_target_bandwidth(self.feature_extractor.bandwidths[0])
encodec_audio = self.feature_extractor.encodec(audio_in[None, None, :])
self.logger.experiment.add_audio(
"encodec", encodec_audio[0, 0].data.cpu().numpy(), self.global_step, self.hparams.sample_rate,
)
super().validation_epoch_end(outputs)
from typing import List
import torch
import torchaudio
from torch import nn
import math
# from inspiremusic.wavtokenizer.decoder.modules import safe_log
from inspiremusic.wavtokenizer.encoder.modules import SEANetEncoder, SEANetDecoder
from inspiremusic.wavtokenizer.encoder import EncodecModel
from inspiremusic.wavtokenizer.encoder.quantization import ResidualVectorQuantizer
def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
"""
Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.
Args:
x (Tensor): Input tensor.
clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.
Returns:
Tensor: Element-wise logarithm of the input tensor with clipping applied.
"""
return torch.log(torch.clip(x, min=clip_val))
def symlog(x: torch.Tensor) -> torch.Tensor:
return torch.sign(x) * torch.log1p(x.abs())
def symexp(x: torch.Tensor) -> torch.Tensor:
return torch.sign(x) * (torch.exp(x.abs()) - 1)
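# symlog/symexp are a sign-preserving log-compression pair (symexp(symlog(x)) == x),
# useful for squashing targets with a large dynamic range.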
class FeatureExtractor(nn.Module):
"""Base class for feature extractors."""
def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
"""
Extract features from the given audio.
Args:
audio (Tensor): Input audio waveform.
Returns:
Tensor: Extracted features of shape (B, C, L), where B is the batch size,
C denotes output features, and L is the sequence length.
"""
raise NotImplementedError("Subclasses must implement the forward method.")
class MelSpectrogramFeatures(FeatureExtractor):
def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, n_mels=100, padding="center"):
super().__init__()
if padding not in ["center", "same"]:
raise ValueError("Padding must be 'center' or 'same'.")
self.padding = padding
self.mel_spec = torchaudio.transforms.MelSpectrogram(
sample_rate=sample_rate,
n_fft=n_fft,
hop_length=hop_length,
n_mels=n_mels,
center=padding == "center",
power=1,
)
def forward(self, audio, **kwargs):
if self.padding == "same":
pad = self.mel_spec.win_length - self.mel_spec.hop_length
audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")
mel = self.mel_spec(audio)
features = safe_log(mel)
return features
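# Shape sketch: with the defaults above, a (B, T) waveform maps to (B, 100, T // 256 + 1)
# log-magnitude mel features, e.g. MelSpectrogramFeatures()(torch.randn(2, 24000)) -> (2, 100, 94).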
class EncodecFeatures(FeatureExtractor):
def __init__(
self,
encodec_model: str = "encodec_24khz",
bandwidths: List[float] = [1.5, 3.0, 6.0, 12.0],
train_codebooks: bool = False,
num_quantizers: int = 1,
dowmsamples: List[int] = [6, 5, 5, 4],  # encoder downsampling ratios ('dowmsamples' spelling kept: configs reference this kwarg by name)
vq_bins: int = 16384,
vq_kmeans: int = 800,
):
super().__init__()
self.frame_rate = 25  # passed to the quantizer in forward()/infer() below
# n_q = int(bandwidths[-1]*1000/(math.log2(2048) * self.frame_rate))
n_q = num_quantizers  # number of residual VQ layers (overrides the bandwidth-derived formula above)
encoder = SEANetEncoder(causal=False, n_residual_layers=1, norm='weight_norm', pad_mode='reflect', lstm=2,
dimension=512, channels=1, n_filters=32, ratios=dowmsamples, activation='ELU',
kernel_size=7, residual_kernel_size=3, last_kernel_size=7, dilation_base=2,
true_skip=False, compress=2)
decoder = SEANetDecoder(causal=False, n_residual_layers=1, norm='weight_norm', pad_mode='reflect', lstm=2,
dimension=512, channels=1, n_filters=32, ratios=[8, 5, 4, 2], activation='ELU',
kernel_size=7, residual_kernel_size=3, last_kernel_size=7, dilation_base=2,
true_skip=False, compress=2)
quantizer = ResidualVectorQuantizer(dimension=512, n_q=n_q, bins=vq_bins, kmeans_iters=vq_kmeans,
decay=0.99, kmeans_init=True)
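# The residual VQ above cascades n_q codebooks of vq_bins entries each, with k-means
# initialisation (vq_kmeans iterations) and EMA codebook decay 0.99.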
if encodec_model == "encodec_24khz":
self.encodec = EncodecModel(encoder=encoder, decoder=decoder, quantizer=quantizer,
target_bandwidths=bandwidths, sample_rate=24000, channels=1)
else:
raise ValueError(
f"Unsupported encodec_model: {encodec_model}. Supported options are 'encodec_24khz'."
)
for param in self.encodec.parameters():
param.requires_grad = True
# self.num_q = n_q
# codebook_weights = torch.cat([vq.codebook for vq in self.encodec.quantizer.vq.layers[: self.num_q]], dim=0)
# self.codebook_weights = torch.nn.Parameter(codebook_weights, requires_grad=train_codebooks)
self.bandwidths = bandwidths
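# bandwidth_id sampled in WavTokenizer.training_step indexes into this list; the conditional
# discriminators are built with num_embeddings=len(bandwidths) to match.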
# @torch.no_grad()
# def get_encodec_codes(self, audio):
# audio = audio.unsqueeze(1)
# emb = self.encodec.encoder(audio)
# codes = self.encodec.quantizer.encode(emb, self.encodec.frame_rate, self.encodec.bandwidth)
# return codes
def forward(self, audio: torch.Tensor, bandwidth_id: torch.Tensor = torch.tensor(0)):
if self.training:
self.encodec.train()
audio = audio.unsqueeze(1)  # (B, T) -> (B, 1, T)
emb = self.encodec.encoder(audio)
q_res = self.encodec.quantizer(emb, self.frame_rate, bandwidth=self.bandwidths[bandwidth_id])
quantized = q_res.quantized
codes = q_res.codes
commit_loss = q_res.penalty  # quantized: (B, D, T'), codes: (n_q, B, T')
return quantized, codes, commit_loss
# codes = self.get_encodec_codes(audio)
# # Instead of summing in the loop, it stores subsequent VQ dictionaries in a single `self.codebook_weights`
# # with offsets given by the number of bins, and finally summed in a vectorized operation.
# offsets = torch.arange(
# 0, self.encodec.quantizer.bins * len(codes), self.encodec.quantizer.bins, device=audio.device
# )
# embeddings_idxs = codes + offsets.view(-1, 1, 1)
# features = torch.nn.functional.embedding(embeddings_idxs, self.codebook_weights).sum(dim=0)
# return features.transpose(1, 2)
def infer(self, audio: torch.Tensor, bandwidth_id: torch.Tensor):
if self.training:
self.encodec.train()
audio = audio.unsqueeze(1)  # (B, T) -> (B, 1, T)
emb = self.encodec.encoder(audio)
q_res = self.encodec.quantizer.infer(emb, self.frame_rate, bandwidth=self.bandwidths[bandwidth_id])
quantized = q_res.quantized
codes = q_res.codes
commit_loss = q_res.penalty  # quantized: (B, D, T'), codes: (n_q, B, T')
return quantized, codes, commit_loss
def _infer(self, audio: torch.Tensor, bandwidth_id: torch.Tensor = torch.tensor(0)):
if self.training:
self.encodec.train()
audio = audio.unsqueeze(1)  # (B, T) -> (B, 1, T)
emb = self.encodec.encoder(audio)
q_res = self.encodec.quantizer.infer(emb, self.frame_rate, bandwidth=self.bandwidths[bandwidth_id])
quantized = q_res.quantized
codes = q_res.codes
commit_loss = q_res.penalty  # quantized: (B, D, T'), codes: (n_q, B, T')
return quantized, codes, commit_loss
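# Usage sketch (hypothetical shapes; T' depends on the encoder downsampling ratios):
#     ef = EncodecFeatures(num_quantizers=8)
#     quantized, codes, commit_loss = ef(torch.randn(2, 24000), bandwidth_id=torch.tensor(0))
#     # quantized: (2, 512, T'), codes: (8, 2, T'), commit_loss: scalar commitment penalty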