Commit 51782715 authored by liugh5

update

parent 8b4e9acd
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 24000
hop_length: 240
win_length: 1024
n_fft: 1024
n_mels: 80
fmin: 50.0
fmax: 8000.0
phone_level_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
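As a reading aid, here is a minimal sketch of how the feature-extraction block above maps onto a mel-spectrogram computation. It assumes librosa is available; the file name and log floor are illustrative, not taken from this repo (preemphasize is False above, so no pre-emphasis is applied):

```python
# Illustrative mel extraction with the 24 kHz parameters above (not the repo's pipeline).
import librosa
import numpy as np

wav, _ = librosa.load("sample.wav", sr=24000)   # sampling_rate
wav = wav / max(1e-8, np.abs(wav).max())        # wav_normalize
wav, _ = librosa.effects.trim(wav, top_db=60)   # trim_silence_threshold_db

mel = librosa.feature.melspectrogram(
    y=wav, sr=24000, n_fft=1024, hop_length=240,
    win_length=1024, n_mels=80, fmin=50.0, fmax=8000.0,
)
log_mel = np.log(np.maximum(mel, 1e-5))         # log compression with an assumed floor
print(log_mel.shape)                            # (80, n_frames)
```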
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 48000
hop_length: 600
win_length: 2400
n_fft: 4096
n_mels: 128
fmin: 0.0
fmax: 12000.0
phone_level_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 8000
hop_length: 100
win_length: 600
n_fft: 2048
n_mels: 80
fmin: 0.0
fmax: 4000.0
phone_level_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 16000
hop_length: 200
win_length: 1000
n_fft: 2048
n_mels: 80
fmin: 0.0
fmax: 8000.0
phone_level_feature: True
se_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16
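The normalization block admits two modes, "mean_std" and "global". A hedged sketch of what each might compute; the exact formulas are assumptions based on common TTS practice (Tacotron-style dB scaling for the "global" branch), not taken from this repo:

```python
import numpy as np

def normalize(log_mel_db, norm_type="mean_std", mean=None, std=None,
              min_level_db=-100.0, max_norm=1.0, symmetric=False):
    """Illustrative normalization; formulas are assumed, not from the repo."""
    if norm_type == "mean_std":
        # Per-dimension standardization with precomputed dataset statistics.
        return (log_mel_db - mean) / std
    # "global": scale dB values into [0, max_norm] (or [-max_norm, max_norm]).
    x = np.clip((log_mel_db - min_level_db) / -min_level_db, 0.0, 1.0)
    return (2.0 * x - 1.0) * max_norm if symmetric else x * max_norm
```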
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 11, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: false
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
nsf_params:
nb_harmonics: 7
sampling_rate: 16000
nsf_norm_type: "global" # "mean_std" or "global"
nsf_f0_global_minimum: 30.0
nsf_f0_global_maximum: 730.0
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: False
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
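One invariant worth noting: the generator's total upsampling factor must equal the frame hop of its input features. Here 10 * 5 * 2 * 2 = 200, matching hop_length: 200 of the 16 kHz audio config, and batch_max_steps: 9600 is exactly 48 frames of 200 samples. A small sanity check one could run (a sketch; the variable names mirror the YAML keys above):

```python
import math

upsample_scales = [10, 5, 2, 2]   # Generator.params.upsample_scales
hop_length = 200                  # audio_config.hop_length (16 kHz config)
batch_max_steps = 9600            # data loader setting

assert math.prod(upsample_scales) == hop_length, "generator must upsample by one hop"
assert batch_max_steps % hop_length == 0, "batch_max_steps must be divisible by hop_size"
print(batch_max_steps // hop_length, "mel frames per training clip")  # -> 48
```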
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 11, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: false
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
nsf_params:
nb_harmonics: 7
sampling_rate: 16000
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: False
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
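With the weights in the Loss section, the generator objective is the adversarial term plus a heavily weighted mel-reconstruction term and a feature-matching term. A sketch of how the weighted sum might be assembled (the loss tensors here are placeholders and the function name is illustrative):

```python
import torch

def generator_total_loss(adv_loss, mel_loss, fm_loss):
    # Weights from the Loss section above: adv 1.0, mel 45.0, feat_match 2.0.
    return 1.0 * adv_loss + 45.0 * mel_loss + 2.0 * fm_loss

total = generator_total_loss(torch.tensor(0.9), torch.tensor(0.3), torch.tensor(0.5))
print(float(total))  # 0.9 + 45*0.3 + 2*0.5 = 15.4
```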
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 11, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: false
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [10, 5, 2, 2]
upsample_kernal_sizes: [20, 10, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 16000
fft_size: 2048
hop_size: 200
win_length: 1000
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 512
kernel_size: 7
upsample_scales: [8, 5, 3, 2]
upsample_kernal_sizes: [16, 10, 6, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 24000
fft_size: 1024
hop_size: 240
win_length: 1024
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 128
out_channels: 1
channels: 512
kernel_size: 7
upsample_scales: [10, 5, 3, 2, 2]
upsample_kernal_sizes: [20, 10, 6, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 48000
fft_size: 4096
hop_size: 600
win_length: 2400
window: "hann"
num_mels: 128
fmin: 0
fmax: 12000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 19200 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 256
kernel_size: 7
upsample_scales: [5, 5, 2, 2]
upsample_kernal_sizes: [10, 10, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 8000
fft_size: 2048
hop_size: 100
win_length: 600
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 6000 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: True
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: hifigan
Model:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator:
params:
in_channels: 80
out_channels: 1
channels: 512
kernel_size: 7
upsample_scales: [8, 5, 3, 2]
upsample_kernal_sizes: [16, 10, 6, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilations:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
bias: true
causal: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: true
nsf_params:
nb_harmonics: 7
sampling_rate: 24000
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator:
params:
scales: 3
downsample_pooling: "DWT"
downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: true
downsample_scales: [4, 4, 4, 4, 1]
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
MultiPeriodDiscriminator:
params:
periods: [2, 3, 5, 7, 11]
discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: true
nonlinear_activation: "LeakyReLU"
nonlinear_activation_params:
negative_slope: 0.1
use_spectral_norm: false
optimizer:
type: Adam
params:
lr: 2.0e-4
betas: [0.5, 0.9]
weight_decay: 0.0
scheduler:
type: MultiStepLR
params:
gamma: 0.5
milestones:
- 200000
- 400000
- 600000
- 800000
####################################################
# LOSS SETTING #
####################################################
Loss:
generator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
discriminator_adv_loss:
enable: True
params:
average_by_discriminators: False
weights: 1.0
stft_loss:
enable: False # Whether to use multi-resolution STFT loss.
mel_loss:
enable: True
params:
fs: 24000
fft_size: 1024
hop_size: 240
win_length: 1024
window: "hann"
num_mels: 80
fmin: 0
fmax: 8000
log_base: null
weights: 45.0
subband_stft_loss:
enable: False
params:
fft_sizes: [384, 683, 171] # List of FFT sizes for the STFT-based loss.
hop_sizes: [35, 75, 15] # List of hop sizes for the STFT-based loss.
win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
window: "hann_window" # Window function for the STFT-based loss.
feat_match_loss:
enable: True
params:
average_by_discriminators: false
average_by_layers: false
weights: 2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
batch_max_steps: 9600 # Length of each audio segment in a batch. Must be divisible by hop_size.
pin_memory: False
num_workers: 2 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
generator_grad_norm: -1
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Step at which generator training starts.
discriminator_train_start_steps: 0 # Step at which discriminator training starts.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
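The nsf_params block enables a neural-source-filter style excitation: a harmonic sine source built from F0 with nb_harmonics: 7 overtones at the 24 kHz target rate. A rough numpy sketch of such a source signal follows; this is a generic NSF-style construction with assumed amplitude and noise constants, not this repo's exact module:

```python
import numpy as np

def harmonic_source(f0, sr=24000, nb_harmonics=7, amp=0.1):
    """f0: per-sample F0 in Hz (0 where unvoiced). Returns a sine-mixture excitation."""
    phase = 2.0 * np.pi * np.cumsum(f0 / sr)            # fundamental phase track
    voiced = (f0 > 0).astype(np.float64)
    # Fundamental plus nb_harmonics overtones, averaged into one excitation.
    harmonics = [np.sin((k + 1) * phase) for k in range(1 + nb_harmonics)]
    src = amp * voiced * np.mean(harmonics, axis=0)
    noise = 0.003 * np.random.randn(len(f0))            # noise branch for unvoiced parts
    return src + noise

f0 = np.full(24000, 220.0)        # one second of a 220 Hz tone
print(harmonic_source(f0).shape)  # (24000,)
```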
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
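The NoamLR scheduler follows the Transformer warmup schedule: the rate rises for warmup_steps: 4000 steps and then decays as the inverse square root of the step. A sketch of the usual formula; the repo's exact scaling may differ:

```python
def noam_lr(step, base_lr=0.001, warmup_steps=4000):
    """Transformer-style warmup: linear ramp, then step**-0.5 decay."""
    step = max(step, 1)
    scale = warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return base_lr * scale

print(noam_lr(4000))   # peak: base_lr
print(noam_lr(16000))  # decayed to base_lr / 2
```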
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
AttentionCTCLoss:
enable: True
AttentionBinarizationLoss:
enable: True
params:
start_epoch: 0
warmup_epoch: 100
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
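AttentionBinarizationLoss is ramped in by epoch, starting at start_epoch: 0 and reaching full strength after warmup_epoch: 100. A plausible weight schedule is sketched below; the linear ramp is an assumption, the repo may ramp differently:

```python
def binarization_weight(epoch, start_epoch=0, warmup_epoch=100):
    """Assumed linear ramp of the binarization loss weight from 0 to 1."""
    if epoch < start_epoch:
        return 0.0
    return min(1.0, (epoch - start_epoch) / float(warmup_epoch))

print(binarization_weight(50))   # 0.5
print(binarization_weight(150))  # 1.0
```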
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: True
using_byte: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: byte_index,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
AttentionCTCLoss:
enable: True
AttentionBinarizationLoss:
enable: True
params:
start_epoch: 0
warmup_epoch: 100
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 8
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
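With using_byte: True the front end replaces the sy/tone/syllable_flag/word_segment features with raw UTF-8 byte indices (lfeat_type_list: byte_index,emo_category,speaker_category). A sketch of byte tokenization; the index offset and padding convention here are assumptions:

```python
def text_to_byte_ids(text, offset=1):
    """Map text to UTF-8 byte indices; index 0 is assumed reserved for padding."""
    return [b + offset for b in text.encode("utf-8")]

print(text_to_byte_ids("Hi!"))  # [73, 106, 34]
```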
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 900
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 128
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 80
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
FP: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7,F74,M7,FBYN,FRXL,xiaoyu
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
FpCELoss:
enable: True
params:
loss_type: ce
weight: [1,4,4,8]
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
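FpCELoss uses class-weighted cross-entropy with weight: [1,4,4,8], upweighting the rarer filled-pause classes. A minimal PyTorch sketch; the class semantics are an assumption (index 0 presumably meaning "no pause"):

```python
import torch
import torch.nn as nn

# Class-weighted CE as configured: rare filled-pause classes count 4-8x more.
criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 4.0, 4.0, 8.0]))

logits = torch.randn(16, 4)           # (batch, num_fp_classes)
targets = torch.randint(0, 4, (16,))  # ground-truth class per token
loss = criterion(logits, targets)
print(loss.item())
```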
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 82
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
NSF: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 10000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 2300500 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
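With NSF: True the acoustic target grows from 80 to 82 channels (num_mels: 82). A common construction, assumed here rather than confirmed by the repo, is to append per-frame F0 and a voiced/unvoiced flag to the 80-band mel:

```python
import numpy as np

n_frames = 200
mel = np.random.randn(n_frames, 80)   # 80-band log-mel
f0 = np.random.rand(n_frames, 1)      # normalized per-frame F0 (placeholder values)
uv = (f0 > 0.1).astype(np.float32)    # voiced/unvoiced flag

features = np.concatenate([mel, f0, uv], axis=1)
print(features.shape)  # (200, 82) -> matches num_mels: 82
```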
model_type: sambert
Model:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT:
params:
max_len: 800
embedding_dim: 512
encoder_num_layers: 8
encoder_num_heads: 8
encoder_num_units: 128
encoder_ffn_inner_dim: 1024
encoder_dropout: 0.1
encoder_attention_dropout: 0.1
encoder_relu_dropout: 0.1
encoder_projection_units: 32
speaker_units: 32
emotion_units: 32
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_num_memory_units: 128
predictor_ffn_inner_dim: 256
predictor_dropout: 0.1
predictor_shift: 0
predictor_lstm_units: 128
dur_pred_prenet_units: [128, 128]
dur_pred_lstm_units: 128
decoder_prenet_units: [256, 256]
decoder_num_layers: 12
decoder_num_heads: 8
decoder_num_units: 128
decoder_ffn_inner_dim: 1024
decoder_dropout: 0.1
decoder_attention_dropout: 0.1
decoder_relu_dropout: 0.1
outputs_per_step: 3
num_mels: 82
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_num_memory_units: 256
postnet_ffn_inner_dim: 512
postnet_dropout: 0.1
postnet_shift: 17
postnet_lstm_units: 128
MAS: False
NSF: True
optimizer:
type: Adam
params:
lr: 0.001
betas: [0.9, 0.98]
eps: 1.0e-9
weight_decay: 0.0
scheduler:
type: NoamLR
params:
warmup_steps: 4000
linguistic_unit:
cleaners: english_cleaners
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list: F7
####################################################
# LOSS SETTING #
####################################################
Loss:
MelReconLoss:
enable: True
params:
loss_type: mae
ProsodyReconLoss:
enable: True
params:
loss_type: mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32
pin_memory: False
num_workers: 4 # FIXME: values > 0 may hang on macOS
remove_short_samples: False
allow_cache: True
grad_norm: 1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 1000000 # Number of training steps.
save_interval_steps: 20000 # Interval steps to save checkpoint.
eval_interval_steps: 10000 # Interval steps to evaluate the network.
log_interval_steps: 1000 # Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.