Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
sambert-hifigan_pytorch
Commits
51782715
Commit
51782715
authored
Feb 23, 2024
by
liugh5
Browse files
update
parent
8b4e9acd
Changes
182
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2512 deletions
+0
-2512
kantts/configs/audio_config_24k.yaml
kantts/configs/audio_config_24k.yaml
+0
-27
kantts/configs/audio_config_48k.yaml
kantts/configs/audio_config_48k.yaml
+0
-27
kantts/configs/audio_config_8k.yaml
kantts/configs/audio_config_8k.yaml
+0
-28
kantts/configs/audio_config_se_16k.yaml
kantts/configs/audio_config_se_16k.yaml
+0
-28
kantts/configs/hifigan_noncausal_nsf_global_v1_16k.yaml
kantts/configs/hifigan_noncausal_nsf_global_v1_16k.yaml
+0
-195
kantts/configs/hifigan_noncausal_nsf_v1_16k.yaml
kantts/configs/hifigan_noncausal_nsf_v1_16k.yaml
+0
-192
kantts/configs/hifigan_noncausal_v1_16k.yaml
kantts/configs/hifigan_noncausal_v1_16k.yaml
+0
-188
kantts/configs/hifigan_v1_16k.yaml
kantts/configs/hifigan_v1_16k.yaml
+0
-188
kantts/configs/hifigan_v1_24k.yaml
kantts/configs/hifigan_v1_24k.yaml
+0
-188
kantts/configs/hifigan_v1_48k.yaml
kantts/configs/hifigan_v1_48k.yaml
+0
-188
kantts/configs/hifigan_v1_8k.yaml
kantts/configs/hifigan_v1_8k.yaml
+0
-188
kantts/configs/hifigan_v1_nsf_24k.yaml
kantts/configs/hifigan_v1_nsf_24k.yaml
+0
-191
kantts/configs/sambert_16k.yaml
kantts/configs/sambert_16k.yaml
+0
-105
kantts/configs/sambert_16k_MAS.yaml
kantts/configs/sambert_16k_MAS.yaml
+0
-118
kantts/configs/sambert_16k_MAS_byte.yaml
kantts/configs/sambert_16k_MAS_byte.yaml
+0
-119
kantts/configs/sambert_24k.yaml
kantts/configs/sambert_24k.yaml
+0
-107
kantts/configs/sambert_48k.yaml
kantts/configs/sambert_48k.yaml
+0
-107
kantts/configs/sambert_fp_8k.yaml
kantts/configs/sambert_fp_8k.yaml
+0
-113
kantts/configs/sambert_nsf_16k.yaml
kantts/configs/sambert_nsf_16k.yaml
+0
-107
kantts/configs/sambert_nsf_24k.yaml
kantts/configs/sambert_nsf_24k.yaml
+0
-108
No files found.
kantts/configs/audio_config_24k.yaml
deleted
100644 → 0
View file @
8b4e9acd
# Audio processing configs
audio_config
:
# Preprocess
wav_normalize
:
True
trim_silence
:
True
trim_silence_threshold_db
:
60
preemphasize
:
False
# Feature extraction
sampling_rate
:
24000
hop_length
:
240
win_length
:
1024
n_fft
:
1024
n_mels
:
80
fmin
:
50.0
fmax
:
8000.0
phone_level_feature
:
True
# Normalization
norm_type
:
"
mean_std"
# "mean_std" or "global"
max_norm
:
1.0
symmetric
:
False
min_level_db
:
-100.0
ref_level_db
:
20
num_workers
:
16
kantts/configs/audio_config_48k.yaml
deleted
100644 → 0
View file @
8b4e9acd
# Audio processing configs
audio_config
:
# Preprocess
wav_normalize
:
True
trim_silence
:
True
trim_silence_threshold_db
:
60
preemphasize
:
False
# Feature extraction
sampling_rate
:
48000
hop_length
:
600
win_length
:
2400
n_fft
:
4096
n_mels
:
128
fmin
:
0.0
fmax
:
12000.0
phone_level_feature
:
True
# Normalization
norm_type
:
"
mean_std"
# "mean_std" or "global"
max_norm
:
1.0
symmetric
:
False
min_level_db
:
-100.0
ref_level_db
:
20
num_workers
:
16
kantts/configs/audio_config_8k.yaml
deleted
100644 → 0
View file @
8b4e9acd
# Audio processing configs
audio_config
:
# Preprocess
wav_normalize
:
True
trim_silence
:
True
trim_silence_threshold_db
:
60
preemphasize
:
False
# Feature extraction
sampling_rate
:
8000
hop_length
:
100
win_length
:
600
n_fft
:
2048
n_mels
:
80
fmin
:
0.0
fmax
:
4000.0
phone_level_feature
:
True
# Normalization
norm_type
:
"
mean_std"
# "mean_std" or "global"
max_norm
:
1.0
symmetric
:
False
min_level_db
:
-100.0
ref_level_db
:
20
num_workers
:
16
kantts/configs/audio_config_se_16k.yaml
deleted
100644 → 0
View file @
8b4e9acd
# Audio processing configs
audio_config
:
# Preprocess
wav_normalize
:
True
trim_silence
:
True
trim_silence_threshold_db
:
60
preemphasize
:
False
# Feature extraction
sampling_rate
:
16000
hop_length
:
200
win_length
:
1000
n_fft
:
2048
n_mels
:
80
fmin
:
0.0
fmax
:
8000.0
phone_level_feature
:
True
se_feature
:
True
# Normalization
norm_type
:
"
mean_std"
# "mean_std" or "global"
max_norm
:
1.0
symmetric
:
False
min_level_db
:
-100.0
ref_level_db
:
20
num_workers
:
16
kantts/configs/hifigan_noncausal_nsf_global_v1_16k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
80
out_channels
:
1
channels
:
256
kernel_size
:
7
upsample_scales
:
[
10
,
5
,
2
,
2
]
upsample_kernal_sizes
:
[
20
,
11
,
4
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
bias
:
true
causal
:
false
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
nsf_params
:
nb_harmonics
:
7
sampling_rate
:
16000
nsf_norm_type
:
"
global"
# "mean_std" , "global"
nsf_f0_global_minimum
:
30.0
nsf_f0_global_maximum
:
730.0
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
16000
fft_size
:
2048
hop_size
:
200
win_length
:
1000
window
:
"
hann"
num_mels
:
80
fmin
:
0
fmax
:
8000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
9600
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
True
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
False
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/hifigan_noncausal_nsf_v1_16k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
80
out_channels
:
1
channels
:
256
kernel_size
:
7
upsample_scales
:
[
10
,
5
,
2
,
2
]
upsample_kernal_sizes
:
[
20
,
11
,
4
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
bias
:
true
causal
:
false
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
nsf_params
:
nb_harmonics
:
7
sampling_rate
:
16000
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
16000
fft_size
:
2048
hop_size
:
200
win_length
:
1000
window
:
"
hann"
num_mels
:
80
fmin
:
0
fmax
:
8000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
9600
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
True
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
False
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/hifigan_noncausal_v1_16k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
80
out_channels
:
1
channels
:
256
kernel_size
:
7
upsample_scales
:
[
10
,
5
,
2
,
2
]
upsample_kernal_sizes
:
[
20
,
11
,
4
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
bias
:
true
causal
:
false
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
16000
fft_size
:
2048
hop_size
:
200
win_length
:
1000
window
:
"
hann"
num_mels
:
80
fmin
:
0
fmax
:
8000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
9600
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
True
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/hifigan_v1_16k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
80
out_channels
:
1
channels
:
256
kernel_size
:
7
upsample_scales
:
[
10
,
5
,
2
,
2
]
upsample_kernal_sizes
:
[
20
,
10
,
4
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
bias
:
true
causal
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
16000
fft_size
:
2048
hop_size
:
200
win_length
:
1000
window
:
"
hann"
num_mels
:
80
fmin
:
0
fmax
:
8000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
9600
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
True
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/hifigan_v1_24k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
80
out_channels
:
1
channels
:
512
kernel_size
:
7
upsample_scales
:
[
8
,
5
,
3
,
2
]
upsample_kernal_sizes
:
[
16
,
10
,
6
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
]
-
[
1
,
3
,
5
]
-
[
1
,
3
,
5
]
bias
:
true
causal
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
24000
fft_size
:
1024
hop_size
:
240
win_length
:
1024
window
:
"
hann"
num_mels
:
80
fmin
:
0
fmax
:
8000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
9600
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
True
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/hifigan_v1_48k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
128
out_channels
:
1
channels
:
512
kernel_size
:
7
upsample_scales
:
[
10
,
5
,
3
,
2
,
2
]
upsample_kernal_sizes
:
[
20
,
10
,
6
,
4
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
bias
:
true
causal
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
48000
fft_size
:
4096
hop_size
:
600
win_length
:
2400
window
:
"
hann"
num_mels
:
128
fmin
:
0
fmax
:
12000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
19200
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
True
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/hifigan_v1_8k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
80
out_channels
:
1
channels
:
256
kernel_size
:
7
upsample_scales
:
[
5
,
5
,
2
,
2
]
upsample_kernal_sizes
:
[
10
,
10
,
4
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
-
[
1
,
3
,
5
,
7
]
bias
:
true
causal
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
8000
fft_size
:
2048
hop_size
:
100
win_length
:
600
window
:
"
hann"
num_mels
:
80
fmin
:
0
fmax
:
8000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
6000
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
True
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/hifigan_v1_nsf_24k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
hifigan
Model
:
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
Generator
:
params
:
in_channels
:
80
out_channels
:
1
channels
:
512
kernel_size
:
7
upsample_scales
:
[
8
,
5
,
3
,
2
]
upsample_kernal_sizes
:
[
16
,
10
,
6
,
4
]
resblock_kernel_sizes
:
[
3
,
7
,
11
]
resblock_dilations
:
-
[
1
,
3
,
5
]
-
[
1
,
3
,
5
]
-
[
1
,
3
,
5
]
bias
:
true
causal
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_weight_norm
:
true
nsf_params
:
nb_harmonics
:
7
sampling_rate
:
24000
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
MultiScaleDiscriminator
:
params
:
scales
:
3
downsample_pooling
:
"
DWT"
downsample_pooling_params
:
kernel_size
:
4
stride
:
2
padding
:
2
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
15
,
41
,
5
,
3
]
channels
:
128
max_downsample_channels
:
1024
max_groups
:
16
bias
:
true
downsample_scales
:
[
4
,
4
,
4
,
4
,
1
]
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
follow_official_norm
:
true
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
MultiPeriodDiscriminator
:
params
:
periods
:
[
2
,
3
,
5
,
7
,
11
]
discriminator_params
:
in_channels
:
1
out_channels
:
1
kernel_sizes
:
[
5
,
3
]
channels
:
32
downsample_scales
:
[
3
,
3
,
3
,
3
,
1
]
max_downsample_channels
:
1024
bias
:
true
nonlinear_activation
:
"
LeakyReLU"
nonlinear_activation_params
:
negative_slope
:
0.1
use_spectral_norm
:
false
optimizer
:
type
:
Adam
params
:
lr
:
2.0e-4
betas
:
[
0.5
,
0.9
]
weight_decay
:
0.0
scheduler
:
type
:
MultiStepLR
params
:
gamma
:
0.5
milestones
:
-
200000
-
400000
-
600000
-
800000
####################################################
# LOSS SETTING #
####################################################
Loss
:
generator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
discriminator_adv_loss
:
enable
:
True
params
:
average_by_discriminators
:
False
weights
:
1.0
stft_loss
:
enable
:
False
# Whether to use multi-resolution STFT loss.
mel_loss
:
enable
:
True
params
:
fs
:
24000
fft_size
:
1024
hop_size
:
240
win_length
:
1024
window
:
"
hann"
num_mels
:
80
fmin
:
0
fmax
:
8000
log_base
:
null
weights
:
45.0
subband_stft_loss
:
enable
:
False
params
:
fft_sizes
:
[
384
,
683
,
171
]
# List of FFT size for STFT-based loss.
hop_sizes
:
[
35
,
75
,
15
]
# List of hop size for STFT-based loss
win_lengths
:
[
150
,
300
,
60
]
# List of window length for STFT-based loss.
window
:
"
hann_window"
# Window function for STFT-based loss
feat_match_loss
:
enable
:
True
params
:
average_by_discriminators
:
false
average_by_layers
:
false
weights
:
2.0
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
batch_max_steps
:
9600
# Length of each audio in batch. Make sure it is divisible by hop_size.
pin_memory
:
False
num_workers
:
2
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
generator_grad_norm
:
-1
discriminator_grad_norm
:
-1
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps
:
1
# Number of steps to start to train generator.
discriminator_train_start_steps
:
0
# Number of steps to start to train discriminator.
train_max_steps
:
2500000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_16k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
800
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
80
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
False
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list
:
F7
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
32
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_16k_MAS.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
800
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
80
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
True
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list
:
F7
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
AttentionCTCLoss
:
enable
:
True
AttentionBinarizationLoss
:
enable
:
True
params
:
start_epoch
:
0
warmup_epoch
:
100
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
32
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_16k_MAS_byte.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
800
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
80
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
True
using_byte
:
True
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
byte_index,emo_category,speaker_category
speaker_list
:
F7
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
AttentionCTCLoss
:
enable
:
True
AttentionBinarizationLoss
:
enable
:
True
params
:
start_epoch
:
0
warmup_epoch
:
100
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
8
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_24k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
800
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
80
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
False
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list
:
F7
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
32
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_48k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
900
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
128
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
False
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list
:
F7
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
32
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_fp_8k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
800
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
80
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
False
FP
:
True
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list
:
F7,F74,M7,FBYN,FRXL,xiaoyu
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
FpCELoss
:
enable
:
True
params
:
loss_type
:
ce
weight
:
[
1
,
4
,
4
,
8
]
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
16
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_nsf_16k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
800
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
82
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
False
NSF
:
True
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list
:
F7,F74,FBYN,FRXL,M7,xiaoyu
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
32
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
10000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
2300500
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
kantts/configs/sambert_nsf_24k.yaml
deleted
100644 → 0
View file @
8b4e9acd
model_type
:
sambert
Model
:
#########################################################
# SAMBERT NETWORK ARCHITECTURE SETTING #
#########################################################
KanTtsSAMBERT
:
params
:
max_len
:
800
embedding_dim
:
512
encoder_num_layers
:
8
encoder_num_heads
:
8
encoder_num_units
:
128
encoder_ffn_inner_dim
:
1024
encoder_dropout
:
0.1
encoder_attention_dropout
:
0.1
encoder_relu_dropout
:
0.1
encoder_projection_units
:
32
speaker_units
:
32
emotion_units
:
32
predictor_filter_size
:
41
predictor_fsmn_num_layers
:
3
predictor_num_memory_units
:
128
predictor_ffn_inner_dim
:
256
predictor_dropout
:
0.1
predictor_shift
:
0
predictor_lstm_units
:
128
dur_pred_prenet_units
:
[
128
,
128
]
dur_pred_lstm_units
:
128
decoder_prenet_units
:
[
256
,
256
]
decoder_num_layers
:
12
decoder_num_heads
:
8
decoder_num_units
:
128
decoder_ffn_inner_dim
:
1024
decoder_dropout
:
0.1
decoder_attention_dropout
:
0.1
decoder_relu_dropout
:
0.1
outputs_per_step
:
3
num_mels
:
82
postnet_filter_size
:
41
postnet_fsmn_num_layers
:
4
postnet_num_memory_units
:
256
postnet_ffn_inner_dim
:
512
postnet_dropout
:
0.1
postnet_shift
:
17
postnet_lstm_units
:
128
MAS
:
False
NSF
:
True
optimizer
:
type
:
Adam
params
:
lr
:
0.001
betas
:
[
0.9
,
0.98
]
eps
:
1.0e-9
weight_decay
:
0.0
scheduler
:
type
:
NoamLR
params
:
warmup_steps
:
4000
linguistic_unit
:
cleaners
:
english_cleaners
lfeat_type_list
:
sy,tone,syllable_flag,word_segment,emo_category,speaker_category
speaker_list
:
F7
####################################################
# LOSS SETTING #
####################################################
Loss
:
MelReconLoss
:
enable
:
True
params
:
loss_type
:
mae
ProsodyReconLoss
:
enable
:
True
params
:
loss_type
:
mae
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size
:
32
pin_memory
:
False
num_workers
:
4
# FIXME: setting this > 0 may hang on macOS
remove_short_samples
:
False
allow_cache
:
True
grad_norm
:
1.0
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps
:
1000000
# Number of training steps.
save_interval_steps
:
20000
# Interval steps to save checkpoint.
eval_interval_steps
:
10000
# Interval steps to evaluate the network.
log_interval_steps
:
1000
# Interval steps to record the training log.
###########################################################
# OTHER SETTING #
###########################################################
num_save_intermediate_results
:
4
# Number of results to be saved as intermediate results.
Prev
1
2
3
4
5
6
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment