# This config contains the default values for training a VITS model on the LJSpeech dataset.
# If you want to train the model on another dataset, change the config values to match it.
# Most dataset-specific arguments are at the head of the config file, see below.

# TODO: remove unnecessary arguments, refactoring

name: VITS

# `???` is the OmegaConf marker for a mandatory value: these keys have no
# default here and must be supplied when the config is used.
# train_dataset / validation_datasets feed the `manifest_filepath` of
# model.train_ds.dataset and model.validation_ds.dataset respectively.
train_dataset: ???
validation_datasets: ???
# Forwarded unchanged to both dataset sections.
sup_data_path: null
sup_data_types: null

# Consumed by model.text_tokenizer.g2p.
phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"

# Default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Audio front-end settings. `model` pulls these in through `${...}`
# interpolation, and the datasets and the preprocessor in turn reference
# `${model.*}`, so each value is defined once here.
sample_rate: 22050
n_mel_channels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: null
window: hann

model:
  # Re-export of the top-level defaults above.
  pitch_fmin: ${pitch_fmin}
  pitch_fmax: ${pitch_fmax}
  sample_rate: ${sample_rate}
  n_mel_channels: ${n_mel_channels}
  n_window_size: ${n_window_size}
  n_window_stride: ${n_window_stride}
  n_fft: ${n_fft}
  lowfreq: ${lowfreq}
  highfreq: ${highfreq}
  window: ${window}
  mel_fmin: 0.0
  mel_fmax: null

  # Also passed to synthesizer.n_speakers below.
  n_speakers: 0
  segment_size: 8192
  c_mel: 45
  c_kl: 1.0
  use_spectral_norm: false

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: false
    g2p:
      _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.8
      # Relies on the heteronyms list for anything that needs to be disambiguated
      ignore_ambiguous_words: false
      use_chars: true
      use_stresses: true

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}

    dataloader_params:
      num_workers: 8
      pin_memory: false

    batch_sampler:
      batch_size: 32
      boundaries: [32, 300, 400, 500, 600, 700, 800, 900, 1000]
      # Follows trainer.devices so the two stay in sync.
      num_replicas: ${trainer.devices}
      shuffle: true

  validation_ds:
    # Same feature settings as train_ds.dataset; only the manifest differs.
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}

    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 16
      num_workers: 4
      pin_memory: false

  preprocessor:
    # Uses the same n_fft / window / mel settings as the datasets above.
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    # Exponent floats carry an explicit decimal point so that plain YAML 1.1
    # loaders also read them as numbers rather than strings.
    log_zero_guard_value: 1.0e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 1
    pad_value: 0
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob: 0
    mag_power: 1.0
    exact_pad: true
    use_grads: true

  synthesizer:
    _target_: nemo.collections.tts.modules.vits_modules.SynthesizerTrn
    inter_channels: 192
    hidden_channels: 192
    filter_channels: 768
    n_heads: 2
    n_layers: 6
    kernel_size: 3
    p_dropout: 0.1
    resblock: "1"
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    # NOTE(review): the product of upsample_rates (8*8*2*2 = 256) equals
    # n_window_stride (256); presumably the two must be changed together.
    # TODO confirm.
    upsample_rates: [8, 8, 2, 2]
    upsample_initial_channel: 512
    upsample_kernel_sizes: [16, 16, 4, 4]
    n_speakers: ${model.n_speakers}
    gin_channels: 256  # for multi-speaker

  optim:
    _target_: torch.optim.AdamW
    # Written with a decimal point so every YAML loader parses them as floats.
    lr: 2.0e-4
    betas: [0.9, 0.99]
    eps: 1.0e-9

    sched:
      name: ExponentialLR
      lr_decay: 0.999875

trainer:
  num_nodes: 1
  # Also read by model.train_ds.batch_sampler.num_replicas.
  devices: 2
  accelerator: gpu
  strategy: ddp
  precision: 32
  # amp_backend: 'apex'
  # amp_level: 'O2'
  # benchmark: true
  max_epochs: -1
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 50
  check_val_every_n_epoch: 1

exp_manager:
  exp_dir: ???
  # Reuses the top-level `name`.
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: loss_gen_all
    mode: min
  resume_if_exists: false
  resume_ignore_no_checkpoint: false