# This config contains the default values for self-supervised pre-training of a Conformer ASR model, large size (~120M).
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train it with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches
# (a short worked example of the effective batch size follows the train_ds section below).
# Here are the recommended configs for different variants of Conformer-CTC; other parameters are the same as in this config file.
# One extra layer (compared to the original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one.
#
#  +--------------+---------+---------+----------+------------+-----+
#  | Model        | d_model | n_heads | n_layers | time_masks | lr  |
#  +==============+=========+=========+==========+============+=====+
#  | Small (13M)  |   176   |    4    |    16    |     5      | 5.0 |
#  +--------------+---------+---------+----------+------------+-----+
#  | Medium (30M) |   256   |    4    |    18    |     5      | 5.0 |
#  +--------------+---------+---------+----------+------------+-----+
#  | Large (121M) |   512   |    8    |    18    |     10     | 2.0 |
#  +--------------+---------+---------+----------+------------+-----+
#
# If you do not want to train with AMP, you may use a weight decay of 0.0 or reduce the number of time masks to 2
# with time_width=100. This may help when you want to train for fewer epochs and need faster convergence.
# With weight_decay=0.0, the learning rate may need to be reduced to 2.0.

name: "Conformer-SSL"

init_from_pretrained_model: "ssl_en_conformer_large"

model:
  sample_rate: 22050
  combined_loss: true
  pitch_augment: true
  augment_sim_alpha: 1.0
  stop_gradient: false
  augment_ctc: true
  aug_loss_type: "cosine"
  pad_multiple: 1

  train_ds:
    manifest_speaker_verification_fp: ???
    manifest_content_fp: ???
    sample_rate: ${model.sample_rate}
    batch_size_content: 8 # you may increase the batch size if your memory allows
    batch_size_sv: 20
    shuffle: true
    num_workers_sv: 4
    num_workers_content: 6
    pin_memory: false
    max_duration_content: 16.7
    min_duration_content: 8.0
    segment_max_duration: 2
    sup_data_path: ???
    pitch_augment: ${model.pitch_augment}
    cache_pitch_augment: true
    pad_multiple: ${model.pad_multiple}
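
  # Note on the 2K effective batch size mentioned in the header: with DDP, the batch sizes
  # above are per device, so the effective batch size is approximately
  #   batch_size * trainer.devices * trainer.num_nodes * trainer.accumulate_grad_batches
  # e.g. batch_size_content=8 on 4 nodes with 8 GPUs each and accumulate_grad_batches=8
  # gives 8 * 32 * 8 = 2048 (~2K).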

  validation_ds:
    manifest_speaker_verification_fp: ???
    manifest_content_fp: ???
    sample_rate: ${model.sample_rate}
    batch_size_content: 4 # you may increase the batch size if your memory allows
    batch_size_sv: 8
    shuffle: false
    num_workers_sv: 0
    num_workers_content: 0
    pin_memory: true
    use_start_end_token: false
    max_duration_content: 16.7
    min_duration_content: 8.0
    segment_max_duration: 2
    sup_data_path: ???
    pitch_augment: ${model.pitch_augment}
    cache_pitch_augment: true
    pad_multiple: ${model.pad_multiple}

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: null
    window_stride: null
    n_window_size: 1024
    n_window_stride: 256
    window: "hann"
    features: 80
    n_fft: 1024
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 48
    mask_patches: 0.5

  downstream_heads:
    task_names: ['speaker_verification', 'content']
    speaker_embed_size: 256
    num_speakers: 5994
    content_embed_size: 128

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # set this if you need an output size different from the default (d_model)
    n_layers: 18
    d_model: 512

    # Sub-sampling params
    subsampling: striding # vggnet or striding; vggnet may give better results but needs more memory
    subsampling_factor: 4 # must be a power of 2
    subsampling_conv_channels: -1 # -1 sets it to d_model

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed attention module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from the left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm' # batch_norm or layer_norm

    ### regularization
    dropout: 0.1 # the dropout used in most of the Conformer modules
    dropout_emb: 0.0 # the dropout used for embeddings
    dropout_att: 0.1 # the dropout for multi-headed attention modules

  decoder_out: 128

  optim_backbone:
    _target_: torch.optim.Adam
    lr: 5e-5
    sched:
      min_lr: 1e-6
      warmup_steps: 2000

  optim_downstream:
    _target_: torch.optim.Adam
    lr: 1e-4
    sched:
      min_lr: 1e-6
      warmup_steps: 1000

trainer:
  devices: -1 # number of GPUs; -1 uses all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: 500000 # computed at runtime if not set
  val_check_interval: 1.0 # set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  precision: 32 # should be set to 16 for O1 and O2 to enable AMP
  log_every_n_steps: 10 # interval of logging
  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; setting it to 0 disables the check
  check_val_every_n_epoch: 1 # run validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: false # provided by exp_manager
  logger: false # provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input, as it slows down training

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, the first one is used
    monitor: "val_loss"
    mode: "min"
    save_top_k: 5
  resume_from_checkpoint: null # path to a checkpoint file to continue training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  # you need to set these two to true to continue the training
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
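
# Example launch (a sketch, not the exact command): this config is consumed by a Hydra-based
# NeMo training script. The script path below is an assumption; adjust it and the config name
# to match your NeMo checkout. The ??? fields above must be provided as overrides.
#
#   python examples/tts/ssl_tts.py --config-path=conf --config-name=<this_config> \
#     model.train_ds.manifest_speaker_verification_fp=/path/to/train_sv_manifest.json \
#     model.train_ds.manifest_content_fp=/path/to/train_content_manifest.json \
#     model.train_ds.sup_data_path=/path/to/sup_data \
#     model.validation_ds.manifest_speaker_verification_fp=/path/to/val_sv_manifest.json \
#     model.validation_ds.manifest_content_fp=/path/to/val_content_manifest.json \
#     model.validation_ds.sup_data_path=/path/to/sup_data \
#     trainer.devices=-1 trainer.precision=16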