# Configuration for the "ContextNet5x1" ASR model.
#
# Top-level sections:
#   * shared anchors (name, sample_rate, repeat, dropout, separable) that are
#     re-used through aliases further down,
#   * model: datasets, tokenizer, preprocessor, augmentation, encoder, decoder
#     and optimizer settings,
#   * trainer: trainer settings,
#   * exp_manager: experiment manager / logging settings.
#
# NOTE(review): values written as ??? are presumably OmegaConf mandatory-value
# markers that must be supplied at launch time - confirm against the consumer.

# Shared values; change them here and every alias below follows.
name: &name "ContextNet5x1"
sample_rate: &sample_rate 16000
repeat: &repeat 1
dropout: &dropout 0.0
separable: &separable true

model:
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shard_strategy: "scatter"
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    num_workers: 8
    pin_memory: true

  tokenizer:
    # path to directory which contains either tokenizer.model (bpe) or
    # vocab.txt (for wpe)
    dir: ???
    # Can be either bpe or wpe
    type: ???

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    # Anchored so the encoder's feat_in stays in sync with this value.
    features: &n_mels 64
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # Seven blocks: one 128-filter block, four 256-filter blocks that use the
    # shared repeat/dropout anchors, then two non-residual blocks with fixed
    # repeat and dropout.
    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [13]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [17]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: 1
        kernel: [21]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: *separable
        se: true
        se_context_size: -1

      # Anchored so the decoder's feat_in stays in sync with this value.
      - filters: &enc_feat_out 1024
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: *separable
        se: true
        se_context_size: -1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    # Alias of the last encoder block's filter count (1024).
    feat_in: *enc_feat_out
    # filled with vocabulary size from tokenizer at runtime
    num_classes: -1
    # filled with vocabulary from tokenizer at runtime
    vocabulary: []

  optim:
    name: adam
    # _target_: nemo.core.optim.optimizers.Adam
    lr: 0.1

    # optimizer arguments
    betas: [0.9, 0.999]
    weight_decay: 0.0001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # scheduler config override
      warmup_steps: null
      warmup_ratio: 0.05
      # Written with an explicit decimal point so YAML 1.1 parsers resolve it
      # as a float rather than a string.
      min_lr: 1.0e-6
      last_epoch: -1

trainer:
  # number of gpus
  devices: 1
  max_epochs: 5
  # computed at runtime if not set
  max_steps: -1
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  # Provided by exp_manager
  enable_checkpointing: false
  # Provided by exp_manager
  logger: false
  # Interval of logging.
  log_every_n_steps: 1
  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  val_check_interval: 1.0
  # needs to be false for models with variable-length speech input as it
  # slows down training
  benchmark: false

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null