# Training configuration for the MarbleNet-3x2x64 two-class
# (background vs. speech) audio classification model.
#
# Conventions used in this file:
#   * `${model.xxx}` is an OmegaConf interpolation that resolves to the
#     value of `xxx` under the `model` section.
#   * `???` is OmegaConf's marker for a mandatory value that must be
#     supplied by the user before the config can be used.
#   * `&anchor` / `*alias` pairs keep dependent dimensions in sync.
#
# NOTE(review): the nesting below was reconstructed from the
# `${model.*}` interpolations and the usual NeMo config layout --
# confirm it against the consuming training script.
name: &name "MarbleNet-3x2x64"

model:
  sample_rate: 16000
  # Shared by the three residual 64-filter encoder blocks below.
  repeat: 2
  dropout: 0.0
  kernel_size_factor: 1.0

  labels: ['background', 'speech']

  train_ds:
    # Mandatory: must be provided by the user.
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    labels: ${model.labels}
    batch_size: 128
    shuffle: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    num_workers: 8
    pin_memory: true
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
    bucketing_weights: null
    # Augmentations configured for the training set only.
    augmentor:
      shift:
        prob: 1.0
        min_shift_ms: -5.0
        max_shift_ms: 5.0
      white_noise:
        prob: 1.0
        min_level: -90
        max_level: -46

  validation_ds:
    # Mandatory: must be provided by the user.
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    labels: ${model.labels}
    batch_size: 128
    shuffle: false
    num_workers: 8
    pin_memory: true
    val_loss_idx: 0

  test_ds:
    # Optional: left as null when no test manifest is given.
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    labels: ${model.labels}
    batch_size: 128
    shuffle: false
    num_workers: 8
    pin_memory: true
    test_loss_idx: 0

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMFCCPreprocessor
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    # Anchored so n_mfcc and the encoder's feat_in reuse the same value.
    n_mels: &n_mels 64
    n_mfcc: *n_mels
    n_fft: 512

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2
    time_masks: 2
    freq_width: 15
    time_width: 25
    rect_masks: 5
    rect_time: 25
    rect_freq: 15

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    # Must match the feature dimension produced by the preprocessor.
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # Encoder blocks, listed in order: one non-residual 128-filter
    # block, three residual 64-filter blocks, then two non-residual
    # 128-filter blocks.
    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: false
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 64
        repeat: ${model.repeat}
        kernel: [13]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: true
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 64
        repeat: ${model.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: true
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 64
        repeat: ${model.repeat}
        kernel: [17]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: true
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 128
        repeat: 1
        kernel: [29]
        stride: [1]
        dilation: [2]
        dropout: ${model.dropout}
        residual: false
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      # Final block: anchored so the decoder's feat_in stays in sync.
      # It sets neither `separable` nor `kernel_size_factor`.
      - filters: &enc_final_filters 128
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: false

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoderClassification
    # Must match the filter count of the last encoder block.
    feat_in: *enc_final_filters
    return_logits: true
    pooling_type: 'avg'

  optim:
    name: sgd
    lr: 0.01
    # optimizer arguments
    weight_decay: 0.001
    momentum: 0.9

    # scheduler setup
    sched:
      name: PolynomialHoldDecayAnnealing
      # scheduler params
      power: 2.0
      warmup_ratio: 0.05
      hold_ratio: 0.45
      min_lr: 0.001
      last_epoch: -1

trainer:
  devices: 1  # number of GPUs
  max_epochs: 150
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # provided by exp_manager
  logger: false  # provided by exp_manager
  log_every_n_steps: 1  # interval of logging
  # Set to 0.25 to check 4 times per epoch, or to an int for a number
  # of iterations.
  val_check_interval: 1.0
  # Needs to be false for models with variable-length speech input,
  # as enabling it slows down training.
  benchmark: false

exp_manager:
  exp_dir: null
  # Reuses the top-level model name.
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null