config.yaml

name: &name "QuartzNet15x5"
sample_rate: &sample_rate 16000
repeat: &repeat 1
dropout: &dropout 0.0
separable: &separable true
labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
         "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]

model:
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    batch_size: 32
    trim_silence: True
    max_duration: 16.7
    shuffle: True
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    batch_size: 32
    shuffle: False
    num_workers: 8
    pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    features: &n_mels 64
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [13]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [17]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: *repeat
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: *dropout
        residual: true
        separable: *separable
        se: true
        se_context_size: -1

      - filters: 256
        repeat: 1
        kernel: [21]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: *separable
        se: true
        se_context_size: -1

      - filters: &enc_feat_out 1024
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: *separable
        se: true
        se_context_size: -1

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: 1024
    num_classes: 28
    vocabulary: *labels

  optim:
    name: novograd
    # _target_: nemo.core.optim.optimizers.Novograd
    lr: .01
    # optimizer arguments
    betas: [0.8, 0.5]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

      # Scheduler params
      warmup_steps: null
      warmup_ratio: null
      min_lr: 0.0
      last_epoch: -1

trainer:
  devices: 1 # number of gpus
  max_epochs: 5
  max_steps: -1 # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: False  # Provided by exp_manager
  logger: False  # Provided by exp_manager
  log_every_n_steps: 1  # Interval of logging.
  val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: True
  create_checkpoint_callback: True