# wav2vecCTC.yaml

 
# This config contains the default values for training a wav2vec model with CTC loss and BPE-based vocabulary.
# Default learning parameters in this config are set for effective batch size of 1k on 32 GPUs.
# To train it with smaller batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.

# Experiment name; the `name` anchor is reused by exp_manager.name.
name: &name Wav2Vec_CTC

model:
  # Sampling rate shared by the train/validation/test dataset sections
  # through the `sample_rate` anchor.
  sample_rate: &sample_rate 16000
  # Projection size of the embedding dimension for the transformer; reused
  # through the `emb_dim` anchor by the preprocessor, encoder and decoder.
  embedding_dim: &emb_dim 768

  train_ds:
    # Mandatory values (`???`): must be supplied by the user.
    manifest_filepath: ???
    batch_size: ???
    sample_rate: *sample_rate
    # Tarred-dataset options (switched off by default).
    is_tarred: false
    tarred_audio_filepaths: null
    # Sample handling.
    shuffle: true
    trim_silence: false
    max_duration: null
    use_start_end_token: false
    # Data-loading options.
    pin_memory: true
    num_workers: 8

  validation_ds:
    # `???` is the OmegaConf marker for a mandatory value: both the manifest
    # path and the batch size must be provided by the user, as in train_ds.
    manifest_filepath: ???
    sample_rate: *sample_rate
    batch_size: ???
    # Validation data is read in a fixed order.
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  test_ds:
    # No manifest or batch size is configured by default (both are null).
    manifest_filepath: null
    batch_size: null
    sample_rate: *sample_rate
    shuffle: false
    use_start_end_token: false
    # Data-loading options.
    pin_memory: true
    num_workers: 8
  
  tokenizer:
    # Mandatory: path to the directory which contains either tokenizer.model
    # (for bpe) or vocab.txt (for wpe).
    dir: ???
    # Tokenizer type; can be either bpe or wpe.
    type: bpe

  preprocessor:
    _target_: nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder
    # Mode for the feature extractor: group_norm or layer_norm.
    extractor_mode: layer_norm
    # Whether to include a bias in the convolutional feature extractor model.
    conv_bias: false
    # Multiplier applied to the gradients of the extracted features.
    feature_grad_mult: 1.0
    normalize_audio: true
    # Final dimension of the output.
    embedding_dim: *emb_dim
    # Convolution stack, one entry per layer, in order. The strides multiply
    # to 5 * 2^6 = 320 (presumably the overall downsampling factor of the
    # feature encoder; confirm against ConvFeatureEncoder).
    conv_layers:
      - {emb_dim: 512, kernel_size: 10, stride: 5}
      - {emb_dim: 512, kernel_size: 3, stride: 2}
      - {emb_dim: 512, kernel_size: 3, stride: 2}
      - {emb_dim: 512, kernel_size: 3, stride: 2}
      - {emb_dim: 512, kernel_size: 3, stride: 2}
      - {emb_dim: 512, kernel_size: 2, stride: 2}
      - {emb_dim: 512, kernel_size: 2, stride: 2}

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    # Frequency masking.
    freq_masks: 4
    freq_width: 27
    # Time masking.
    # NOTE(review): time_width is a float while freq_width is an integer;
    # presumably it is read as a fraction of the sequence length. Confirm
    # against the SpectrogramAugmentation documentation.
    time_masks: 10
    time_width: 0.05
    # Presumably the value written into masked regions; TODO confirm.
    mask_value: 0.0

  encoder:
    _target_: nemo.collections.asr.modules.wav2vec_modules.Wav2VecTransformerEncoder
    layer_drop: 0.05
    # Config for the convolutional model that generates the positional
    # embeddings required for the attention layer.
    pos_embed:
      embedding_dim: *emb_dim
      # Number of filters for the convolutional positional embeddings.
      conv_pos: 128
      # Number of groups for the convolutional positional embeddings.
      conv_pos_groups: 16
    # Config for nemo.collections.nlp.modules.common.transformer.TransformerEncoder.
    transformer:
      # Number of encoder layers in the transformer model.
      num_layers: 12
      # Encoder embedding dim.
      hidden_size: *emb_dim
      # Encoder embedding dim for the feed-forward network.
      inner_size: 3072
      # Number of encoder attention heads.
      num_attention_heads: 8
      # Probability of dropout applied to the attention scores.
      attn_score_dropout: 0.1
      # Probability of dropout applied to the output of the attention layers,
      # but before layer normalization.
      attn_layer_dropout: 0.1
      # Probability of dropout applied to the FFN output.
      ffn_dropout: 0.1
      # Activation for the transformer.
      hidden_act: gelu
    
  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: *emb_dim
    # The two entries below are placeholders: they are filled at runtime with
    # the vocabulary size and the vocabulary taken from the tokenizer.
    num_classes: -1
    vocabulary: []

  optim:
    name: adamw
    # NOTE(review): this is the base value handed to the NoamAnnealing
    # schedule below; presumably a scale factor rather than an absolute
    # learning rate. Confirm against the scheduler implementation.
    lr: 2
    # Optimizer arguments.
    # eps is written with an explicit mantissa dot: YAML 1.1 loaders such as
    # PyYAML resolve a bare `1e-06` as a string rather than a float.
    eps: 1.0e-06
    betas: [0.9, 0.98]
    weight_decay: 0.0

    # Scheduler setup.
    sched:
      name: NoamAnnealing
      # Interpolated from the transformer encoder's hidden size.
      d_model: ${model.encoder.transformer.hidden_size}
      min_lr: 0.001
      # Scheduler params: warmup is given in steps; warmup_ratio is left null.
      warmup_steps: 1500
      warmup_ratio: null

trainer:
  # Hardware layout.
  accelerator: gpu
  devices: 1  # number of GPUs
  num_nodes: 1
  strategy: ddp
  sync_batchnorm: true
  precision: 32  # 16, 32, or bf16

  # Training length.
  max_epochs: 100
  max_steps: -1  # computed at runtime if not set

  # Gradient handling.
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0

  # Validation and logging cadence.
  # Number of validation steps run as a sanity check of the validation
  # process before training starts; setting it to 0 disables the check.
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1  # run validation every n epochs
  log_every_n_steps: 100  # logging interval

  # Checkpointing and logging are provided by exp_manager.
  enable_checkpointing: false
  logger: false

exp_manager:
  name: *name
  exp_dir: null

  # Loggers.
  create_tensorboard_logger: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null

  # Checkpointing: monitors val_wer with mode "min" and save_top_k of 5.
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val_wer
    mode: min
    save_top_k: 5
    every_n_epochs: 1
    always_save_nemo: true

  # Resuming.
  # Path to a checkpoint file to continue the training from; restores the
  # whole state including the epoch, step, LR schedulers, apex, etc.
  resume_from_checkpoint: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

hydra:
  # The root logger's handlers are set to null in Hydra's job logging config.
  job_logging:
    root:
      handlers: null
  # Hydra's run directory is the current directory.
  run:
    dir: .