# wav2vecCTC.yaml
# This config contains the default values for training a wav2vec model with CTC loss and a BPE-based vocabulary.
# The default learning parameters in this config are set for an effective batch size of 1k on 32 GPUs.
# To train with smaller batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.

# Top-level name of this config; the &name anchor is reused by exp_manager.name further down.
name: &name Wav2Vec_CTC

model:
  # Audio sample rate; the &sample_rate anchor is reused by the train/validation/test dataset sections.
  sample_rate: &sample_rate 16000
  # Shared embedding width; the &emb_dim anchor is reused by the preprocessor, encoder and decoder sections.
  embedding_dim: &emb_dim 768 # Project size of embedding dimension for transformer

  # Training data settings.
  train_ds:
    # Entries set to ??? are mandatory and have to be supplied by the user.
    manifest_filepath: ???
    batch_size: ???
    sample_rate: *sample_rate

    # Sample handling.
    shuffle: true
    trim_silence: false
    max_duration: null
    use_start_end_token: false

    # Tarred input is switched off by default.
    is_tarred: false
    tarred_audio_filepaths: null

    # Loader settings.
    num_workers: 8
    pin_memory: true

  # Validation data settings.
  validation_ds:
    # Entries set to ??? are mandatory and have to be supplied by the user.
    manifest_filepath: ???
    sample_rate: *sample_rate
    # Must be exactly three question marks: `??` would load as the literal
    # string "??" instead of being reported as a missing mandatory value.
    batch_size: ???
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # Test data settings; no manifest or batch size is configured by default.
  test_ds:
    manifest_filepath: null
    batch_size: null
    sample_rate: *sample_rate

    # Sample handling.
    shuffle: false
    use_start_end_token: false

    # Loader settings.
    num_workers: 8
    pin_memory: true
  
  tokenizer:
    # Mandatory: path to the directory which contains either tokenizer.model
    # (for bpe) or vocab.txt (for wpe).
    dir: ???
    # Tokenizer type; can be either bpe or wpe.
    type: bpe

  # Convolutional feature encoder (class given by _target_).
  preprocessor:
    _target_: nemo.collections.asr.modules.wav2vec_modules.ConvFeatureEncoder
    # Mode for the feature extractor; one of [group_norm, layer_norm].
    extractor_mode: layer_norm
    # Whether to include a bias in the convolution feature extractor model.
    conv_bias: false
    # Multiplier for the extracted feature gradients.
    feature_grad_mult: 1.0
    normalize_audio: true
    # Final dimensions of the output.
    embedding_dim: *emb_dim
    # Seven layers with emb_dim 512 each; the strides multiply to 5 * 2^6 = 320.
    conv_layers:
      # Layer 1: kernel 10, stride 5.
      - emb_dim: 512
        kernel_size: 10
        stride: 5
      # Layers 2-5: kernel 3, stride 2.
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      - emb_dim: 512
        kernel_size: 3
        stride: 2
      # Layers 6-7: kernel 2, stride 2.
      - emb_dim: 512
        kernel_size: 2
        stride: 2
      - emb_dim: 512
        kernel_size: 2
        stride: 2

  # Spectrogram augmentation (class given by _target_).
  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    # Frequency masking.
    freq_masks: 4
    freq_width: 27
    # Time masking.
    time_masks: 10
    time_width: 0.05
    mask_value: 0.0

  # Transformer encoder (class given by _target_).
  encoder:
    _target_: nemo.collections.asr.modules.wav2vec_modules.Wav2VecTransformerEncoder
    layer_drop: 0.05
    # Config for the convolutional model that generates the positional
    # embeddings required for the attention layer.
    pos_embed:
      embedding_dim: *emb_dim
      # Number of filters for convolutional positional embeddings.
      conv_pos: 128
      # Number of groups for convolutional positional embeddings.
      conv_pos_groups: 16
    # Config for nemo.collections.nlp.modules.common.transformer.TransformerEncoder.
    transformer:
      # Number of encoder layers in the transformer model.
      num_layers: 12
      # Encoder embedding dim.
      hidden_size: *emb_dim
      # Encoder embedding dim for feed forward.
      inner_size: 3072
      # Number of encoder attention heads.
      num_attention_heads: 8
      # Probability of dropout applied to attention scores.
      attn_score_dropout: 0.1
      # Probability of dropout applied to the output of the attention layers,
      # but before layer normalization.
      attn_layer_dropout: 0.1
      # Probability of dropout applied to FFN output.
      ffn_dropout: 0.1
      # Activation for the transformer.
      hidden_act: gelu
    
  # Decoder (class given by _target_); its input width follows the shared embedding dim.
  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: *emb_dim
    # Both entries below are placeholders that are filled from the tokenizer at runtime:
    # num_classes with the vocabulary size, vocabulary with the vocabulary itself.
    num_classes: -1
    vocabulary: []

  # Optimizer and learning-rate schedule.
  optim:
    name: adamw
    lr: 2
    # Written with an explicit decimal point: `1e-06` has none, and YAML 1.1
    # loaders using the stock float resolver (e.g. PyYAML) read it as a string.
    eps: 1.0e-06
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 0.0

    # scheduler setup
    sched:
      name: NoamAnnealing
      # Interpolated from the encoder's transformer hidden size.
      d_model: ${model.encoder.transformer.hidden_size}
      min_lr: 0.001
      # Scheduler params
      warmup_steps: 1500
      warmup_ratio: null

trainer:
  # Hardware layout.
  accelerator: gpu
  devices: 1 # number of gpus
  num_nodes: 1
  strategy: ddp
  precision: 32 # 16, 32, or bf16
  sync_batchnorm: true

  # Training length.
  max_epochs: 100
  max_steps: -1 # computed at runtime if not set

  # Gradient handling.
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0

  # Logging and validation cadence.
  log_every_n_steps: 100 # interval of logging
  # Number of validation steps run as a sanity check of the validation
  # process before training starts; setting it to 0 disables the check.
  num_sanity_val_steps: 0
  # Number of evaluations on validation every n epochs.
  check_val_every_n_epoch: 1

  # Checkpointing and logging are provided by exp_manager.
  enable_checkpointing: false
  logger: false

exp_manager:
  exp_dir: null
  # Reuses the top-level name via the &name anchor.
  name: *name

  # Loggers.
  create_tensorboard_logger: true
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null

  # Checkpointing.
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: 'val_wer'
    mode: 'min'
    save_top_k: 5
    every_n_epochs: 1
    always_save_nemo: true

  # Resuming.
  # Path to a checkpoint file to continue the training from; restores the
  # whole state including the epoch, step, LR schedulers, apex, etc.
  resume_from_checkpoint: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

# Hydra runtime settings.
hydra:
  run:
    # Run directory is set to the current directory.
    dir: .
  job_logging:
    root:
      # No handlers on the root job logger — presumably so that Hydra's own
      # logging setup is not installed; confirm against the Hydra docs.
      handlers: null