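# Training configuration for MarbleNet-3x2x64, a small 1D time-channel separable
# convolutional network for voice activity detection (speech vs. background).
# The "3x2x64" in the name refers to the three residual encoder blocks, each with
# 2 repeated sub-blocks and 64 filters.
#
# Example launch (the script shown is NeMo's usual speech-classification entry point;
# its path may differ between NeMo versions):
#   python examples/asr/speech_classification/speech_to_label.py \
#     --config-path=<directory containing this file> --config-name=marblenet_3x2x64 \
#     model.train_ds.manifest_filepath=<train manifest> \
#     model.validation_ds.manifest_filepath=<validation manifest>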
name: &name "MarbleNet-3x2x64"

model:
  sample_rate: 16000
  repeat: 2
  dropout: 0.0
  kernel_size_factor: 1.0

  labels: ['background', 'speech']

  train_ds:
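    # Each manifest is a JSON-lines file; a typical NeMo speech-label entry looks like
    #   {"audio_filepath": "/data/sample_0001.wav", "duration": 0.63, "label": "speech"}
    # (illustrative path and duration; the ??? placeholders below must be supplied at launch time).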
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    labels: ${model.labels}
    batch_size: 128
    shuffle: True
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    num_workers: 8
    pin_memory: true
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
    bucketing_weights: null
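    # On-the-fly waveform augmentation: a random time shift of up to +/-5 ms and
    # additive white noise with a level drawn between -90 and -46 dB.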
    augmentor:
      shift:
        prob: 1.0
        min_shift_ms: -5.0
        max_shift_ms: 5.0
      white_noise:
        prob: 1.0
        min_level: -90
        max_level: -46

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    labels: ${model.labels}
    batch_size: 128
    shuffle: False
    num_workers: 8
    pin_memory: true
    val_loss_idx: 0

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    labels: ${model.labels}
    batch_size: 128
    shuffle: False
    num_workers: 8
    pin_memory: true
    test_loss_idx: 0

  preprocessor:
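    # 64 MFCC features computed from 64 mel bands over 25 ms windows with a 10 ms hop;
    # the &n_mels anchor keeps n_mfcc and the encoder's feat_in in sync.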
    _target_: nemo.collections.asr.modules.AudioToMFCCPreprocessor
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    n_mels: &n_mels 64
    n_mfcc: *n_mels
    n_fft: 512

  spec_augment:
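    # SpecAugment applied to the MFCC features: 2 frequency and 2 time masks per
    # spectrogram, plus 5 rectangular (cutout-style) masks.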
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2
    time_masks: 2
    freq_width: 15
    time_width: 25
    rect_masks: 5
    rect_time: 25
    rect_freq: 15

  encoder:
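    # QuartzNet/Jasper-style encoder. The body is three 64-filter residual blocks whose
    # sub-blocks are repeated ${model.repeat} (= 2) times each, framed by a 128-filter
    # prologue block and two 128-filter epilogue blocks; all but the final 1x1 block use
    # time-channel separable convolutions. This body gives the model its "3x2x64" name.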
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    jasper:
      - filters: 128
        repeat: 1
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: false
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 64
        repeat: ${model.repeat}
        kernel: [13]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: true
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 64
        repeat: ${model.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: true
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 64
        repeat: ${model.repeat}
        kernel: [17]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: true
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: 128
        repeat: 1
        kernel: [29]
        stride: [1]
        dilation: [2]
        dropout: ${model.dropout}
        residual: false
        separable: true
        kernel_size_factor: ${model.kernel_size_factor}

      - filters: &enc_final_filters 128
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: ${model.dropout}
        residual: false

  decoder:
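    # Classification head: average-pools the encoder output over time and projects it to
    # one logit per label (return_logits: true, so no softmax is applied here).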
    _target_: nemo.collections.asr.modules.ConvASRDecoderClassification
    feat_in: *enc_final_filters
    return_logits: true
    pooling_type: 'avg'

  optim:
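    # SGD with momentum. The scheduler warms the learning rate up over the first 5% of
    # training steps, holds the peak value for the next 45%, then decays it polynomially
    # (power 2.0) towards min_lr.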
    name: sgd
    lr: 0.01
    # optimizer arguments
    weight_decay: 0.001
    momentum: 0.9

    # scheduler setup
    sched:
      name: PolynomialHoldDecayAnnealing
      # Scheduler params
      power: 2.0
      warmup_ratio: 0.05
      hold_ratio: 0.45
      min_lr: 0.001
      last_epoch: -1

trainer:
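  # PyTorch Lightning trainer settings; any of these can be overridden on the command
  # line via Hydra, e.g. trainer.devices=4 trainer.max_epochs=200.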
  devices: 1 # number of GPUs
  max_epochs: 150
  max_steps: -1 # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: False  # Provided by exp_manager
  logger: False  # Provided by exp_manager
  log_every_n_steps: 1  # Interval of logging.
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  benchmark: false # keep false for variable-length speech input; cudnn benchmarking slows training when input lengths vary

exp_manager:
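  # exp_manager creates the experiment directory (under exp_dir/name), saves checkpoints,
  # and sets up TensorBoard and optional Weights & Biases logging.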
  exp_dir: null
  name: *name
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  create_wandb_logger: False
  wandb_logger_kwargs:
    name: null
    project: null