#!/bin/bash

# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# This is an augmented version of the aishell-1 "run.sh", adapted to be compatible with noisy student training (NST).
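#
# A typical NST iteration chains the stages below. As a rough sketch (the
# experiment dir and data list names here are placeholders, not fixed names):
#   bash run_nst.sh --stage 1 --stop-stage 8 --dir exp/conformer_nst1 \
#     --data_list data_list_nst0 \
#     --supervised_data_list data_list_supervised \
#     --unsupervised_data_list data_list_unsupervised \
#     --out_data_list data_list_nst1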

. ./path.sh || exit 1;

# Use this to control how many GPUs you use. It's single-GPU training if you
# specify just one GPU, otherwise it's multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO
stage=1 # there is no stage 0 here; data preparation is assumed to be finished (see the note before stage 1)
stop_stage=8

# here are extra parameters used in NST
cer_out_dir=""
dir=""
supervised_data_list=""
checkpoint=
unsupervised_data_list=""
data_list=""

out_data_list=""

# parameters with default values:
label=0
average_num=30
nj=16
num_split=1
cer_hypo_threshold=10
speak_rate_threshold=0
label_file="label.txt"
utter_time_file="utter_time.json"
enable_nst=1
job_num=0
dir_split="wenet_split_60_test/"
hypo_name="hypothesis_nst${job_num}.txt"
wav_dir="data/train/wenet_1k_untar/"
tar_dir="data/train/wenet_1khr_tar/"
untar_dir="data/train/wenet_1khr_untar/"
cer_hypo_dir="wenet_cer_hypo"
cer_label_dir="wenet_cer_label"
pseudo_data_ratio=0.75

# The number of machines (nodes) for multi-machine training; 1 is for one
# machine. NFS is required if num_nodes > 1.

num_nodes=1

# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
# You should set node_rank=0 on the first machine, node_rank=1 on the
# second machine, and so on.
node_rank=0
dict=data/dict/lang_char.txt

# data_type can be `raw` or `shard`. Typically, `raw` is used for small
# datasets, while `shard` is used for large datasets (over 1k hours); `shard`
# is faster for reading data and training.
data_type=shard
num_utts_per_shard=1000
train_set=train
train_config=conf/train_conformer.yaml
cmvn=true
average_checkpoint=true
target_pt=80
decode_checkpoint=$dir/$target_pt.pt

# here we only use attention_rescoring for NST
decode_modes="attention_rescoring"

. tools/parse_options.sh || exit 1;

# print the settings
echo "setting for this run:"
echo "dir is ${dir}"
echo "data list is ${data_list}"
echo "job_num is ${job_num}"
echo "cer_out_dir is  ${cer_out_dir}"
echo "average_num is ${average_num}"
echo "checkpoint is ${checkpoint} "
echo "enable_nst is ${enable_nst} "

# We assume that you have finished the data pre-processing steps from -1 to 3 in aishell1/s0/run.sh.
# You can modify "--supervised_data_list" to match your supervised data list.
# Here we used wenetspeech as the unsupervised data; you can run the data pre-processing steps from -1 to 3 in
# wenetspeech/s0/run.sh and modify "--unsupervised_data_list" to match your unsupervised data list.
# You can follow this process to generate your own dataset.
# The code for extracting data is also included in local/...

# stage 1 is for training
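# As a rough example (the experiment dir and data list name are placeholders):
#   bash run_nst.sh --stage 1 --stop-stage 1 --dir exp/conformer_nst1 \
#     --data_list data_list_nst0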
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "********step 1 start time : $now ********"
  mkdir -p $dir
  # Remove any stale `INIT_FILE` left over from a previous run; otherwise
  # resuming or restarting a multi-machine training can fail.
  INIT_FILE=$dir/ddp_init
  rm -f $INIT_FILE
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo"
  dist_backend="gloo"
  world_size=`expr $num_gpus \* $num_nodes`
  echo "total gpus is: $world_size"

  # The global_cmvn file needs to be computed over the combined
  # supervised + unsupervised datasets, and it should be placed at
  # data/${train_set}/global_cmvn .
  cmvn_opts=
  $cmvn && cp data/${train_set}/global_cmvn $dir/global_cmvn
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"

  # train.py rewrites $train_config to $dir/train.yaml with model input
  # and output dimensions, and $dir/train.yaml will be used for inference
  # and export.
  echo "checkpoint is ${checkpoint}"
  for ((i = 0; i < $num_gpus; ++i)); do
  {
    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i+1)))
    echo "gpu number $i"
    # Rank of each gpu/process, used for knowing whether it is
    # the master or a worker.

    rank=`expr $node_rank \* $num_gpus + $i`
    python wenet/bin/train.py --gpu $gpu_id \
      --config $train_config \
      --data_type $data_type \
      --symbol_table $dict \
      --train_data data/$train_set/$data_list \
      --cv_data data/dev/data.list \
      ${checkpoint:+--checkpoint $checkpoint} \
      --model_dir $dir \
      --ddp.init_method $init_method \
      --ddp.world_size $world_size \
      --ddp.rank $rank \
      --ddp.dist_backend $dist_backend \
      --num_workers 1 \
      $cmvn_opts \
      --pin_memory
  } &
  done
  wait
fi

# In stage 2, we average the final checkpoints and compute the test and dev accuracy.
# Please make sure your test and dev data.list files are in the proper location.
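# As a rough example (the experiment dir is a placeholder):
#   bash run_nst.sh --stage 2 --stop-stage 2 --dir exp/conformer_nst1 \
#     --average_num 30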
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # Test the model; please specify the model you want to test with --checkpoint.
  # Here we test with the aishell-1 dataset.
  now=$(date +"%T")
  echo "******** step 2 start time : $now ********"
  if [ ${average_checkpoint} == true ]; then
    decode_checkpoint=$dir/avg_${average_num}.pt
    echo "do model average and final checkpoint is $decode_checkpoint"
    python wenet/bin/average_model.py \
      --dst_model $decode_checkpoint \
      --src_path $dir  \
      --num ${average_num} \
      --val_best
  fi

  # export model
  python wenet/bin/export_jit.py \
    --config $dir/train.yaml \
    --checkpoint $dir/avg_${average_num}.pt \
    --output_file $dir/final.zip \
    --output_quant_file $dir/final_quant.zip
  # Please specify decoding_chunk_size for unified streaming and
  # non-streaming model. The default value is -1, which is full chunk
  # for non-streaming inference.
  decoding_chunk_size=
  ctc_weight=0.5
  reverse_weight=0.0

  # test_wer
  for mode in ${decode_modes}; do
  {
    #test_dir=$dir/test_${mode}_${target_pt}pt  # for target pt
    test_dir=$dir/test_${mode}${average_num}pt   # for average pt
    mkdir -p $test_dir
    python wenet/bin/recognize.py --gpu 0 \
      --mode $mode \
      --config $dir/train.yaml \
      --data_type $data_type \
      --test_data data/test/data.list \
      --checkpoint $decode_checkpoint \
      --beam_size 10 \
      --batch_size 1 \
      --penalty 0.0 \
      --dict $dict \
      --ctc_weight $ctc_weight \
      --reverse_weight $reverse_weight \
      --result_file $test_dir/text \
      ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
    echo "before compute-wer"
    python tools/compute-wer.py --char=1 --v=1 \
      data/test/text $test_dir/text > $test_dir/wer
  } &
  done

  # dev_wer
  for mode in ${decode_modes}; do
  {
    #test_dir=$dir/test_${mode}_${target_pt}pt  # for target pt
    dev_dir=$dir/dev_${mode}${average_num}pt   # for average pt
    mkdir -p $dev_dir
    python wenet/bin/recognize.py --gpu 0 \
      --mode $mode \
      --config $dir/train.yaml \
      --data_type $data_type \
      --test_data data/dev/data.list \
      --checkpoint $decode_checkpoint \
      --beam_size 10 \
      --batch_size 1 \
      --penalty 0.0 \
      --dict $dict \
      --ctc_weight $ctc_weight \
      --reverse_weight $reverse_weight \
      --result_file $dev_dir/text \
      ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
    echo "before compute-wer"
    python tools/compute-wer.py --char=1 --v=1 \
      data/dev/text $dev_dir/text > $dev_dir/wer
  } &
  done
  wait
fi


# Split the (unsupervised) data list into N sublists, where N depends on the
# number of CPUs available in your cluster. When running inference, the N
# sublists are processed in parallel.
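# As a rough example (the data list and split dir names are placeholders;
# note that this stage only runs when enable_nst is 0):
#   bash run_nst.sh --stage 3 --stop-stage 3 --enable_nst 0 --num_split 60 \
#     --unsupervised_data_list data_list_unsupervised \
#     --dir_split wenet_split_60_test/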
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && [ ${enable_nst} -eq 0 ]; then
  echo "********step 3 start time : $now ********"
  python local/split_data_list.py \
    --job_nums $num_split \
    --data_list_path data/train/$unsupervised_data_list \
    --output_dir data/train/$dir_split

fi


# Stage 4 performs inference without a language model on the given sublist (job_num).
# Here is an example usage:
# bash run_nst.sh --stage 4 --stop-stage 4 --job_num $i --dir_split data/train/wenet_4khr_split_60/
# --hypo_name hypothesis_0.txt --dir exp/conformer_aishell2_wenet4k_nst4
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the name of the output hypothesis file, and "dir" which is the path where we train and store the model.
# For each gpu, you can run with a different job_num to perform data-parallel computing.
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  echo "********step 4 start time : $now ********"
  # we assume you have run stage 2 so that avg_${average_num}.pt exists
  decode_checkpoint=$dir/avg_${average_num}.pt
  # Please specify decoding_chunk_size for unified streaming and
  # non-streaming model. The default value is -1, which is full chunk
  # for non-streaming inference.
  decoding_chunk_size=
  ctc_weight=0.5
  reverse_weight=0.0
  mode="attention_rescoring"
  gpu_id=0
  echo "job number  ${job_num} "
  echo "data_list dir is  ${dir_split}"
  echo "hypo name is " $hypo_name
  echo "dir is ${dir}"

  python wenet/bin/recognize.py --gpu $gpu_id \
    --mode $mode \
    --config $dir/train.yaml \
    --data_type $data_type \
    --test_data data/train/${dir_split}data_sublist${job_num}/data_list \
    --checkpoint $decode_checkpoint \
    --beam_size 10 \
    --batch_size 1 \
    --penalty 0.0 \
    --dict $dict \
    --ctc_weight $ctc_weight \
    --reverse_weight $reverse_weight \
    --result_file data/train/${dir_split}data_sublist${job_num}/${hypo_name} \
    ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
    echo "end time : $now"

fi


# Generate the wav.scp file and, optionally, the label.txt file for each sublist generated in stage 3.
# The wav_dir should have been prepared in the data-processing step mentioned above.
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the name of the output hypothesis file, and "dir" which is the path where we train and store the model.
# wav_dir is the directory that stores the raw wav files and possible labels.
# If you have labels for the unsupervised dataset, set label=1; otherwise keep it 0.
# For each gpu or cpu, you can run with a different job_num to perform data-parallel computing.
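# As a rough example (the dir names are placeholders; this stage only runs
# when enable_nst is 0):
#   bash run_nst.sh --stage 5 --stop-stage 5 --enable_nst 0 --job_num 0 \
#     --dir_split wenet_split_60_test/ --hypo_name hypothesis_nst0.txt \
#     --wav_dir data/train/wenet_1k_untar/ --label 0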
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ] && [ ${enable_nst} -eq 0 ]; then
  echo "********step 5 start time : $now ********"
  python local/get_wav_labels.py \
    --dir_split data/train/${dir_split} \
    --hypo_name /$hypo_name \
    --wav_dir $wav_dir \
    --job_num $job_num \
    --label $label
fi

# Calculate cer-hypo between the hypotheses decoded with and without a language model.
# We assume that you have finished language model training using the wenet aishell-1
# pipeline. (You should have data/lang_test/words.txt and data/lang_test/TLG.fst ready.)
# Here is an example usage:
# bash run_nst.sh --stage 6 --stop-stage 6 --job_num n --dir_split data/train/wenet1k_redo_split_60/
# --cer_hypo_dir wenet1k_cer_hypo --hypo_name hypothesis_nst.txt --dir exp/conformer_no_filter_redo_nst6
# You need to specify "job_num" n (n <= N), "dir_split" which is the dir path for the split data,
# "hypo_name" which is the name of the output hypothesis file, and "dir" which is the path where we train and store the model.
# For each gpu, you can run with a different job_num to perform data-parallel computing.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  echo "********step 6 start time : $now ********"
  chunk_size=-1
  mode="attention_rescoring"
  test_dir=$dir/test_${mode}_${job_num}
  now=$(date +"%T")
  echo "start time : $now"
  echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split}
  echo "nj is" $nj "hypo_file is" $hypo_name "cer out is" $cer_hypo_dir "lm is 4gram"
  echo "dir is " $dir
  if [ ! -f data/train/${dir_split}data_sublist${job_num}/${hypo_name} ]; then
    echo "text file does not exist"
    exit 1;
  fi

  ./tools/decode.sh --nj 16 \
    --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
    --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
    --chunk_size $chunk_size \
    --fst_path data/lang_test/TLG.fst \
    data/train/${dir_split}data_sublist${job_num}/wav.scp \
    data/train/${dir_split}data_sublist${job_num}/${hypo_name} $dir/final.zip \
    data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_hypo_dir}_${job_num}
  now=$(date +"%T")
  echo "end time : $now"
fi

# (Optional: only run this stage if you have true labels for the unsupervised data.)
# Calculate cer-label between the true labels and the hypotheses decoded with a language model.
# You can use the output cer to evaluate NST's performance.
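# As a rough example (the experiment dir is a placeholder):
#   bash run_nst.sh --stage 7 --stop-stage 7 --job_num 0 --label 1 \
#     --dir_split wenet_split_60_test/ --label_file label.txt \
#     --dir exp/conformer_nst1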
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ] && [ ${label} -eq 1 ]; then
  echo "********step 7 start time : $now ********"
  chunk_size=-1
  mode="attention_rescoring"
  test_dir=$dir/test_${mode}_${job_num}
  now=$(date +"%T")
  echo "start time : $now"
  echo "GPU dir is " $job_num "dir_split is " data/train/${dir_split}
  echo "nj is" $nj "label_file is" $label_file "cer out is" $cer_label_dir "lm is 4gram"
  echo "dir is " $dir
  echo "label_file " data/train/${dir_split}data_sublist${job_num}/${label_file}
  if [ ! -f data/train/${dir_split}data_sublist${job_num}/${label_file} ]; then
    echo "text file does not exist"
    exit 1;
  fi

  ./tools/decode.sh --nj 16 \
    --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
    --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
    --chunk_size $chunk_size \
    --fst_path data/lang_test/TLG.fst \
    data/train/${dir_split}data_sublist${job_num}/wav.scp \
    data/train/${dir_split}data_sublist${job_num}/${label_file} $dir/final.zip \
    data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_label_dir}_${job_num}
  now=$(date +"%T")
  echo "end time : $now"
fi


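# Stage 8 filters the pseudo-labeled data by cer_hypo_threshold and
# speak_rate_threshold, re-packs the kept utterances into shards under tar_dir,
# and mixes pseudo and supervised data according to pseudo_data_ratio to
# produce the data list for the next NST iteration.
# As a rough example (the experiment dir and data list names are placeholders):
#   bash run_nst.sh --stage 8 --stop-stage 8 --dir exp/conformer_nst1 \
#     --supervised_data_list data_list_supervised \
#     --out_data_list data_list_nst1 --pseudo_data_ratio 0.75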
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
  echo "********step 8 start time : $now ********"
  python local/generate_filtered_pseudo_label.py  \
    --cer_hypo_dir $cer_hypo_dir \
    --untar_dir data/train/$untar_dir \
    --wav_dir $wav_dir \
    --dir_num $job_num \
    --cer_hypo_threshold $cer_hypo_threshold \
    --speak_rate_threshold $speak_rate_threshold \
    --dir $dir \
    --tar_dir data/train/$tar_dir \
    --utter_time_file $utter_time_file

  python local/generate_data_list.py  \
    --tar_dir data/train/$tar_dir \
    --out_data_list data/train/$out_data_list \
    --supervised_data_list data/train/$supervised_data_list \
    --pseudo_data_ratio $pseudo_data_ratio

fi