#!/bin/bash

# Copyright 2019 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is an augmented version of the aishell-1 "run.sh", adapted to make the
# code compatible with noisy student training (NST).

. ./path.sh || exit 1;

# Use this to control how many GPUs you use. It is single-GPU training if you
# specify just one GPU; otherwise it is multi-GPU training based on DDP in PyTorch.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
# communication. More details can be found in
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
# export NCCL_SOCKET_IFNAME=ens4f1
export NCCL_DEBUG=INFO

stage=1 # start from stage 1 (training); data preparation is assumed done (see below)
stop_stage=8

# Extra parameters used in NST:
cer_out_dir=""
dir=""
supervised_data_list=""
checkpoint=
unsupervised_data_list=""
data_list=""
hypo_name=""
out_data_list=""

# Parameters with default values:
label=0
average_num=30
nj=16
num_split=1
cer_hypo_threshold=10
speak_rate_threshold=0
label_file="label.txt"
utter_time_file="utter_time.json"
enable_nst=1
job_num=0
dir_split="wenet_split_60_test/"
hypo_name="hypothesis_nst${job_num}.txt"
wav_dir="data/train/wenet_1k_untar/"
tar_dir="data/train/wenet_1khr_tar/"
untar_dir="data/train/wenet_1khr_untar/"
cer_hypo_dir="wenet_cer_hypo"
cer_label_dir="wenet_cer_label"
pseudo_data_ratio=0.75

# The number of machines (nodes) for multi-machine training; 1 means one machine.
# NFS is required if num_nodes > 1.
num_nodes=1

# The rank of each node or machine, which ranges from 0 to `num_nodes - 1`.
# You should set node_rank=0 on the first machine, node_rank=1 on the second
# machine, and so on.
node_rank=0

dict=data/dict/lang_char.txt

# data_type can be `raw` or `shard`. Typically, `raw` is used for small
# datasets and `shard` is used for large datasets (over 1k hours); `shard` is
# faster for reading data and training.
data_type=shard
num_utts_per_shard=1000
train_set=train
train_config=conf/train_conformer.yaml
cmvn=true
average_checkpoint=true
target_pt=80
decode_checkpoint=$dir/$target_pt.pt

# Here we only use attention_rescoring for NST.
decode_modes="attention_rescoring"

. tools/parse_options.sh || exit 1;

# Timestamp used by the step banners below.
now=$(date +"%T")

# Print the settings for this run.
echo "setting for this run:"
echo "dir is ${dir}"
echo "data list is ${data_list}"
echo "job_num is ${job_num}"
echo "cer_out_dir is ${cer_out_dir}"
echo "average_num is ${average_num}"
echo "checkpoint is ${checkpoint}"
echo "enable_nst is ${enable_nst}"

# We assume that you have finished the data pre-processing steps from -1 to 3
# in aishell1/s0/run.sh. You can modify "--train_data_supervised" to match
# your supervised data list. Here we use WenetSpeech as the unsupervised data:
# run the data pre-processing steps from -1 to 3 in wenetspeech/s0/run.sh and
# modify "--train_data_unsupervised" to match your unsupervised data list.
# You can follow this process to generate your own dataset.
# My code for extracting the data is also included in local/...
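# Illustrative first-iteration invocation (all paths below are placeholders;
# adjust them to your own setup). Data lists are given as bare names since the
# stages below prefix them with data/train/ :
# bash run_nst.sh --dir exp/conformer_nst1 \
#   --supervised_data_list data_list_supervised \
#   --unsupervised_data_list data_list_unsupervised \
#   --data_list data_list_nst0 \
#   --out_data_list data_list_nst1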
# Stage 1 is for training.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "********step 1 start time : $now ********"
  mkdir -p $dir
  # You have to rm `INIT_FILE` manually when you resume or restart a
  # multi-machine training.
  INIT_FILE=$dir/ddp_init
  rm -f $INIT_FILE
  init_method=file://$(readlink -f $INIT_FILE)
  echo "$0: init method is $init_method"
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  # Use "nccl" if it works, otherwise use "gloo".
  dist_backend="gloo"
  world_size=`expr $num_gpus \* $num_nodes`
  echo "total gpus is: $world_size"
  # The global_cmvn file needs to be computed over the combined
  # supervised + unsupervised datasets, and it should be placed at
  # data/${train_set}/global_cmvn.
  cmvn_opts=
  $cmvn && cp data/${train_set}/global_cmvn $dir/global_cmvn
  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
  # train.py rewrites $train_config to $dir/train.yaml with the model input
  # and output dimensions, and $dir/train.yaml will be used for inference
  # and export.
  echo "checkpoint is ${checkpoint}"
  for ((i = 0; i < $num_gpus; ++i)); do
  {
    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
    echo "gpu number $i"
    # Rank of each gpu/process, used for knowing whether it is
    # the master or a worker.
    rank=`expr $node_rank \* $num_gpus + $i`
    python wenet/bin/train.py --gpu $gpu_id \
      --config $train_config \
      --data_type $data_type \
      --symbol_table $dict \
      --train_data data/$train_set/$data_list \
      --cv_data data/dev/data.list \
      ${checkpoint:+--checkpoint $checkpoint} \
      --model_dir $dir \
      --ddp.init_method $init_method \
      --ddp.world_size $world_size \
      --ddp.rank $rank \
      --ddp.dist_backend $dist_backend \
      --num_workers 1 \
      $cmvn_opts \
      --pin_memory
  } &
  done
  wait
fi

# In stage 2, we average the last checkpoints into the final checkpoint and
# calculate the test and dev accuracy. Please make sure your test and dev
# data.list files are in the proper location.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # Test the model; please specify the model you want to test via --checkpoint.
  # Here we test with the aishell dataset.
  echo "******** step 2 start time : $now ********"
  if [ ${average_checkpoint} == true ]; then
    decode_checkpoint=$dir/avg_${average_num}.pt
    echo "do model average and final checkpoint is $decode_checkpoint"
    python wenet/bin/average_model.py \
      --dst_model $decode_checkpoint \
      --src_path $dir \
      --num ${average_num} \
      --val_best
  fi
  # Export the averaged model; $dir/final.zip is used later by tools/decode.sh
  # in stages 6 and 7.
  python wenet/bin/export_jit.py \
    --config $dir/train.yaml \
    --checkpoint $dir/avg_${average_num}.pt \
    --output_file $dir/final.zip \
    --output_quant_file $dir/final_quant.zip
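  # Illustrative usage (placeholder experiment dir): to rerun only this
  # averaging + export + evaluation stage:
  # bash run_nst.sh --stage 2 --stop-stage 2 --dir exp/conformer_nst1 --average_num 30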
  # Please specify decoding_chunk_size for a unified streaming and
  # non-streaming model. The default value is -1, which is full chunk
  # for non-streaming inference.
  decoding_chunk_size=
  ctc_weight=0.5
  reverse_weight=0.0

  # test WER
  for mode in ${decode_modes}; do
  {
    # test_dir=$dir/test_${mode}_${target_pt}pt  # for target pt
    test_dir=$dir/test_${mode}${average_num}pt   # for averaged pt
    mkdir -p $test_dir
    python wenet/bin/recognize.py --gpu 0 \
      --mode $mode \
      --config $dir/train.yaml \
      --data_type $data_type \
      --test_data data/test/data.list \
      --checkpoint $decode_checkpoint \
      --beam_size 10 \
      --batch_size 1 \
      --penalty 0.0 \
      --dict $dict \
      --ctc_weight $ctc_weight \
      --reverse_weight $reverse_weight \
      --result_file $test_dir/text \
      ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
    echo "before compute-wer"
    python tools/compute-wer.py --char=1 --v=1 \
      data/test/text $test_dir/text > $test_dir/wer
  } &
  done

  # dev WER
  for mode in ${decode_modes}; do
  {
    # dev_dir=$dir/dev_${mode}_${target_pt}pt  # for target pt
    dev_dir=$dir/dev_${mode}${average_num}pt   # for averaged pt
    mkdir -p $dev_dir
    python wenet/bin/recognize.py --gpu 0 \
      --mode $mode \
      --config $dir/train.yaml \
      --data_type $data_type \
      --test_data data/dev/data.list \
      --checkpoint $decode_checkpoint \
      --beam_size 10 \
      --batch_size 1 \
      --penalty 0.0 \
      --dict $dict \
      --ctc_weight $ctc_weight \
      --reverse_weight $reverse_weight \
      --result_file $dev_dir/text \
      ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
    echo "before compute-wer"
    python tools/compute-wer.py --char=1 --v=1 \
      data/dev/text $dev_dir/text > $dev_dir/wer
  } &
  done
  wait
fi

# Stage 3 splits the (unsupervised) data list into N sublists, where N depends
# on the number of CPUs available in your cluster. During inference, the N
# sublists are decoded in parallel.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && [ ${enable_nst} -eq 0 ]; then
  echo "********step 3 start time : $now ********"
  python local/split_data_list.py \
    --job_nums $num_split \
    --data_list_path data/train/$unsupervised_data_list \
    --output_dir data/train/$dir_split
fi

# Stage 4 performs inference without a language model on the given sublist (job_num).
# Example usage:
# bash run_nst.sh --stage 4 --stop-stage 4 --job_num $i --dir_split wenet_4khr_split_60/ \
#   --hypo_name hypothesis_0.txt --dir exp/conformer_aishell2_wenet4k_nst4
# You need to specify "job_num" n (n <= N), "dir_split" (the directory holding
# the split data), "hypo_name" (the file name for the output hypotheses) and
# "dir" (the directory where we train and store the model).
# For each GPU, you can run with a different job_num to decode the sublists in
# parallel; see the dispatch sketch below.
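# A minimal dispatch sketch (illustrative; assumes num_split sublists were
# created in stage 3, and that you adapt GPU assignment to your cluster, since
# this stage pins recognize.py to gpu_id=0):
# for i in $(seq 0 $((num_split - 1))); do
#   bash run_nst.sh --stage 4 --stop-stage 4 --job_num $i \
#     --dir_split wenet_split_60_test/ --hypo_name hypothesis_nst${i}.txt \
#     --dir exp/conformer_nst1 &
# done
# wait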
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  echo "********step 4 start time : $now ********"
  # We assume you have run stage 2 so that avg_${average_num}.pt exists.
  decode_checkpoint=$dir/avg_${average_num}.pt
  # Please specify decoding_chunk_size for a unified streaming and
  # non-streaming model. The default value is -1, which is full chunk
  # for non-streaming inference.
  decoding_chunk_size=
  ctc_weight=0.5
  reverse_weight=0.0
  mode="attention_rescoring"
  gpu_id=0
  echo "job number ${job_num}"
  echo "data_list dir is ${dir_split}"
  echo "hypo name is ${hypo_name}"
  echo "dir is ${dir}"
  python wenet/bin/recognize.py --gpu $gpu_id \
    --mode $mode \
    --config $dir/train.yaml \
    --data_type $data_type \
    --test_data data/train/${dir_split}data_sublist${job_num}/data_list \
    --checkpoint $decode_checkpoint \
    --beam_size 10 \
    --batch_size 1 \
    --penalty 0.0 \
    --dict $dict \
    --ctc_weight $ctc_weight \
    --reverse_weight $reverse_weight \
    --result_file data/train/${dir_split}data_sublist${job_num}/${hypo_name} \
    ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
  echo "end time : $now"
fi

# Stage 5 generates the wav.scp file and the label.txt file (optional) for each
# sublist we generated in stage 3. The wav_dir should have been prepared in the
# data processing step mentioned above.
# You need to specify "job_num" n (n <= N), "dir_split" (the directory holding
# the split data), "hypo_name" (the file name of the hypotheses) and "dir" (the
# directory where we train and store the model). wav_dir is the directory that
# stores the raw wav files and, possibly, the labels.
# If you have labels for the unsupervised dataset, set label=1; otherwise keep it 0.
# For each GPU or CPU, you can run with a different job_num to process the
# sublists in parallel.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ] && [ ${enable_nst} -eq 0 ]; then
  echo "********step 5 start time : $now ********"
  python local/get_wav_labels.py \
    --dir_split data/train/${dir_split} \
    --hypo_name /$hypo_name \
    --wav_dir $wav_dir \
    --job_num $job_num \
    --label $label
fi

# Stage 6 calculates the CER-hypo between the hypotheses decoded with and
# without the language model. We assume that you have finished language model
# training using the wenet aishell-1 pipeline (you should have
# data/lang_test/words.txt and data/lang_test/TLG.fst ready).
# Example usage:
# bash run_nst.sh --stage 6 --stop-stage 6 --job_num n --dir_split wenet1k_redo_split_60/ \
#   --cer_hypo_dir wenet1k_cer_hypo --hypo_name hypothesis_nst.txt --dir exp/conformer_no_filter_redo_nst6
# You need to specify "job_num" n (n <= N), "dir_split" (the directory holding
# the split data), "hypo_name" (the file name of the hypotheses) and "dir" (the
# directory where we train and store the model).
# For each GPU, you can run with a different job_num to process the sublists in
# parallel.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
  echo "********step 6 start time : $now ********"
  chunk_size=-1
  mode="attention_rescoring"
  test_dir=$dir/test_${mode}_${job_num}
  now=$(date +"%T")
  echo "start time : $now"
  echo "job num is" $job_num "dir_split is" data/train/${dir_split}
  echo "nj is" $nj "hypo_file is" $hypo_name "cer out is" $cer_hypo_dir "lm is 4gram"
  echo "dir is" $dir
  if [ ! -f data/train/${dir_split}data_sublist${job_num}/${hypo_name} ]; then
    echo "text file does not exist"
    exit 1;
  fi

  ./tools/decode.sh --nj $nj \
    --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
    --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
    --chunk_size $chunk_size \
    --fst_path data/lang_test/TLG.fst \
    data/train/${dir_split}data_sublist${job_num}/wav.scp \
    data/train/${dir_split}data_sublist${job_num}/${hypo_name} $dir/final.zip \
    data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_hypo_dir}_${job_num}
  now=$(date +"%T")
  echo "end time : $now"
fi

# (Optional: only run this stage if you have true labels for the unsupervised data.)
# Stage 7 calculates the CER-label between the true labels and the hypotheses
# decoded with the language model. You can use the output CER to evaluate
# NST's performance.
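# Illustrative usage for this optional stage (placeholder paths):
# bash run_nst.sh --stage 7 --stop-stage 7 --job_num n --dir_split wenet1k_redo_split_60/ \
#   --cer_label_dir wenet_cer_label --label_file label.txt --label 1 \
#   --dir exp/conformer_no_filter_redo_nst6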
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ] && [ ${label} -eq 1 ]; then
  echo "********step 7 start time : $now ********"
  chunk_size=-1
  mode="attention_rescoring"
  test_dir=$dir/test_${mode}_${job_num}
  now=$(date +"%T")
  echo "start time : $now"
  echo "job num is" $job_num "dir_split is" data/train/${dir_split}
  echo "nj is" $nj "label_file is" $label_file "cer out is" $cer_label_dir "lm is 4gram"
  echo "dir is" $dir
  echo "label_file " data/train/${dir_split}data_sublist${job_num}/${label_file}
  if [ ! -f data/train/${dir_split}data_sublist${job_num}/${label_file} ]; then
    echo "text file does not exist"
    exit 1;
  fi

  ./tools/decode.sh --nj $nj \
    --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
    --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
    --chunk_size $chunk_size \
    --fst_path data/lang_test/TLG.fst \
    data/train/${dir_split}data_sublist${job_num}/wav.scp \
    data/train/${dir_split}data_sublist${job_num}/${label_file} $dir/final.zip \
    data/lang_test/words.txt $dir/Hypo_LM_diff10/${cer_label_dir}_${job_num}
  now=$(date +"%T")
  echo "end time : $now"
fi

# Stage 8 filters the hypotheses using cer_hypo_threshold and
# speak_rate_threshold to generate pseudo-labels, then mixes the pseudo-labeled
# data with the supervised data (controlled by pseudo_data_ratio) into the data
# list for the next NST iteration.
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
  echo "********step 8 start time : $now ********"
  python local/generate_filtered_pseudo_label.py \
    --cer_hypo_dir $cer_hypo_dir \
    --untar_dir $untar_dir \
    --wav_dir $wav_dir \
    --dir_num $job_num \
    --cer_hypo_threshold $cer_hypo_threshold \
    --speak_rate_threshold $speak_rate_threshold \
    --dir $dir \
    --tar_dir $tar_dir \
    --utter_time_file $utter_time_file

  python local/generate_data_list.py \
    --tar_dir $tar_dir \
    --out_data_list data/train/$out_data_list \
    --supervised_data_list data/train/$supervised_data_list \
    --pseudo_data_ratio $pseudo_data_ratio
fi
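# Illustrative end-to-end NST loop (all names are placeholders): the data list
# produced by stage 8 (--out_data_list) becomes the --data_list of the next
# iteration.
# for it in 1 2 3; do
#   bash run_nst.sh --stage 1 --stop-stage 8 \
#     --dir exp/conformer_nst${it} \
#     --data_list data_list_nst$((it - 1)) \
#     --supervised_data_list data_list_supervised \
#     --unsupervised_data_list data_list_unsupervised \
#     --out_data_list data_list_nst${it}
# done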