#!/usr/bin/env bash

# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

# Recipe driver: defines the default options below, lets the command line
# override them through utils/parse_options.sh, then runs the stages whose
# number lies in [stage, stop_stage].

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# general configuration
backend=pytorch
stage=-1       # start from -1 if you need to start from data download
stop_stage=100
ngpu=1         # number of gpus ("0" uses cpu, otherwise use gpu)
debugmode=1
dumpdir=dump   # directory to dump full features
N=0            # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
verbose=0      # verbose option
resume=        # Resume the training from snapshot
seed=1

# feature configuration
do_delta=false

train_config=conf/train.yaml
lm_config=conf/lm.yaml
decode_config=conf/decode.yaml

# rnnlm related
use_wordlm=true     # false means to train/use a character LM
lm_vocabsize=20000  # effective only for word LMs
lm_resume=          # specify a snapshot file to resume LM training
lmtag=              # tag for managing LMs
use_lm=true

# decoding parameter
recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best'

# You may set 'mic' to:
#  ihm [individual headset mic- the default which gives best results]
#  sdm1 [single distant microphone- the current script allows you only to select
#        the 1st of 8 microphones]
#  mdm8 [multiple distant microphones-- currently we only support averaging over
#       the 8 source microphones].
# ... by calling this script as, for example,
# ./run.sh --mic sdm1
# ./run.sh --mic mdm8
mic=ihm

# exp tag
tag="" # tag for managing experiments.

. utils/parse_options.sh || exit 1;

# Set bash to strict mode, it will exit on :
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
# (Command tracing, -x, is not enabled here.)
set -e
set -u
set -o pipefail

base_mic=${mic//[0-9]/} # strip the digits: sdm, ihm or mdm
nmics=${mic//[a-z]/}    # strip the letters: e.g. 8 for mdm8.
# Path where AMI gets downloaded (or where locally available):
AMI_DIR=$PWD/wav_db # Default,
case $(hostname -d) in
    clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU,
esac

train_set=${mic}_train
train_dev=${mic}_dev
train_test=${mic}_eval
recog_set="${mic}_dev ${mic}_eval"

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
    # A failing 'touch' is used as the "directory is not writable by us" probe.
    if [ -d ${AMI_DIR} ] && ! touch ${AMI_DIR}/.foo 2>/dev/null; then
        echo "$0: directory $AMI_DIR seems to exist and not be owned by you."
        echo " ... Assuming the data does not need to be downloaded. Please use --stage 0 or more."
        exit 1
    fi
    if [ -e data/local/downloads/wget_${mic}.sh ]; then
        echo "data/local/downloads/wget_$mic.sh already exists, better quit than re-download... (use --stage N)"
        exit 1
    fi
    local/ami_download.sh ${mic} ${AMI_DIR}
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to make the following data preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 0: Data preparation"

    # common data prep
    if [ ! -d data/local/downloads/annotations ]; then
        local/ami_text_prep.sh data/local/downloads
    fi

    # beamforming (only for the multiple-distant-microphone setup)
    if [ "$base_mic" == "mdm" ]; then
        PROCESSED_AMI_DIR=${PWD}/beamformed
        # ${BEAMFORMIT:-} keeps 'set -u' from aborting when the variable is unset.
        if [ -z "${BEAMFORMIT:-}" ]; then
            export BEAMFORMIT=${KALDI_ROOT}/tools/BeamformIt
        fi
        export PATH=${PATH}:${BEAMFORMIT}
        if ! hash BeamformIt; then
            echo "Missing BeamformIt, run 'cd ../../../tools/kaldi/tools; extras/install_beamformit.sh; cd -;'"
            exit 1
        fi
        local/ami_beamform.sh --cmd "$train_cmd" --nj 20 ${nmics} ${AMI_DIR} ${PROCESSED_AMI_DIR}
    else
        PROCESSED_AMI_DIR=${AMI_DIR}
    fi
    local/ami_${base_mic}_data_prep.sh ${PROCESSED_AMI_DIR} ${mic}

    # data augmentation
    utils/perturb_data_dir_speed.sh 0.9 data/${mic}/train_orig data/${mic}_tmp1
    utils/perturb_data_dir_speed.sh 1.0 data/${mic}/train_orig data/${mic}_tmp2
    utils/perturb_data_dir_speed.sh 1.1 data/${mic}/train_orig data/${mic}_tmp3
    # train_orig is removed and then re-created as the combination of the three copies
    rm -r data/${mic}/train_orig
    utils/combine_data.sh --extra-files utt2uniq data/${mic}/train_orig \
        data/${mic}_tmp1 data/${mic}_tmp2 data/${mic}_tmp3

    local/ami_${base_mic}_scoring_data_prep.sh ${PROCESSED_AMI_DIR} ${mic} dev
    local/ami_${base_mic}_scoring_data_prep.sh ${PROCESSED_AMI_DIR} ${mic} eval
    for dset in train dev eval; do
        # changed the original AMI data structure in the Kaldi recipe to the following
        utils/data/modify_speaker_info.sh --seconds-per-spk-max 30 \
            data/${mic}/${dset}_orig data/${mic}_${dset}
    done
fi

feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design training and dev sets by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 1: Feature Generation"
    fbankdir=fbank
    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
    for x in ${mic}_train ${mic}_dev ${mic}_eval; do
        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
            data/${x} exp/make_fbank/${x} ${fbankdir}
        utils/fix_data_dir.sh data/${x}
    done

    # compute global CMVN
    compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark

    # dump features for training
    # (the /export/b{14,...} path is left unquoted so brace expansion yields one directory per disk)
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_tr_dir}/storage ]; then
        utils/create_split_dir.pl \
            /export/b{14,15,16,17}/${USER}/espnet-data/egs/ami/asr1/dump/${train_set}/delta${do_delta}/storage \
            ${feat_tr_dir}/storage
    fi
    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${feat_dt_dir}/storage ]; then
        utils/create_split_dir.pl \
            /export/b{14,15,16,17}/${USER}/espnet-data/egs/ami/asr1/dump/${train_dev}/delta${do_delta}/storage \
            ${feat_dt_dir}/storage
    fi
    # every set is normalised with the CMVN statistics of the training set
    dump.sh --cmd "$train_cmd" --nj 32 --do_delta ${do_delta} \
        data/${train_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/train ${feat_tr_dir}
    dump.sh --cmd "$train_cmd" --nj 10 --do_delta ${do_delta} \
        data/${train_dev}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/dev ${feat_dt_dir}
    for rtask in ${recog_set}; do
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}; mkdir -p ${feat_recog_dir}
        dump.sh --cmd "$train_cmd" --nj 10 --do_delta ${do_delta} \
            data/${rtask}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/recog/${rtask} \
            ${feat_recog_dir}
    done
fi

dict=data/lang_1char/${train_set}_units.txt
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p data/lang_1char/

    echo "make a dictionary"
    # NOTE(review): the token was missing here (the line read " 1"); restored as
    # <unk>, presumably stripped from the file -- confirm against the upstream recipe.
    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
    # remaining units are numbered from 2 (NR+1) in sorted order
    text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
        | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
    wc -l ${dict}

    # make json labels
    data2json.sh --feat ${feat_tr_dir}/feats.scp \
        data/${train_set} ${dict} > ${feat_tr_dir}/data.json
    data2json.sh --feat ${feat_dt_dir}/feats.scp \
        data/${train_dev} ${dict} > ${feat_dt_dir}/data.json
    for rtask in ${recog_set}; do
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}
        data2json.sh --feat ${feat_recog_dir}/feats.scp \
            data/${rtask} ${dict} > ${feat_recog_dir}/data.json
    done
fi

# It takes a few days.
# If you just want to end-to-end ASR without LM,
# you can skip this and remove --rnnlm option in the recognition (stage 5)
if [ -z "${lmtag}" ]; then
    # default LM tag: config file name without extension (+ vocabulary size for word LMs)
    lmtag=$(basename ${lm_config%.*})
    if [ ${use_wordlm} = true ]; then
        lmtag=${lmtag}_word${lm_vocabsize}
    fi
fi
lmexpname=train_rnnlm_${backend}_${lmtag}
lmexpdir=exp/${lmexpname}
mkdir -p ${lmexpdir}

# stop_stage is checked here as in every other stage, so that
# "--stop_stage N" with N < 3 does not start LM training.
if [[ ${stage} -le 3 && ${stop_stage} -ge 3 && ${use_lm} == true ]]; then
    echo "stage 3: LM Preparation"

    if [ ${use_wordlm} = true ]; then
        lmdatadir=data/local/wordlm_train
        lmdict=${lmdatadir}/wordlist_${lm_vocabsize}.txt
        mkdir -p ${lmdatadir}
        # drop the first (utterance-id) field of every transcript line
        cut -f 2- -d" " data/${train_set}/text > ${lmdatadir}/train.txt
        cut -f 2- -d" " data/${train_dev}/text > ${lmdatadir}/valid.txt
        cut -f 2- -d" " data/${train_test}/text > ${lmdatadir}/test.txt
        text2vocabulary.py -s ${lm_vocabsize} -o ${lmdict} ${lmdatadir}/train.txt
    else
        lmdatadir=data/local/lm_train
        lmdict=${dict}
        mkdir -p ${lmdatadir}
        text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " \
            > ${lmdatadir}/train.txt
        text2token.py -s 1 -n 1 data/${train_dev}/text | cut -f 2- -d" " \
            > ${lmdatadir}/valid.txt
        text2token.py -s 1 -n 1 data/${train_test}/text | cut -f 2- -d" " \
            > ${lmdatadir}/test.txt
    fi

    # ${lm_resume} is left unquoted on purpose: when empty it expands to no argument at all.
    ${cuda_cmd} --gpu ${ngpu} ${lmexpdir}/train.log \
        lm_train.py \
        --config ${lm_config} \
        --ngpu ${ngpu} \
        --backend ${backend} \
        --verbose 1 \
        --outdir ${lmexpdir} \
        --tensorboard-dir tensorboard/${lmexpname} \
        --train-label ${lmdatadir}/train.txt \
        --valid-label ${lmdatadir}/valid.txt \
        --test-label ${lmdatadir}/test.txt \
        --resume ${lm_resume} \
        --dict ${lmdict}
fi

# Experiment name: an explicit --tag wins; otherwise it is derived from the
# training config file name (plus "_delta" when delta features are enabled).
if [ -z "${tag}" ]; then
    expname=${train_set}_${backend}_$(basename ${train_config%.*})
    if ${do_delta}; then
        expname=${expname}_delta
    fi
else
    expname=${train_set}_${backend}_${tag}
fi
expdir=exp/${expname}
mkdir -p ${expdir}

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: Network Training"

    # ${resume} is left unquoted on purpose: when empty it expands to no argument at all.
    ${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \
        asr_train.py \
        --config ${train_config} \
        --ngpu ${ngpu} \
        --backend ${backend} \
        --outdir ${expdir}/results \
        --tensorboard-dir tensorboard/${expname} \
        --debugmode ${debugmode} \
        --dict ${dict} \
        --debugdir ${expdir} \
        --minibatches ${N} \
        --verbose ${verbose} \
        --resume ${resume} \
        --seed ${seed} \
        --train-json ${feat_tr_dir}/data.json \
        --valid-json ${feat_dt_dir}/data.json
fi

if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "stage 5: Decoding"
    nj=32

    pids=() # initialize pids
    # one background subshell per recognition set; variables set inside stay local to it
    for rtask in ${recog_set}; do
    (
        decode_dir=decode_${rtask}_$(basename ${decode_config%.*})_${lmtag}
        if [ ${use_lm} = true ]; then
            if [ ${use_wordlm} = true ]; then
                recog_opts="--word-rnnlm ${lmexpdir}/rnnlm.model.best"
            else
                recog_opts="--rnnlm ${lmexpdir}/rnnlm.model.best"
            fi
        else
            echo "No language model is involved."
            recog_opts=""
        fi
        feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta}

        # split data
        splitjson.py --parts ${nj} ${feat_recog_dir}/data.json

        #### use CPU for decoding
        ngpu=0

        # ${recog_opts} must stay unquoted: it holds "option value" and has to split into two words
        ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
            asr_recog.py \
            --config ${decode_config} \
            --ngpu ${ngpu} \
            --backend ${backend} \
            --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \
            --result-label ${expdir}/${decode_dir}/data.JOB.json \
            --model ${expdir}/results/${recog_model} \
            ${recog_opts}

        score_sclite.sh --wer true ${expdir}/${decode_dir} ${dict}

    ) &
    pids+=($!) # store background pids
    done
    # count the subshells that exited non-zero
    i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
    # the trailing 'false' aborts the script under 'set -e' when any job failed
    [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
    echo "Finished"
fi