#!/usr/bin/env bash

# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# general configuration
backend=pytorch
stage=-1
stop_stage=100
ngpu=1        # number of gpus ("0" uses cpu, otherwise use gpu)
nj=32         # number of parallel jobs
dumpdir=dump  # directory to dump full features
verbose=1     # verbose option (if set > 0, get more log)
N=0           # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
seed=1        # random seed number
resume=""     # the snapshot path to resume (if set empty, no effect)

# feature extraction related
fs=16000      # sampling frequency
fmax=""       # maximum frequency
fmin=""       # minimum frequency
n_mels=80     # number of mel basis
n_fft=1024    # number of fft points
n_shift=256   # number of shift points
win_length="" # window length

# config files
train_config=conf/train_pytorch_tacotron2.yaml
decode_config=conf/decode.yaml

# decoding related
model=model.loss.best
n_average=0          # if > 0, the model averaged over n_average checkpoints will be used instead of model.loss.best
griffin_lim_iters=64 # the number of iterations of Griffin-Lim

# data
datadir=./downloads
an4_root=${datadir}/an4
data_url=http://www.speech.cs.cmu.edu/databases/an4/
data_url2=https://huggingface.co/datasets/espnet/an4/resolve/main

# exp tag
tag="" # tag for managing experiments.

. utils/parse_options.sh || exit 1;

# Set bash to 'debug' mode: it will exit on
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
set -e
set -u
set -o pipefail

train_set=train_nodev
train_dev=train_dev
eval_set="test"

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
    mkdir -p ${datadir}
    if ! local/download_and_untar.sh ${datadir} ${data_url}; then
        echo "Failed to download from the original site, trying a backup site."
        local/download_and_untar.sh ${datadir} ${data_url2}
    fi
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to do the following data preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases.
    echo "stage 0: Data preparation"
    mkdir -p data/{train,test} exp
    if [ ! -f ${an4_root}/README ]; then
        echo "Cannot find an4 root! Exiting..."
        exit 1
    fi
    python3 local/data_prep.py ${an4_root} sph2pipe
    for x in test train; do
        for f in text wav.scp utt2spk; do
            sort data/${x}/${f} -o data/${x}/${f}
        done
        utils/utt2spk_to_spk2utt.pl data/${x}/utt2spk > data/${x}/spk2utt
    done
fi

feat_tr_dir=${dumpdir}/${train_set}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${train_dev}; mkdir -p ${feat_dt_dir}
feat_ev_dir=${dumpdir}/${eval_set}; mkdir -p ${feat_ev_dir}
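# Stage 0 leaves standard Kaldi-style data directories (wav.scp, text, utt2spk,
# spk2utt) under data/train and data/test. As an optional sanity check that is
# not part of the original recipe, they can be validated before feature
# extraction with Kaldi's validate_data_dir.sh, e.g.:
#   for x in train test; do utils/validate_data_dir.sh --no-feats data/${x}; done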
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design training and dev sets by yourself.
    ### But you can utilize Kaldi recipes in most cases.
    echo "stage 1: Feature Generation"

    # Generate the fbank features; by default 80-dimensional fbanks on each frame
    fbankdir=fbank
    for x in train test; do
        make_fbank.sh --cmd "${train_cmd}" --nj ${nj} \
            --fs ${fs} \
            --fmax "${fmax}" \
            --fmin "${fmin}" \
            --n_fft ${n_fft} \
            --n_shift ${n_shift} \
            --win_length "${win_length}" \
            --n_mels ${n_mels} \
            data/${x} \
            exp/make_fbank/${x} \
            ${fbankdir}
    done

    # make a dev set from the first 100 utterances and keep the remainder for training
    utils/subset_data_dir.sh --first data/train 100 data/${train_dev}
    n=$(( $(wc -l < data/train/wav.scp) - 100 ))
    utils/subset_data_dir.sh --last data/train ${n} data/${train_set}

    # compute statistics for global mean-variance normalization
    compute-cmvn-stats scp:data/${train_set}/feats.scp data/${train_set}/cmvn.ark

    # dump features for training
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta false \
        data/${train_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/train ${feat_tr_dir}
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta false \
        data/${train_dev}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/dev ${feat_dt_dir}
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta false \
        data/${eval_set}/feats.scp data/${train_set}/cmvn.ark exp/dump_feats/eval ${feat_ev_dir}
fi

dict=data/lang_1char/${train_set}_units.txt
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p data/lang_1char/
    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
    text2token.py -s 1 -n 1 data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
        | sort | uniq | grep -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
    wc -l ${dict}

    # make json labels
    data2json.sh --feat ${feat_tr_dir}/feats.scp \
        data/${train_set} ${dict} > ${feat_tr_dir}/data.json
    data2json.sh --feat ${feat_dt_dir}/feats.scp \
        data/${train_dev} ${dict} > ${feat_dt_dir}/data.json
    data2json.sh --feat ${feat_ev_dir}/feats.scp \
        data/${eval_set} ${dict} > ${feat_ev_dir}/data.json
fi

if [ -z "${tag}" ]; then
    expname=${train_set}_${backend}_$(basename ${train_config%.*})
else
    expname=${train_set}_${backend}_${tag}
fi
expdir=exp/${expname}
mkdir -p ${expdir}
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Text-to-speech model training"
    tr_json=${feat_tr_dir}/data.json
    dt_json=${feat_dt_dir}/data.json
    ${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \
        tts_train.py \
            --backend ${backend} \
            --ngpu ${ngpu} \
            --minibatches ${N} \
            --outdir ${expdir}/results \
            --tensorboard-dir tensorboard/${expname} \
            --verbose ${verbose} \
            --seed ${seed} \
            --resume ${resume} \
            --train-json ${tr_json} \
            --valid-json ${dt_json} \
            --config ${train_config}
fi
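# Optional note, not part of the original recipe: training progress can be
# followed in the log written by ${cuda_cmd} above, or with TensorBoard via the
# --tensorboard-dir passed to tts_train.py, e.g. (paths shown with the script's
# variables for illustration):
#   tail -f ${expdir}/train.log
#   tensorboard --logdir tensorboard/${expname}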
if [ ${n_average} -gt 0 ]; then
    model=model.last${n_average}.avg.best
fi
outdir=${expdir}/outputs_${model}_$(basename ${decode_config%.*})
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: Decoding"
    if [ ${n_average} -gt 0 ]; then
        average_checkpoints.py --backend ${backend} \
            --snapshots ${expdir}/results/snapshot.ep.* \
            --out ${expdir}/results/${model} \
            --num ${n_average}
    fi
    pids=() # initialize pids
    for sets in ${train_dev} ${eval_set}; do
    (
        [ ! -e ${outdir}/${sets} ] && mkdir -p ${outdir}/${sets}
        cp ${dumpdir}/${sets}/data.json ${outdir}/${sets}
        splitjson.py --parts ${nj} ${outdir}/${sets}/data.json
        # decode in parallel
        ${train_cmd} JOB=1:${nj} ${outdir}/${sets}/log/decode.JOB.log \
            tts_decode.py \
                --backend ${backend} \
                --ngpu 0 \
                --verbose ${verbose} \
                --out ${outdir}/${sets}/feats.JOB \
                --json ${outdir}/${sets}/split${nj}utt/data.JOB.json \
                --model ${expdir}/results/${model} \
                --config ${decode_config}
        # concatenate scp files
        for n in $(seq ${nj}); do
            cat "${outdir}/${sets}/feats.$n.scp" || exit 1;
        done > ${outdir}/${sets}/feats.scp
    ) &
    pids+=($!) # store background pids
    done
    i=0; for pid in "${pids[@]}"; do wait ${pid} || ((i++)); done
    [ ${i} -gt 0 ] && echo "$0: ${i} background jobs failed." && false
fi

if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "stage 5: Synthesis"
    pids=() # initialize pids
    for sets in ${train_dev} ${eval_set}; do
    (
        [ ! -e ${outdir}_denorm/${sets} ] && mkdir -p ${outdir}_denorm/${sets}
        apply-cmvn --norm-vars=true --reverse=true data/${train_set}/cmvn.ark \
            scp:${outdir}/${sets}/feats.scp \
            ark,scp:${outdir}_denorm/${sets}/feats.ark,${outdir}_denorm/${sets}/feats.scp
        convert_fbank.sh --nj ${nj} --cmd "${train_cmd}" \
            --fs ${fs} \
            --fmax "${fmax}" \
            --fmin "${fmin}" \
            --n_fft ${n_fft} \
            --n_shift ${n_shift} \
            --win_length "${win_length}" \
            --n_mels ${n_mels} \
            --iters ${griffin_lim_iters} \
            ${outdir}_denorm/${sets} \
            ${outdir}_denorm/${sets}/log \
            ${outdir}_denorm/${sets}/wav
    ) &
    pids+=($!) # store background pids
    done
    i=0; for pid in "${pids[@]}"; do wait ${pid} || ((i++)); done
    [ ${i} -gt 0 ] && echo "$0: ${i} background jobs failed." && false
    echo "Finished."
fi
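# Usage sketch (hypothetical invocations, assuming this script is saved as
# run.sh; any variable defined at the top can be overridden on the command line
# thanks to utils/parse_options.sh):
#   ./run.sh                                        # run all stages: download -> ... -> synthesis
#   ./run.sh --stage 3 --stop_stage 3 --tag myexp   # (re)train only, under exp/train_nodev_pytorch_myexp
#   ./run.sh --stage 4 --stop_stage 5               # decode and synthesize with an existing trained model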