#!/usr/bin/env bash

# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

# TTS fine-tuning recipe: downloads the CMU Arctic data for one speaker plus a
# pretrained model, extracts fbank features (normalized with the pretrained
# model's CMVN stats), fine-tunes a Transformer/Tacotron2 TTS model, decodes
# mel features, and synthesizes waveforms with Griffin-Lim.

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

# general configuration
backend=pytorch
stage=-1
stop_stage=100
ngpu=1       # number of gpus ("0" uses cpu, otherwise use gpu)
nj=16        # number of parallel jobs
dumpdir=dump # directory to dump full features
verbose=1    # verbose option (if set > 0, get more log)
N=0          # number of minibatches to be used (mainly for debugging). "0" uses all minibatches.
seed=1       # random seed number
resume=""    # the snapshot path to resume (if set empty, no effect)

# feature extraction related
fs=16000      # sampling frequency
fmax=7600     # maximum frequency
fmin=80       # minimum frequency
n_mels=80     # number of mel basis
n_fft=1024    # number of fft points
n_shift=256   # number of shift points
win_length="" # window length

# config files
train_config=conf/train_pytorch_transformer.v1.single.finetune.yaml
# you can select from conf or conf/tuning.
# now we support tacotron2 and transformer for TTS.
# see more info in the header of each config.
decode_config=conf/decode.yaml

# decoding related
model=model.loss.best
n_average=1          # if > 0, the model averaged with n_average ckpts will be used instead of model.loss.best
griffin_lim_iters=64 # the number of iterations of Griffin-Lim

# pretrained model related
download_dir=downloads
pretrained_model="mailabs.en_US.judy.transformer.v1.single" # see model info in local/pretrained_model_download.sh

# dataset configuration
spk=slt # see local/data_prep.sh to check available speakers

# exp tag
tag="" # tag for managing experiments.

. utils/parse_options.sh || exit 1;

# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

org_set=${spk}
train_set=${spk}_train_no_dev
dev_set=${spk}_dev
eval_set=${spk}_eval

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data and Pretrained Model Download"
    local/data_download.sh ${download_dir} ${spk}
    local/pretrained_model_download.sh ${download_dir} ${pretrained_model}
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to make data the following preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 0: Data preparation"
    local/data_prep.sh ${download_dir}/cmu_us_${spk}_arctic ${spk} data/${org_set}
    utils/fix_data_dir.sh data/${org_set}
    utils/validate_data_dir.sh --no-feats data/${org_set}
fi

feat_tr_dir=${dumpdir}/${train_set}; mkdir -p ${feat_tr_dir}
feat_dt_dir=${dumpdir}/${dev_set}; mkdir -p ${feat_dt_dir}
feat_ev_dir=${dumpdir}/${eval_set}; mkdir -p ${feat_ev_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    ### Task dependent. You have to design training and dev name by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 1: Feature Generation"

    # Generate the fbank features; by default 80-dimensional fbanks on each frame
    fbankdir=fbank
    make_fbank.sh --cmd "${train_cmd}" --nj ${nj} \
        --fs ${fs} \
        --fmax "${fmax}" \
        --fmin "${fmin}" \
        --n_fft ${n_fft} \
        --n_shift ${n_shift} \
        --win_length "${win_length}" \
        --n_mels ${n_mels} \
        data/${org_set} \
        exp/make_fbank/${org_set} \
        ${fbankdir}

    # make a dev set: last 200 utts are split into 100 eval + 100 dev,
    # the remaining first n utts become the training set
    utils/subset_data_dir.sh --last data/${org_set} 200 data/${org_set}_tmp
    utils/subset_data_dir.sh --last data/${org_set}_tmp 100 data/${eval_set}
    utils/subset_data_dir.sh --first data/${org_set}_tmp 100 data/${dev_set}
    n=$(( $(wc -l < data/${org_set}/wav.scp) - 200 ))
    utils/subset_data_dir.sh --first data/${org_set} ${n} data/${train_set}
    rm -rf data/${org_set}_tmp

    # use pretrained model cmvn (features must be normalized with the same
    # statistics the pretrained model was trained with)
    cmvn=$(find ${download_dir}/${pretrained_model} -name "cmvn.ark" | head -n 1)

    # dump features for training
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta false \
        data/${train_set}/feats.scp ${cmvn} exp/dump_feats/${train_set} ${feat_tr_dir}
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta false \
        data/${dev_set}/feats.scp ${cmvn} exp/dump_feats/${dev_set} ${feat_dt_dir}
    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta false \
        data/${eval_set}/feats.scp ${cmvn} exp/dump_feats/${eval_set} ${feat_ev_dir}
fi

# use the pretrained model's token list so labels match its embedding table
dict=$(find ${download_dir}/${pretrained_model} -name "*_units.txt" | head -n 1)
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    # make json labels using pretrained model dict
    data2json.sh --feat ${feat_tr_dir}/feats.scp \
        data/${train_set} ${dict} > ${feat_tr_dir}/data.json
    data2json.sh --feat ${feat_dt_dir}/feats.scp \
        data/${dev_set} ${dict} > ${feat_dt_dir}/data.json
    data2json.sh --feat ${feat_ev_dir}/feats.scp \
        data/${eval_set} ${dict} > ${feat_ev_dir}/data.json
fi

# add pretrained model info in config: write a derived yaml next to the
# original and point train_config at it
pretrained_model_path=$(find ${download_dir}/${pretrained_model} -name "model*.best" | head -n 1)
train_config="$(change_yaml.py -a pretrained-model="${pretrained_model_path}" \
    -o "conf/$(basename "${train_config}" .yaml).${pretrained_model}.yaml" "${train_config}")"

if [ -z "${tag}" ]; then
    expname=${train_set}_${backend}_$(basename ${train_config%.*})
else
    expname=${train_set}_${backend}_${tag}
fi
expdir=exp/${expname}
mkdir -p ${expdir}
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Text-to-speech model training"
    tr_json=${feat_tr_dir}/data.json
    dt_json=${feat_dt_dir}/data.json
    ${cuda_cmd} --gpu ${ngpu} ${expdir}/train.log \
        tts_train.py \
            --backend ${backend} \
            --ngpu ${ngpu} \
            --minibatches ${N} \
            --outdir ${expdir}/results \
            --tensorboard-dir tensorboard/${expname} \
            --verbose ${verbose} \
            --seed ${seed} \
            --resume ${resume} \
            --train-json ${tr_json} \
            --valid-json ${dt_json} \
            --config ${train_config}
fi

if [ ${n_average} -gt 0 ]; then
    model=model.last${n_average}.avg.best
fi
outdir=${expdir}/outputs_${model}_$(basename ${decode_config%.*})
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: Decoding"
    if [ ${n_average} -gt 0 ]; then
        average_checkpoints.py --backend ${backend} \
            --snapshots ${expdir}/results/snapshot.ep.* \
            --out ${expdir}/results/${model} \
            --num ${n_average}
    fi
    pids=() # initialize pids
    for name in ${dev_set} ${eval_set}; do
    (
        [ ! -e ${outdir}/${name} ] && mkdir -p ${outdir}/${name}
        cp ${dumpdir}/${name}/data.json ${outdir}/${name}
        splitjson.py --parts ${nj} ${outdir}/${name}/data.json
        # decode in parallel
        ${train_cmd} JOB=1:${nj} ${outdir}/${name}/log/decode.JOB.log \
            tts_decode.py \
                --backend ${backend} \
                --ngpu 0 \
                --verbose ${verbose} \
                --out ${outdir}/${name}/feats.JOB \
                --json ${outdir}/${name}/split${nj}utt/data.JOB.json \
                --model ${expdir}/results/${model} \
                --config ${decode_config}
        # concatenate scp files
        for n in $(seq ${nj}); do
            cat "${outdir}/${name}/feats.$n.scp" || exit 1;
        done > ${outdir}/${name}/feats.scp
    ) &
    pids+=($!) # store background pids
    done
    # wait for all background decodes; fail the script if any of them failed
    i=0; for pid in "${pids[@]}"; do wait ${pid} || ((i++)); done
    [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
fi

if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "stage 5: Synthesis"
    pids=() # initialize pids
    for name in ${dev_set} ${eval_set}; do
    (
        [ ! -e ${outdir}_denorm/${name} ] && mkdir -p ${outdir}_denorm/${name}
        # use pretrained model cmvn to undo the normalization before vocoding
        cmvn=$(find ${download_dir}/${pretrained_model} -name "cmvn.ark" | head -n 1)
        apply-cmvn --norm-vars=true --reverse=true ${cmvn} \
            scp:${outdir}/${name}/feats.scp \
            ark,scp:${outdir}_denorm/${name}/feats.ark,${outdir}_denorm/${name}/feats.scp
        convert_fbank.sh --nj ${nj} --cmd "${train_cmd}" \
            --fs ${fs} \
            --fmax "${fmax}" \
            --fmin "${fmin}" \
            --n_fft ${n_fft} \
            --n_shift ${n_shift} \
            --win_length "${win_length}" \
            --n_mels ${n_mels} \
            --iters ${griffin_lim_iters} \
            ${outdir}_denorm/${name} \
            ${outdir}_denorm/${name}/log \
            ${outdir}_denorm/${name}/wav
    ) &
    pids+=($!) # store background pids
    done
    # wait for all background synthesis jobs; fail the script if any failed
    i=0; for pid in "${pids[@]}"; do wait ${pid} || ((i++)); done
    [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
    echo "Finished."
fi