# text_to_wav.py

import os
import sys
import argparse
import yaml
import logging
import zipfile
from glob import glob
import soundfile as sf
import numpy as np


# ROOT_PATH is the parent of the directory that contains this file. The
# directory above ROOT_PATH is prepended to sys.path, presumably so that the
# kantts package imported below resolves from a source checkout (TODO confirm
# against the repository layout).
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # NOQA: E402
sys.path.insert(0, os.path.dirname(ROOT_PATH))  # NOQA: E402

# The kantts entry points used by this script. Any ImportError raised while
# importing them is replaced by a short installation hint.
try:
    from kantts.bin.infer_sambert import am_infer
    from kantts.bin.infer_hifigan import hifigan_infer
    from kantts.utils.ling_unit import text_to_mit_symbols as text_to_symbols
except ImportError:
    raise ImportError("Please install kantts.")

# Configure the root logger at import time: INFO level, with timestamp,
# level name and source location in front of every message.
logging.basicConfig(
    #  filename=os.path.join(stage_dir, 'stdout.log'),
    format="%(asctime)s, %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%d:%H:%M:%S",
    level=logging.INFO,
)


def concat_process(chunked_dir, output_dir):
    wav_files = sorted(glob(os.path.join(chunked_dir, "*.wav")))
    print(wav_files)
    sentence_sil = 0.28  # seconds
    end_sil = 0.05  # seconds

    cnt = 0
    wav_concat = None
    main_id, sub_id = 0, 0

    while cnt < len(wav_files):
        wav_file = os.path.join(
            chunked_dir, "{}_{}_mel_gen.wav".format(main_id, sub_id)
        )
        if os.path.exists(wav_file):
            wav, sr = sf.read(wav_file)
            sentence_sil_samples = int(sentence_sil * sr)
            end_sil_samples = int(end_sil * sr)
            if sub_id == 0:
                wav_concat = wav
            else:
                wav_concat = np.concatenate(
                    (wav_concat, np.zeros(sentence_sil_samples), wav), axis=0
                )

            sub_id += 1
            cnt += 1
        else:
            if wav_concat is not None:
                wav_concat = np.concatenate(
                    (wav_concat, np.zeros(end_sil_samples)), axis=0
                )
                sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr)

            main_id += 1
            sub_id = 0
            wav_concat = None

        if cnt == len(wav_files):
            wav_concat = np.concatenate((wav_concat, np.zeros(end_sil_samples)), axis=0)
            sf.write(os.path.join(output_dir, f"{main_id}.wav"), wav_concat, sr)


def text_to_wav(
    text_file,
    output_dir,
    resources_zip_file,
    am_ckpt,
    voc_ckpt,
    speaker=None,
    se_file=None,
    lang="PinYin",
):
    """Run the text -> symbols -> acoustic model -> vocoder -> wav pipeline.

    Steps, in order:
      1. Extract ``resources_zip_file`` next to itself unless a ``resource``
         directory already exists there.
      2. Convert the lines of ``text_file`` to symbols and write them to
         ``{output_dir}/symbols.lst``.
      3. Run ``am_infer`` and ``hifigan_infer`` with ``output_dir`` as their
         output location.
      4. Join the generated chunks into ``{output_dir}/res_wavs``.

    Args:
        text_file: Path to the UTF-8 text file to synthesize.
        output_dir: Directory for all intermediate and final outputs; created
            if missing.
        resources_zip_file: Path to the resource zip archive.
        am_ckpt: Path to the acoustic model checkpoint. ``config.yaml`` is
            read from the directory two levels above this file.
        voc_ckpt: Path to the vocoder checkpoint.
        speaker: Speaker name. Defaults to the first entry of the
            comma-separated ``linguistic_unit.speaker_list`` config value.
        se_file: Optional speaker embedding file forwarded to ``am_infer``.
        lang: Language identifier forwarded to the text front end.
    """
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "res_wavs"), exist_ok=True)

    resource_root_dir = os.path.dirname(resources_zip_file)
    resource_dir = os.path.join(resource_root_dir, "resource")

    if not os.path.exists(resource_dir):
        logging.info("Extracting resources...")
        with zipfile.ZipFile(resources_zip_file, "r") as zip_ref:
            zip_ref.extractall(resource_root_dir)

    # Explicit UTF-8: the platform default encoding cannot be relied on for
    # non-ASCII input text (e.g. Chinese) on non-UTF-8 locales.
    with open(text_file, "r", encoding="utf-8") as text_data:
        texts = text_data.readlines()

    logging.info("Converting text to symbols...")
    am_config = os.path.join(os.path.dirname(os.path.dirname(am_ckpt)), "config.yaml")
    with open(am_config, "r", encoding="utf-8") as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects.
        # Only load configs from trusted sources, or switch to
        # yaml.safe_load if the config is confirmed to be plain YAML.
        config = yaml.load(f, Loader=yaml.Loader)
    if speaker is None:
        speaker = config["linguistic_unit"]["speaker_list"].split(",")[0]
    symbols_lst = text_to_symbols(texts, resource_dir, speaker, lang)
    symbols_file = os.path.join(output_dir, "symbols.lst")
    with open(symbols_file, "w", encoding="utf-8") as symbol_data:
        # Entries are written back-to-back; they are presumably already
        # newline-terminated by text_to_symbols (TODO confirm).
        symbol_data.writelines(symbols_lst)

    logging.info("AM is infering...")
    am_infer(symbols_file, am_ckpt, output_dir, se_file)

    logging.info("Vocoder is infering...")
    hifigan_infer(os.path.join(output_dir, "feat"), voc_ckpt, output_dir)

    concat_process(output_dir, os.path.join(output_dir, "res_wavs"))

    logging.info("Text to wav finished!")


if __name__ == "__main__":
    # Command-line front end: declare the options in a table, parse them and
    # forward the values to text_to_wav().
    lang_help = """The language of the text, default is PinYin, other options are:
        English,
        British,
        ZhHK,
        WuuShanghai,
        Sichuan,
        Indonesian,
        Malay,
        Filipino,
        Vietnamese,
        Korean,
        Russian
        """

    # (flag, add_argument keywords) in the order they are registered.
    cli_options = [
        ("--txt", dict(required=True, help="Path to text file")),
        ("--output_dir", dict(required=True, help="Path to output directory")),
        ("--res_zip", dict(required=True, help="Path to resource zip file")),
        ("--am_ckpt", dict(required=True, help="Path to am ckpt file")),
        ("--voc_ckpt", dict(required=True, help="Path to voc ckpt file")),
        (
            "--speaker",
            dict(
                required=False,
                default=None,
                help="The speaker name, default is the first speaker",
            ),
        ),
        (
            "--se_file",
            dict(
                required=False,
                default=None,
                help="The speaker embedding file , default is None",
            ),
        ),
        ("--lang", dict(default="PinYin", help=lang_help)),
    ]

    cli = argparse.ArgumentParser(description="Text to wav")
    for flag, extra in cli_options:
        # Every option is a plain string.
        cli.add_argument(flag, type=str, **extra)
    opts = cli.parse_args()

    text_to_wav(
        text_file=opts.txt,
        output_dir=opts.output_dir,
        resources_zip_file=opts.res_zip,
        am_ckpt=opts.am_ckpt,
        voc_ckpt=opts.voc_ckpt,
        speaker=opts.speaker,
        se_file=opts.se_file,
        lang=opts.lang,
    )