inference.py

import torchaudio
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN
import os
import argparse


def parse_opt(known=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model-path', type=str, default="", help="the tacotron2 model path")
    parser.add_argument('-v', '--vocoder-path', type=str, default="", help="the vocoder model path")
    parser.add_argument('-t', '--text', type=str, default="Autumn, the season of change.", help="input text")
    parser.add_argument('-res', '--result_path', type=str, default="./res", help="the path to save wav file")
    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt


def main(opt):    
    tacotron2 = Tacotron2.from_hparams(source=opt.model_path, run_opts={"device":"cuda"})
    hifi_gan = HIFIGAN.from_hparams(source=opt.vocoder_path,run_opts={"device":"cuda"})

    # Running the TTS
    mel_output, mel_length, alignment = tacotron2.encode_text(opt.text)

    # Running Vocoder (spectrogram-to-waveform)
    waveforms = hifi_gan.decode_batch(mel_output)

    # Save the waverform
    torchaudio.save(os.path.join(opt.result_path, 'example.wav'),waveforms.squeeze(1).cpu(), 22050)

if __name__ == "__main__":
    main(opt=parse_opt())