"""Command-line voice-conversion demo using SpeechT5 speech-to-speech.

Loads a SpeechT5 speech-to-speech checkpoint and a HiFi-GAN vocoder, feeds
them an input recording plus a speaker embedding stored as ``.npy``, and
writes the generated waveform to ``<result_path>/vc.wav``.
"""
import argparse
import os

import librosa
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset  # not referenced in this script; kept as in the original
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan


def parse_opt(known=False):
    """Parse the command-line options of the script.

    Args:
        known: when true, unrecognised arguments are ignored
            (``parse_known_args``) instead of causing an argparse error.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-hip', '--hip-device', type=int, default=0, help="initial hip devices")
    parser.add_argument('-m', '--model-path', type=str, default="", help="initial model path")
    parser.add_argument('-v', '--vocoder-path', type=str, default="", help="the vocoder model path")
    # NOTE(review): this default is a sentence, but main() hands the value to
    # librosa.load() as a file path, so in practice the option has to be given.
    parser.add_argument('-is', '--input_speech', type=str, default="Autumn, the season of change.", help="input speech")
    parser.add_argument('-s', '--speaker', type=str, default="", help="the feature of speaker:path of xxx.npy")
    parser.add_argument('-res', '--result_path', type=str, default="../res", help="the path to save wav file")
    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt


def main(opt):
    """Run one voice conversion and save the result as ``vc.wav``.

    Args:
        opt: namespace produced by :func:`parse_opt`. The attributes read
            here are ``hip_device``, ``model_path``, ``vocoder_path``,
            ``input_speech``, ``speaker`` and ``result_path``.

    The output directory is created when it does not exist yet, so the
    final write cannot fail merely because ``result_path`` is missing.
    """
    device = torch.device(f"cuda:{int(opt.hip_device)}")
    print(f"Using device: {device}")

    # Point Hugging Face downloads at the mainland-China mirror.
    # NOTE(review): transformers is already imported at this point; the hub
    # client presumably reads HF_ENDPOINT at import time, in which case this
    # assignment has no effect on this process - confirm, and export the
    # variable in the shell before launching if the mirror is required.
    os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

    # Load the processor and the speech-to-speech model.
    processor = SpeechT5Processor.from_pretrained(opt.model_path)
    model = SpeechT5ForSpeechToSpeech.from_pretrained(opt.model_path).to(device)

    # Encoder input: the recording, resampled to 16 kHz by librosa.
    example_speech, sampling_rate = librosa.load(opt.input_speech, sr=16000, dtype=np.float64)
    inputs = processor(audio=example_speech, sampling_rate=sampling_rate, return_tensors="pt").to(device)

    # Decoder input: the speaker embedding. It is converted straight to
    # float32; the former intermediate float64 copy was discarded anyway.
    # unsqueeze(0) prepends one dimension - presumably the batch axis that
    # generate_speech expects; confirm against the saved embedding's shape.
    speaker_embeddings = torch.tensor(np.load(opt.speaker), dtype=torch.float32).unsqueeze(0).to(device)

    # Decoder output: the vocoder turns the generated spectrogram into a waveform.
    vocoder = SpeechT5HifiGan.from_pretrained(opt.vocoder_path).to(device)
    speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)

    # Create the output directory up front. An empty result_path means
    # "current directory" for os.path.join, and os.makedirs("") would raise.
    if opt.result_path:
        os.makedirs(opt.result_path, exist_ok=True)
    sf.write(os.path.join(opt.result_path, "vc.wav"), speech.cpu().numpy(), samplerate=sampling_rate)


if __name__ == "__main__":
    main(parse_opt())