# inference_talkinghead.py
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import sys
from typing import Any
import tempfile
from modelscope.pipelines import pipeline
from facechain.constants import tts_speakers_map
from facechain.utils import join_worker_data_dir
try:
    import edge_tts
except ImportError:
    print("警告:未找到edge_tts模块,语音合成功能将无法使用。您可以通过`pip install edge-tts`安装它。\n Warning: The edge_tts module is not found, so the speech synthesis function will not be available. You can install it by 'pip install edge-tts'.")

class SadTalker():
    def __init__(self, uuid):
        if not uuid:
            if os.getenv("MODELSCOPE_ENVIRONMENT") == 'studio':
                # Returning a string from __init__ would raise a TypeError; fail with a clear message instead.
                raise RuntimeError("Please login first!")
            else:
                uuid = 'qw'

        # self.save_dir = os.path.join('/tmp', uuid, 'synthesized_video') # deprecated
        # self.save_dir = os.path.join('.', uuid, 'synthesized_video') # deprecated
        self.save_dir = join_worker_data_dir(uuid, 'synthesized_video')

    def __call__(self, *args, **kwargs) -> Any:
        # two required arguments
        source_image = kwargs.get("source_image") or args[0]
        driven_audio = kwargs.get('driven_audio') or args[1]
        # other optional arguments
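        # NOTE: `kwargs.get(...) or args[i]` falls back to the positional argument
        # whenever the keyword value is missing *or* falsy (e.g. still_mode=False,
        # pose_style=0), so callers are expected to pass all nine arguments positionally.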
        kwargs = {
            'preprocess' : kwargs.get('preprocess') or args[2], 
            'still_mode' : kwargs.get('still_mode') or args[3],
            'use_enhancer' : kwargs.get('use_enhancer') or args[4],
            'batch_size' : kwargs.get('batch_size') or args[5],
            'size' : kwargs.get('size') or args[6], 
            'pose_style' : kwargs.get('pose_style') or args[7],
            'exp_scale' : kwargs.get('exp_scale') or args[8],
            'result_dir': self.save_dir
        }
        inference = pipeline('talking-head', model='wwd123/sadtalker', model_revision='v1.0.0')
        print("initialized sadtalker pipeline")
        video_path = inference(source_image, driven_audio=driven_audio, **kwargs)
        return video_path


async def text_to_speech_edge(text, speaker):
    voice = tts_speakers_map[speaker]
    communicate = edge_tts.Communicate(text, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    await communicate.save(tmp_path)

    return tmp_path
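

# --- Illustrative usage sketch (not part of the original module) ---
# The portrait path and argument values below are placeholders; adjust them to
# your environment. Assumes edge-tts is installed, tts_speakers_map is non-empty,
# and the modelscope 'talking-head' pipeline (wwd123/sadtalker) is available.
if __name__ == "__main__":
    import asyncio

    # 1. Synthesize driving audio from text with edge-tts, using the first
    #    voice configured in tts_speakers_map.
    speaker = next(iter(tts_speakers_map))
    audio_path = asyncio.run(text_to_speech_edge("Hello from FaceChain!", speaker))

    # 2. Animate a portrait with the synthesized audio via the SadTalker pipeline.
    talker = SadTalker(uuid="demo_user")
    video_path = talker(
        "portrait.png",  # source_image (placeholder path)
        audio_path,      # driven_audio
        "crop",          # preprocess mode
        True,            # still_mode
        False,           # use_enhancer
        2,               # batch_size
        256,             # size
        0,               # pose_style
        1.0,             # exp_scale
    )
    print("Synthesized video saved to:", video_path)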