Unverified Commit f7cdbcb5 authored by LiangLiu, committed by GitHub

multi-person & animate & podcast (#554)



- New service features (frontend + backend):
1. seko-talk model now supports multi-person input
2. Podcast synthesis and management
3. Support for the wan2.2 animate model

- New backend APIs:
1. Volcano Engine-based podcast WebSocket synthesis API that supports listening while synthesis is still running
2. Podcast query and management APIs
3. YOLO-based multi-person face detection API
4. Multi-speaker audio separation API

- Invasive changes to the inference code:
1. Moved the animate-related input file paths (mask/image/pose, etc.) out of the hard-coded config and into the mutable input_info (see the sketch below)
2. Wrapped the animate preprocessing code into an API for service use
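
A rough sketch of the idea (the path keys below are the ones removed from the configs later in this diff; the surrounding structure is only illustrative, not the exact request schema): instead of fixed paths in the config, each request carries them in input_info, e.g.

    "input_info": {
        "src_pose_path": "<per-task pose video>",
        "src_face_path": "<per-task face video>",
        "src_ref_images": "<per-task reference image>",
        "src_bg_path": "<per-task background video, replace mode only>",
        "src_mask_path": "<per-task mask video, replace mode only>"
    }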

@xinyiqin

---------
Co-authored-by: qinxinyi <qxy118045534@163.com>
parent 61dd69ca
......@@ -94,7 +94,7 @@
}
},
"s2v": {
"seko_talk": {
"SekoTalk": {
"single_stage": {
"pipeline": {
"inputs": ["input_image", "input_audio"],
......@@ -125,12 +125,24 @@
}
}
}
},
"animate": {
"wan2.2_animate": {
"single_stage": {
"pipeline": {
"inputs": ["input_image","input_video"],
"outputs": ["output_video"]
}
}
}
}
},
"meta": {
"special_types": {
"input_image": "IMAGE",
"input_audio": "AUDIO",
"input_video": "VIDEO",
"latents": "TENSOR",
"output_video": "VIDEO"
},
......
......@@ -12,9 +12,6 @@
"sample_guide_scale": 5.0,
"enable_cfg": false,
"cpu_offload": false,
"src_pose_path": "../save_results/animate/process_results/src_pose.mp4",
"src_face_path": "../save_results/animate/process_results/src_face.mp4",
"src_ref_images": "../save_results/animate/process_results/src_ref.png",
"refert_num": 1,
"replace_flag": false,
"fps": 30
......
......@@ -13,9 +13,6 @@
"enable_cfg": false,
"cpu_offload": true,
"offload_granularity": "phase",
"src_pose_path": "../save_results/animate/process_results/src_pose.mp4",
"src_face_path": "../save_results/animate/process_results/src_face.mp4",
"src_ref_images": "../save_results/animate/process_results/src_ref.png",
"refert_num": 1,
"replace_flag": false,
"fps": 30,
......
......@@ -12,9 +12,6 @@
"sample_guide_scale": 1.0,
"enable_cfg": false,
"cpu_offload": false,
"src_pose_path": "../save_results/animate/process_results/src_pose.mp4",
"src_face_path": "../save_results/animate/process_results/src_face.mp4",
"src_ref_images": "../save_results/animate/process_results/src_ref.png",
"refert_num": 1,
"replace_flag": false,
"fps": 30,
......
......@@ -12,11 +12,6 @@
"sample_guide_scale": 5.0,
"enable_cfg": false,
"cpu_offload": false,
"src_pose_path": "../save_results/replace/process_results/src_pose.mp4",
"src_face_path": "../save_results/replace/process_results/src_face.mp4",
"src_ref_images": "../save_results/replace/process_results/src_ref.png",
"src_bg_path": "../save_results/replace/process_results/src_bg.mp4",
"src_mask_path": "../save_results/replace/process_results/src_mask.mp4",
"refert_num": 1,
"fps": 30,
"replace_flag": true
......
......@@ -13,11 +13,6 @@
"enable_cfg": false,
"cpu_offload": true,
"offload_granularity": "phase",
"src_pose_path": "../save_results/replace/process_results/src_pose.mp4",
"src_face_path": "../save_results/replace/process_results/src_face.mp4",
"src_ref_images": "../save_results/replace/process_results/src_ref.png",
"src_bg_path": "../save_results/replace/process_results/src_bg.mp4",
"src_mask_path": "../save_results/replace/process_results/src_mask.mp4",
"refert_num": 1,
"fps": 30,
"replace_flag": true,
......
# -*- coding: utf-8 -*-
"""
Audio Source Separation Module
Separates different voice tracks in audio, supports multi-person audio separation
"""
import base64
import io
import os
import tempfile
import traceback
from collections import defaultdict
from typing import Dict, Optional, Union
import torch
import torchaudio
from loguru import logger
# Import pyannote.audio for speaker diarization
from pyannote.audio import Audio, Pipeline
class AudioSeparator:
"""
Audio separator for separating different voice tracks in audio using pyannote.audio
Supports multi-person conversation separation, maintains duration (other speakers' tracks are empty)
"""
def __init__(
self,
model_path: str = None,
device: str = None,
sample_rate: int = 16000,
):
"""
Initialize audio separator
Args:
model_path: Model path (if using custom model), default uses pyannote/speaker-diarization-community-1
device: Device ('cpu', 'cuda', etc.), None for auto selection
sample_rate: Target sample rate, default 16000
"""
self.sample_rate = sample_rate
self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
self._init_pyannote(model_path)
def _init_pyannote(self, model_path: str = None):
"""Initialize pyannote.audio pipeline"""
try:
huggingface_token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
model_name = model_path or "pyannote/speaker-diarization-community-1"
try:
# Try loading with token if available
if huggingface_token:
self.pipeline = Pipeline.from_pretrained(model_name, token=huggingface_token)
else:
# Try without token (may work for public models)
self.pipeline = Pipeline.from_pretrained(model_name)
except Exception as e:
if "gated" in str(e).lower() or "token" in str(e).lower():
raise RuntimeError(f"Model requires authentication. Set HUGGINGFACE_TOKEN or HF_TOKEN environment variable: {e}")
raise RuntimeError(f"Failed to load pyannote model: {e}")
# Move pipeline to specified device
if self.device:
self.pipeline.to(torch.device(self.device))
# Initialize Audio helper for waveform loading
self.pyannote_audio = Audio()
logger.info("Initialized pyannote.audio speaker diarization pipeline")
except Exception as e:
logger.error(f"Failed to initialize pyannote: {e}")
raise RuntimeError(f"Failed to initialize pyannote.audio pipeline: {e}")
def separate_speakers(
self,
audio_path: Union[str, bytes],
num_speakers: Optional[int] = None,
min_speakers: int = 1,
max_speakers: int = 5,
) -> Dict:
"""
Separate different speakers in audio
Args:
audio_path: Audio file path or bytes data
num_speakers: Specified number of speakers, None for auto detection
min_speakers: Minimum number of speakers
max_speakers: Maximum number of speakers
Returns:
Dict containing:
- speakers: List of speaker audio segments, each containing:
- speaker_id: Speaker ID (0, 1, 2, ...)
- audio: torch.Tensor audio data [channels, samples]
- segments: List of (start_time, end_time) tuples
- sample_rate: Sample rate
"""
try:
# Load audio
if isinstance(audio_path, bytes):
# Try to infer the audio format from the byte data
# Check for WAV format (RIFF header)
is_wav = audio_path[:4] == b"RIFF" and audio_path[8:12] == b"WAVE"
# Check for MP3 format (ID3 or MPEG frame header)
is_mp3 = audio_path[:3] == b"ID3" or audio_path[:2] == b"\xff\xfb" or audio_path[:2] == b"\xff\xf3"
# Choose the temp-file suffix based on the detected format
if is_wav:
suffix = ".wav"
elif is_mp3:
suffix = ".mp3"
else:
# Default to WAV; loading will raise an error if that guess is wrong
suffix = ".wav"
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
tmp_file.write(audio_path)
tmp_audio_path = tmp_file.name
try:
result = self._separate_speakers_internal(tmp_audio_path, num_speakers, min_speakers, max_speakers)
finally:
# Make sure the temporary file is deleted
try:
os.unlink(tmp_audio_path)
except Exception as e:
logger.warning(f"Failed to delete temp file {tmp_audio_path}: {e}")
return result
else:
return self._separate_speakers_internal(audio_path, num_speakers, min_speakers, max_speakers)
except Exception as e:
logger.error(f"Speaker separation failed: {traceback.format_exc()}")
raise RuntimeError(f"Audio separation error: {e}")
def _separate_speakers_internal(
self,
audio_path: str,
num_speakers: Optional[int] = None,
min_speakers: int = 1,
max_speakers: int = 5,
) -> Dict:
"""Internal method: execute speaker separation"""
# Load audio
waveform, original_sr = torchaudio.load(audio_path)
if original_sr != self.sample_rate:
resampler = torchaudio.transforms.Resample(original_sr, self.sample_rate)
waveform = resampler(waveform)
# Convert to mono if stereo
if waveform.shape[0] > 1:
waveform = waveform.mean(dim=0, keepdim=True)
# Ensure waveform is float32 and normalized (pyannote expects this format)
if waveform.dtype != torch.float32:
waveform = waveform.float()
# Ensure waveform is in range [-1, 1] (normalize if needed)
if waveform.abs().max() > 1.0:
waveform = waveform / waveform.abs().max()
if self.pipeline is None:
raise RuntimeError("Pyannote pipeline not initialized")
return self._separate_with_pyannote(audio_path, waveform, num_speakers, min_speakers, max_speakers)
def _separate_with_pyannote(
self,
audio_path: str,
waveform: torch.Tensor,
num_speakers: Optional[int],
min_speakers: int,
max_speakers: int,
) -> Dict:
"""Use pyannote.audio for speaker diarization"""
try:
# Use waveform dict to avoid AudioDecoder dependency issues
# Pipeline can accept either file path or waveform dict
# Using waveform dict is more reliable when torchcodec is not properly installed
audio_input = {
"waveform": waveform,
"sample_rate": self.sample_rate,
}
# Run speaker diarization
output = self.pipeline(
audio_input,
min_speakers=min_speakers if num_speakers is None else num_speakers,
max_speakers=max_speakers if num_speakers is None else num_speakers,
)
# Extract audio segments for each speaker
speakers_dict = defaultdict(list)
for turn, speaker in output.speaker_diarization:
print(f"Speaker: {speaker}, Start time: {turn.start}, End time: {turn.end}")
start_time = turn.start
end_time = turn.end
start_sample = int(start_time * self.sample_rate)
end_sample = int(end_time * self.sample_rate)
# Extract audio segment for this time period
segment_audio = waveform[:, start_sample:end_sample]
speakers_dict[speaker].append((start_time, end_time, segment_audio))
# Generate complete audio for each speaker (other speakers' segments are empty)
speakers = []
audio_duration = waveform.shape[1] / self.sample_rate
num_samples = waveform.shape[1]
for speaker_id, segments in speakers_dict.items():
# Create zero-filled audio
speaker_audio = torch.zeros_like(waveform)
# Fill in this speaker's segments
for start_time, end_time, segment_audio in segments:
start_sample = int(start_time * self.sample_rate)
end_sample = int(end_time * self.sample_rate)
# Ensure no out-of-bounds
end_sample = min(end_sample, num_samples)
segment_len = end_sample - start_sample
if segment_len > 0 and segment_audio.shape[1] > 0:
actual_len = min(segment_len, segment_audio.shape[1])
speaker_audio[:, start_sample : start_sample + actual_len] = segment_audio[:, :actual_len]
speakers.append(
{
"speaker_id": speaker_id,
"audio": speaker_audio,
"segments": [(s[0], s[1]) for s in segments],
"sample_rate": self.sample_rate,
}
)
logger.info(f"Separated audio into {len(speakers)} speakers using pyannote")
return {"speakers": speakers, "method": "pyannote"}
except Exception as e:
logger.error(f"Pyannote separation failed: {e}")
raise RuntimeError(f"Audio separation failed: {e}")
def save_speaker_audio(self, speaker_audio: torch.Tensor, output_path: str, sample_rate: int = None):
"""
Save speaker audio to file
Args:
speaker_audio: Audio tensor [channels, samples]
output_path: Output path
sample_rate: Sample rate, if None uses self.sample_rate
"""
sr = sample_rate if sample_rate else self.sample_rate
torchaudio.save(output_path, speaker_audio, sr)
logger.info(f"Saved speaker audio to {output_path}")
def speaker_audio_to_base64(self, speaker_audio: torch.Tensor, sample_rate: int = None, format: str = "wav") -> str:
"""
Convert speaker audio tensor to base64 encoded string without saving to file
Args:
speaker_audio: Audio tensor [channels, samples]
sample_rate: Sample rate, if None uses self.sample_rate
format: Audio format (default: "wav")
Returns:
Base64 encoded audio string
"""
sr = sample_rate if sample_rate else self.sample_rate
# Use BytesIO to save audio to memory
buffer = io.BytesIO()
torchaudio.save(buffer, speaker_audio, sr, format=format)
# Get the audio bytes
audio_bytes = buffer.getvalue()
# Encode to base64
audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
logger.debug(f"Converted speaker audio to base64, size: {len(audio_bytes)} bytes")
return audio_base64
def separate_and_save(
self,
audio_path: Union[str, bytes],
output_dir: str,
num_speakers: Optional[int] = None,
min_speakers: int = 1,
max_speakers: int = 5,
) -> Dict:
"""
Separate audio and save to files
Args:
audio_path: Input audio path or bytes data
output_dir: Output directory
num_speakers: Specified number of speakers
min_speakers: Minimum number of speakers
max_speakers: Maximum number of speakers
Returns:
Separation result dictionary, containing output file paths
"""
os.makedirs(output_dir, exist_ok=True)
result = self.separate_speakers(audio_path, num_speakers, min_speakers, max_speakers)
output_paths = []
for speaker in result["speakers"]:
speaker_id = speaker["speaker_id"]
output_path = os.path.join(output_dir, f"{speaker_id}.wav")
self.save_speaker_audio(speaker["audio"], output_path, speaker["sample_rate"])
output_paths.append(output_path)
speaker["output_path"] = output_path
result["output_paths"] = output_paths
return result
def separate_audio_tracks(
audio_path: str,
output_dir: str = None,
num_speakers: int = None,
model_path: str = None,
) -> Dict:
"""
Convenience function: separate different audio tracks
Args:
audio_path: Audio file path
output_dir: Output directory, if None does not save files
num_speakers: Number of speakers
model_path: Model path (optional)
Returns:
Separation result dictionary
"""
separator = AudioSeparator(model_path=model_path)
if output_dir:
return separator.separate_and_save(audio_path, output_dir, num_speakers=num_speakers)
else:
return separator.separate_speakers(audio_path, num_speakers=num_speakers)
if __name__ == "__main__":
# Test code
import sys
if len(sys.argv) < 2:
print("Usage: python audio_separator.py <audio_path> [output_dir] [num_speakers]")
sys.exit(1)
audio_path = sys.argv[1]
output_dir = sys.argv[2] if len(sys.argv) > 2 else "./separated_audio"
num_speakers = int(sys.argv[3]) if len(sys.argv) > 3 else None
separator = AudioSeparator()
result = separator.separate_and_save(audio_path, output_dir, num_speakers=num_speakers)
print(f"Separated audio into {len(result['speakers'])} speakers:")
for speaker in result["speakers"]:
print(f" Speaker {speaker['speaker_id']}: {len(speaker['segments'])} segments")
if "output_path" in speaker:
print(f" Saved to: {speaker['output_path']}")
# -*- coding: utf-8 -*-
"""
Face Detection Module using YOLO
Supports detecting faces in images, including human faces, animal faces, anime faces, sketches, etc.
"""
import io
import traceback
from typing import Dict, List, Union
import numpy as np
from PIL import Image, ImageDraw
from loguru import logger
from ultralytics import YOLO
class FaceDetector:
"""
Face detection using YOLO models
Supports detecting: human faces, animal faces, anime faces, sketch faces, etc.
"""
def __init__(self, model_path: str = None, conf_threshold: float = 0.25, device: str = None):
"""
Initialize face detector
Args:
model_path: YOLO model path, if None uses default pretrained model
conf_threshold: Confidence threshold, default 0.25
device: Device ('cpu', 'cuda', '0', '1', etc.), None for auto selection
"""
self.conf_threshold = conf_threshold
self.device = device
if model_path is None:
# Use YOLO11 pretrained model, can detect COCO dataset classes (including person)
# Or use dedicated face detection model
logger.info("Loading default YOLO11n model for face detection")
try:
self.model = YOLO("yolo11n.pt") # Lightweight model
except Exception as e:
logger.warning(f"Failed to load default model, trying yolov8n: {e}")
self.model = YOLO("yolov8n.pt")
else:
logger.info(f"Loading YOLO model from {model_path}")
self.model = YOLO(model_path)
# Person class ID in COCO dataset is 0
# YOLO detects the person class; for more precise face detection, a dedicated face model
# such as YOLOv8-face or RetinaFace can be supplied via the model_path parameter
# The person region is detected first; faces can then be detected within it if needed
self.target_classes = {
"person": 0, # Face (by detecting person class)
# Can be extended to detect animal faces (cat, dog, etc.) and other classes
}
def detect_faces(
self,
image: Union[str, Image.Image, bytes, np.ndarray],
return_image: bool = False,
) -> Dict:
"""
Detect faces in image
Args:
image: Input image, can be path, PIL Image, bytes or numpy array
return_image: Whether to return annotated image with detection boxes
Returns:
Dict containing:
- faces: List of face detection results, each containing:
- bbox: [x1, y1, x2, y2] bounding box coordinates (absolute pixel coordinates)
- confidence: Confidence score (0.0-1.0)
- class_id: Class ID
- class_name: Class name
- image (optional): PIL Image with detection boxes drawn (if return_image=True)
"""
try:
# Load image
if isinstance(image, str):
img = Image.open(image).convert("RGB")
elif isinstance(image, bytes):
img = Image.open(io.BytesIO(image)).convert("RGB")
elif isinstance(image, np.ndarray):
img = Image.fromarray(image).convert("RGB")
elif isinstance(image, Image.Image):
img = image.convert("RGB")
else:
raise ValueError(f"Unsupported image type: {type(image)}")
# Use YOLO for detection
# Note: the default YOLO weights detect the person class, which is used here as a face proxy
# For more precise face detection, a dedicated face model can be trained or supplied
results = self.model.predict(
source=img,
conf=self.conf_threshold,
device=self.device,
verbose=False,
)
faces = []
annotated_img = img.copy() if return_image else None
if len(results) > 0:
result = results[0]
boxes = result.boxes
if boxes is not None and len(boxes) > 0:
for i in range(len(boxes)):
# Get bounding box coordinates (xyxy format)
bbox = boxes.xyxy[i].cpu().numpy().tolist()
confidence = float(boxes.conf[i].cpu().numpy())
class_id = int(boxes.cls[i].cpu().numpy())
# Get class name
class_name = result.names.get(class_id, "unknown")
# Process target classes (person, etc.)
# For person, the entire body box contains face region
# For more precise face detection, can:
# 1. Use dedicated face detection models (RetinaFace, YOLOv8-face)
# 2. Further use face detection model within current person box
# 3. Use specifically trained multi-class detection models (faces, animal faces, anime faces, etc.)
if class_id in self.target_classes.values():
face_info = {
"bbox": bbox, # [x1, y1, x2, y2] - absolute pixel coordinates
"confidence": confidence,
"class_id": class_id,
"class_name": class_name,
}
faces.append(face_info)
# Draw annotations on image if needed
if return_image and annotated_img is not None:
draw = ImageDraw.Draw(annotated_img)
x1, y1, x2, y2 = bbox
# Draw bounding box
draw.rectangle(
[x1, y1, x2, y2],
outline="red",
width=2,
)
# Draw label
label = f"{class_name} {confidence:.2f}"
draw.text((x1, y1 - 15), label, fill="red")
result_dict = {"faces": faces}
if return_image and annotated_img is not None:
result_dict["image"] = annotated_img
logger.info(f"Detected {len(faces)} faces in image")
return result_dict
except Exception as e:
logger.error(f"Face detection failed: {traceback.format_exc()}")
raise RuntimeError(f"Face detection error: {e}")
def detect_faces_from_bytes(self, image_bytes: bytes, **kwargs) -> Dict:
"""
Detect faces from byte data
Args:
image_bytes: Image byte data
**kwargs: Additional parameters passed to detect_faces
Returns:
Detection result dictionary
"""
return self.detect_faces(image_bytes, **kwargs)
def extract_face_regions(self, image: Union[str, Image.Image, bytes], expand_ratio: float = 0.1) -> List[Image.Image]:
"""
Extract detected face regions
Args:
image: Input image
expand_ratio: Bounding box expansion ratio to include more context
Returns:
List of extracted face region images
"""
result = self.detect_faces(image)
faces = result["faces"]
# Load original image
if isinstance(image, str):
img = Image.open(image).convert("RGB")
elif isinstance(image, bytes):
img = Image.open(io.BytesIO(image)).convert("RGB")
elif isinstance(image, Image.Image):
img = image.convert("RGB")
else:
raise ValueError(f"Unsupported image type: {type(image)}")
face_regions = []
img_width, img_height = img.size
for face in faces:
x1, y1, x2, y2 = face["bbox"]
# Expand bounding box
width = x2 - x1
height = y2 - y1
expand_x = width * expand_ratio
expand_y = height * expand_ratio
x1 = max(0, int(x1 - expand_x))
y1 = max(0, int(y1 - expand_y))
x2 = min(img_width, int(x2 + expand_x))
y2 = min(img_height, int(y2 + expand_y))
# Crop region
face_region = img.crop((x1, y1, x2, y2))
face_regions.append(face_region)
return face_regions
def count_faces(self, image: Union[str, Image.Image, bytes]) -> int:
"""
Count number of faces in image
Args:
image: Input image
Returns:
Number of detected faces
"""
result = self.detect_faces(image, return_image=False)
return len(result["faces"])
def detect_faces_in_image(
image_path: str,
model_path: str = None,
conf_threshold: float = 0.25,
return_image: bool = False,
) -> Dict:
"""
Convenience function: detect faces in image
Args:
image_path: Image path
model_path: YOLO model path
conf_threshold: Confidence threshold
return_image: Whether to return annotated image
Returns:
Detection result dictionary containing:
- faces: List of face detection results with bbox coordinates [x1, y1, x2, y2]
- image (optional): Annotated image with detection boxes
"""
detector = FaceDetector(model_path=model_path, conf_threshold=conf_threshold)
return detector.detect_faces(image_path, return_image=return_image)
if __name__ == "__main__":
# Test code
import sys
if len(sys.argv) < 2:
print("Usage: python face_detector.py <image_path>")
sys.exit(1)
image_path = sys.argv[1]
detector = FaceDetector()
result = detector.detect_faces(image_path, return_image=True)
print(f"Detected {len(result['faces'])} faces:")
for i, face in enumerate(result["faces"]):
print(f" Face {i + 1}: {face}")
output_path = "detected_faces.png"
result["image"].save(output_path)
print(f"Annotated image saved to: {output_path}")
# -*- coding: utf-8 -*-
import asyncio
import io
import json
import os
import struct
import uuid
from dataclasses import dataclass
from enum import IntEnum
from typing import Callable, List, Optional
import websockets
from loguru import logger
from pydub import AudioSegment
# Protocol definitions (from podcasts_protocols)
class MsgType(IntEnum):
"""Message type enumeration"""
Invalid = 0
FullClientRequest = 0b1
AudioOnlyClient = 0b10
FullServerResponse = 0b1001
AudioOnlyServer = 0b1011
FrontEndResultServer = 0b1100
Error = 0b1111
ServerACK = AudioOnlyServer
class MsgTypeFlagBits(IntEnum):
"""Message type flag bits"""
NoSeq = 0
PositiveSeq = 0b1
LastNoSeq = 0b10
NegativeSeq = 0b11
WithEvent = 0b100
class VersionBits(IntEnum):
"""Version bits"""
Version1 = 1
class HeaderSizeBits(IntEnum):
"""Header size bits"""
HeaderSize4 = 1
HeaderSize8 = 2
HeaderSize12 = 3
HeaderSize16 = 4
class SerializationBits(IntEnum):
"""Serialization method bits"""
Raw = 0
JSON = 0b1
Thrift = 0b11
Custom = 0b1111
class CompressionBits(IntEnum):
"""Compression method bits"""
None_ = 0
Gzip = 0b1
Custom = 0b1111
class EventType(IntEnum):
"""Event type enumeration"""
None_ = 0
StartConnection = 1
StartTask = 1
FinishConnection = 2
FinishTask = 2
ConnectionStarted = 50
TaskStarted = 50
ConnectionFailed = 51
TaskFailed = 51
ConnectionFinished = 52
TaskFinished = 52
StartSession = 100
CancelSession = 101
FinishSession = 102
SessionStarted = 150
SessionCanceled = 151
SessionFinished = 152
SessionFailed = 153
UsageResponse = 154
ChargeData = 154
TaskRequest = 200
UpdateConfig = 201
AudioMuted = 250
SayHello = 300
TTSSentenceStart = 350
TTSSentenceEnd = 351
TTSResponse = 352
TTSEnded = 359
PodcastRoundStart = 360
PodcastRoundResponse = 361
PodcastRoundEnd = 362
PodcastEnd = 363
@dataclass
class Message:
"""Message object"""
version: VersionBits = VersionBits.Version1
header_size: HeaderSizeBits = HeaderSizeBits.HeaderSize4
type: MsgType = MsgType.Invalid
flag: MsgTypeFlagBits = MsgTypeFlagBits.NoSeq
serialization: SerializationBits = SerializationBits.JSON
compression: CompressionBits = CompressionBits.None_
event: EventType = EventType.None_
session_id: str = ""
connect_id: str = ""
sequence: int = 0
error_code: int = 0
payload: bytes = b""
@classmethod
def from_bytes(cls, data: bytes) -> "Message":
"""Create message object from bytes"""
if len(data) < 3:
raise ValueError(f"Data too short: expected at least 3 bytes, got {len(data)}")
type_and_flag = data[1]
msg_type = MsgType(type_and_flag >> 4)
flag = MsgTypeFlagBits(type_and_flag & 0b00001111)
msg = cls(type=msg_type, flag=flag)
msg.unmarshal(data)
return msg
def marshal(self) -> bytes:
"""Serialize message to bytes"""
buffer = io.BytesIO()
header = [
(self.version << 4) | self.header_size,
(self.type << 4) | self.flag,
(self.serialization << 4) | self.compression,
]
header_size = 4 * self.header_size
if padding := header_size - len(header):
header.extend([0] * padding)
buffer.write(bytes(header))
writers = self._get_writers()
for writer in writers:
writer(buffer)
return buffer.getvalue()
def unmarshal(self, data: bytes) -> None:
"""Deserialize message from bytes"""
buffer = io.BytesIO(data)
version_and_header_size = buffer.read(1)[0]
self.version = VersionBits(version_and_header_size >> 4)
self.header_size = HeaderSizeBits(version_and_header_size & 0b00001111)
buffer.read(1)
serialization_compression = buffer.read(1)[0]
self.serialization = SerializationBits(serialization_compression >> 4)
self.compression = CompressionBits(serialization_compression & 0b00001111)
header_size = 4 * self.header_size
read_size = 3
if padding_size := header_size - read_size:
buffer.read(padding_size)
readers = self._get_readers()
for reader in readers:
reader(buffer)
remaining = buffer.read()
if remaining:
raise ValueError(f"Unexpected data after message: {remaining}")
def _get_writers(self) -> List[Callable[[io.BytesIO], None]]:
"""Get list of writer functions"""
writers = []
if self.flag == MsgTypeFlagBits.WithEvent:
writers.extend([self._write_event, self._write_session_id])
if self.type in [MsgType.FullClientRequest, MsgType.FullServerResponse, MsgType.FrontEndResultServer, MsgType.AudioOnlyClient, MsgType.AudioOnlyServer]:
if self.flag in [MsgTypeFlagBits.PositiveSeq, MsgTypeFlagBits.NegativeSeq]:
writers.append(self._write_sequence)
elif self.type == MsgType.Error:
writers.append(self._write_error_code)
else:
raise ValueError(f"Unsupported message type: {self.type}")
writers.append(self._write_payload)
return writers
def _get_readers(self) -> List[Callable[[io.BytesIO], None]]:
"""Get list of reader functions"""
readers = []
if self.type in [MsgType.FullClientRequest, MsgType.FullServerResponse, MsgType.FrontEndResultServer, MsgType.AudioOnlyClient, MsgType.AudioOnlyServer]:
if self.flag in [MsgTypeFlagBits.PositiveSeq, MsgTypeFlagBits.NegativeSeq]:
readers.append(self._read_sequence)
elif self.type == MsgType.Error:
readers.append(self._read_error_code)
if self.flag == MsgTypeFlagBits.WithEvent:
readers.extend([self._read_event, self._read_session_id, self._read_connect_id])
readers.append(self._read_payload)
return readers
def _write_event(self, buffer: io.BytesIO) -> None:
buffer.write(struct.pack(">i", self.event))
def _write_session_id(self, buffer: io.BytesIO) -> None:
if self.event in [EventType.StartConnection, EventType.FinishConnection, EventType.ConnectionStarted, EventType.ConnectionFailed]:
return
session_id_bytes = self.session_id.encode("utf-8")
size = len(session_id_bytes)
if size > 0xFFFFFFFF:
raise ValueError(f"Session ID size ({size}) exceeds max(uint32)")
buffer.write(struct.pack(">I", size))
if size > 0:
buffer.write(session_id_bytes)
def _write_sequence(self, buffer: io.BytesIO) -> None:
buffer.write(struct.pack(">i", self.sequence))
def _write_error_code(self, buffer: io.BytesIO) -> None:
buffer.write(struct.pack(">I", self.error_code))
def _write_payload(self, buffer: io.BytesIO) -> None:
size = len(self.payload)
if size > 0xFFFFFFFF:
raise ValueError(f"Payload size ({size}) exceeds max(uint32)")
buffer.write(struct.pack(">I", size))
buffer.write(self.payload)
def _read_event(self, buffer: io.BytesIO) -> None:
event_bytes = buffer.read(4)
if event_bytes:
self.event = EventType(struct.unpack(">i", event_bytes)[0])
def _read_session_id(self, buffer: io.BytesIO) -> None:
if self.event in [EventType.StartConnection, EventType.FinishConnection, EventType.ConnectionStarted, EventType.ConnectionFailed, EventType.ConnectionFinished]:
return
size_bytes = buffer.read(4)
if size_bytes:
size = struct.unpack(">I", size_bytes)[0]
if size > 0:
session_id_bytes = buffer.read(size)
if len(session_id_bytes) == size:
self.session_id = session_id_bytes.decode("utf-8")
def _read_connect_id(self, buffer: io.BytesIO) -> None:
if self.event in [EventType.ConnectionStarted, EventType.ConnectionFailed, EventType.ConnectionFinished]:
size_bytes = buffer.read(4)
if size_bytes:
size = struct.unpack(">I", size_bytes)[0]
if size > 0:
self.connect_id = buffer.read(size).decode("utf-8")
def _read_sequence(self, buffer: io.BytesIO) -> None:
sequence_bytes = buffer.read(4)
if sequence_bytes:
self.sequence = struct.unpack(">i", sequence_bytes)[0]
def _read_error_code(self, buffer: io.BytesIO) -> None:
error_code_bytes = buffer.read(4)
if error_code_bytes:
self.error_code = struct.unpack(">I", error_code_bytes)[0]
def _read_payload(self, buffer: io.BytesIO) -> None:
size_bytes = buffer.read(4)
if size_bytes:
size = struct.unpack(">I", size_bytes)[0]
if size > 0:
self.payload = buffer.read(size)
async def receive_message(websocket: websockets.WebSocketClientProtocol) -> Message:
"""Receive message from websocket"""
try:
data = await websocket.recv()
if isinstance(data, str):
raise ValueError(f"Unexpected text message: {data}")
elif isinstance(data, bytes):
msg = Message.from_bytes(data)
# logger.debug(f"Received: {msg}")
return msg
else:
raise ValueError(f"Unexpected message type: {type(data)}")
except Exception as e:
logger.error(f"Failed to receive message: {e}")
raise
async def wait_for_event(websocket: websockets.WebSocketClientProtocol, msg_type: MsgType, event_type: EventType) -> Message:
"""Wait for specific event"""
msg = await receive_message(websocket)
if msg.type != msg_type or msg.event != event_type:
raise ValueError(f"Unexpected message: {msg}")
return msg
async def start_connection(websocket: websockets.WebSocketClientProtocol) -> None:
"""Start connection"""
msg = Message(type=MsgType.FullClientRequest, flag=MsgTypeFlagBits.WithEvent)
msg.event = EventType.StartConnection
msg.payload = b"{}"
logger.debug(f"Sending: {msg}")
await websocket.send(msg.marshal())
async def finish_connection(websocket: websockets.WebSocketClientProtocol) -> None:
"""Finish connection"""
msg = Message(type=MsgType.FullClientRequest, flag=MsgTypeFlagBits.WithEvent)
msg.event = EventType.FinishConnection
msg.payload = b"{}"
logger.debug(f"Sending: {msg}")
await websocket.send(msg.marshal())
async def start_session(websocket: websockets.WebSocketClientProtocol, payload: bytes, session_id: str) -> None:
"""Start session"""
msg = Message(type=MsgType.FullClientRequest, flag=MsgTypeFlagBits.WithEvent)
msg.event = EventType.StartSession
msg.session_id = session_id
msg.payload = payload
logger.debug(f"Sending: {msg}")
await websocket.send(msg.marshal())
async def finish_session(websocket: websockets.WebSocketClientProtocol, session_id: str) -> None:
"""Finish session"""
msg = Message(type=MsgType.FullClientRequest, flag=MsgTypeFlagBits.WithEvent)
msg.event = EventType.FinishSession
msg.session_id = session_id
msg.payload = b"{}"
logger.debug(f"Sending: {msg}")
await websocket.send(msg.marshal())
class PodcastRoundPostProcessor:
def __init__(self, session_id, data_manager):
self.session_id = session_id
self.data_manager = data_manager
self.temp_merged_audio_name = "merged_audio.mp3"
self.output_merged_audio_name = f"{session_id}-merged_audio.mp3"
self.subtitle_timestamps = []  # subtitle timestamps
self.current_audio_duration = 0.0  # current merged-audio duration
self.merged_audio = None  # merged audio object
self.merged_audio_bytes = None
async def init(self):
if self.data_manager:
await self.data_manager.create_podcast_temp_session_dir(self.session_id)
async def postprocess_round(self, current_round, voice, audio, podcast_texts):
text = ""
if podcast_texts:
text = podcast_texts[-1].get("text", "")
logger.debug(f"Processing round: {current_round}, voice: {voice}, text: {text}, audio: {len(audio)} bytes")
new_segment = AudioSegment.from_mp3(io.BytesIO(bytes(audio)))
round_duration = len(new_segment) / 1000.0
if self.merged_audio is None:
self.merged_audio = new_segment
else:
self.merged_audio = self.merged_audio + new_segment
# Save the merged audio to a temp file (for real-time access by the frontend)
merged_io = io.BytesIO()
self.merged_audio.export(merged_io, format="mp3")
self.merged_audio_bytes = merged_io.getvalue()
if self.data_manager:
await self.data_manager.save_podcast_temp_session_file(self.session_id, self.temp_merged_audio_name, self.merged_audio_bytes)
merged_file_size = len(self.merged_audio_bytes)
# Record the subtitle timestamp
self.subtitle_timestamps.append(
{
"start": self.current_audio_duration,
"end": self.current_audio_duration + round_duration,
"text": text,
"speaker": voice,
}
)
self.current_audio_duration += round_duration
logger.debug(f"Merged audio updated: {merged_file_size} bytes, duration: {self.current_audio_duration:.2f}s")
return {
"url": f"/api/v1/podcast/audio?session_id={self.session_id}&filename={self.temp_merged_audio_name}",
"size": merged_file_size,
"duration": self.current_audio_duration,
"round": current_round,
"text": text,
"speaker": voice,
}
async def postprocess_final(self):
if self.data_manager:
await self.data_manager.save_podcast_output_file(self.output_merged_audio_name, self.merged_audio_bytes)
return {
"subtitles": self.subtitle_timestamps,
"audio_name": self.output_merged_audio_name,
}
async def cleanup(self):
if self.data_manager:
await self.data_manager.clear_podcast_temp_session_dir(self.session_id)
self.data_manager = None
class VolcEnginePodcastClient:
"""
VolcEngine podcast client
Supports several podcast types:
- action=0: text to podcast
- action=3: NLP texts to podcast
- action=4: prompt-generated podcast
"""
def __init__(self):
self.endpoint = "wss://openspeech.bytedance.com/api/v3/sami/podcasttts"
self.appid = os.getenv("VOLCENGINE_PODCAST_APPID")
self.access_token = os.getenv("VOLCENGINE_PODCAST_ACCESS_TOKEN")
self.app_key = "aGjiRDfUWi"
self.proxy = os.getenv("HTTPS_PROXY", None)
if self.proxy:
logger.info(f"volcengine podcast use proxy: {self.proxy}")
async def podcast_request(
self,
session_id: str,
data_manager=None,
text: str = "",
input_url: str = "",
prompt_text: str = "",
nlp_texts: str = "",
action: int = 0,
resource_id: str = "volc.service_type.10050",
encoding: str = "mp3",
input_id: str = "test_podcast",
speaker_info: str = '{"random_order":false}',
use_head_music: bool = False,
use_tail_music: bool = False,
only_nlp_text: bool = False,
return_audio_url: bool = False,
skip_round_audio_save: bool = False,
on_round_complete: Optional[Callable] = None,
):
"""
Execute a podcast request
Args:
session_id: Session ID used for temporary file management
data_manager: Data manager used to persist per-round and final audio
text: Input text (used when action=0)
input_url: Web URL or file URL (used when action=0)
prompt_text: Prompt text (required when action=4)
nlp_texts: NLP texts (required when action=3)
action: Podcast type (0/3/4)
resource_id: Audio resource ID
encoding: Audio format (mp3/wav)
input_id: Unique input identifier
speaker_info: Podcast speaker info
use_head_music: Whether to add opening music
use_tail_music: Whether to add closing music
only_nlp_text: Whether to return only the podcast text
return_audio_url: Whether to return a downloadable audio URL
skip_round_audio_save: Whether to skip saving per-round audio
on_round_complete: Callback invoked after each round completes
"""
if not self.appid or not self.access_token:
logger.error("APP ID or Access Key is required")
return None
headers = {
"X-Api-App-Id": self.appid,
"X-Api-App-Key": self.app_key,
"X-Api-Access-Key": self.access_token,
"X-Api-Resource-Id": resource_id,
"X-Api-Connect-Id": str(uuid.uuid4()),
}
is_podcast_round_end = True
audio_received = False
last_round_id = -1
task_id = ""
websocket = None
retry_num = 5
audio = bytearray()
voice = ""
current_round = 0
podcast_texts = []
post_processor = PodcastRoundPostProcessor(session_id, data_manager)
await post_processor.init()
try:
while retry_num > 0:
# Establish the WebSocket connection
websocket = await websockets.connect(self.endpoint, additional_headers=headers)
logger.debug(f"WebSocket connected: {websocket.response.headers}")
# Build the request parameters
if input_url:
req_params = {
"input_id": input_id,
"nlp_texts": json.loads(nlp_texts) if nlp_texts else None,
"prompt_text": prompt_text,
"action": action,
"use_head_music": use_head_music,
"use_tail_music": use_tail_music,
"input_info": {
"input_url": input_url,
"return_audio_url": return_audio_url,
"only_nlp_text": only_nlp_text,
},
"speaker_info": json.loads(speaker_info) if speaker_info else None,
"audio_config": {"format": encoding, "sample_rate": 24000, "speech_rate": 0},
}
else:
req_params = {
"input_id": input_id,
"input_text": text,
"nlp_texts": json.loads(nlp_texts) if nlp_texts else None,
"prompt_text": prompt_text,
"action": action,
"use_head_music": use_head_music,
"use_tail_music": use_tail_music,
"input_info": {
"input_url": input_url,
"return_audio_url": return_audio_url,
"only_nlp_text": only_nlp_text,
},
"speaker_info": json.loads(speaker_info) if speaker_info else None,
"audio_config": {"format": encoding, "sample_rate": 24000, "speech_rate": 0},
}
logger.debug(f"Request params: {json.dumps(req_params, indent=2, ensure_ascii=False)}")
if not is_podcast_round_end:
req_params["retry_info"] = {"retry_task_id": task_id, "last_finished_round_id": last_round_id}
# Start connection
await start_connection(websocket)
await wait_for_event(websocket, MsgType.FullServerResponse, EventType.ConnectionStarted)
session_id = str(uuid.uuid4())
if not task_id:
task_id = session_id
# Start session
await start_session(websocket, json.dumps(req_params).encode(), session_id)
await wait_for_event(websocket, MsgType.FullServerResponse, EventType.SessionStarted)
# Finish session
await finish_session(websocket, session_id)
while True:
msg = await receive_message(websocket)
# Audio data chunk
if msg.type == MsgType.AudioOnlyServer and msg.event == EventType.PodcastRoundResponse:
if not audio_received and audio:
audio_received = True
audio.extend(msg.payload)
# Error message
elif msg.type == MsgType.Error:
raise RuntimeError(f"Server error: {msg.payload.decode()}")
elif msg.type == MsgType.FullServerResponse:
# Podcast round started
if msg.event == EventType.PodcastRoundStart:
data = json.loads(msg.payload.decode())
if data.get("text"):
filtered_payload = {"text": data.get("text"), "speaker": data.get("speaker")}
podcast_texts.append(filtered_payload)
voice = data.get("speaker")
current_round = data.get("round_id")
if current_round == -1:
voice = "head_music"
if current_round == 9999:
voice = "tail_music"
is_podcast_round_end = False
logger.debug(f"New round started: {data}")
# Podcast round ended
if msg.event == EventType.PodcastRoundEnd:
data = json.loads(msg.payload.decode())
logger.debug(f"Podcast round end: {data}")
if data.get("is_error"):
break
is_podcast_round_end = True
last_round_id = current_round
if audio:
round_info = await post_processor.postprocess_round(current_round, voice, audio, podcast_texts)
if on_round_complete:
await on_round_complete(round_info)
audio.clear()
# Podcast finished
if msg.event == EventType.PodcastEnd:
data = json.loads(msg.payload.decode())
logger.info(f"Podcast end: {data}")
# Session finished
if msg.event == EventType.SessionFinished:
break
if not audio_received and not only_nlp_text:
raise RuntimeError("No audio data received")
# Close the connection
await finish_connection(websocket)
await wait_for_event(websocket, MsgType.FullServerResponse, EventType.ConnectionFinished)
# Podcast finished; save the final audio file
if is_podcast_round_end:
podcast_info = await post_processor.postprocess_final()
return podcast_info
else:
logger.error(f"Current podcast not finished, resuming from round {last_round_id}")
retry_num -= 1
await asyncio.sleep(1)
if websocket:
await websocket.close()
finally:
await post_processor.cleanup()
if websocket:
await websocket.close()
return None
async def test(args):
"""
Podcast test helper
Args:
args: dict containing all podcast parameters
"""
client = VolcEnginePodcastClient()
# Default parameters
params = {
"text": "",
"input_url": "https://zhuanlan.zhihu.com/p/607822576",
"prompt_text": "",
"nlp_texts": "",
"action": 0,
"resource_id": "volc.service_type.10050",
"encoding": "mp3",
"input_id": "test_podcast",
"speaker_info": '{"random_order":false}',
"use_head_music": False,
"use_tail_music": False,
"only_nlp_text": False,
"return_audio_url": True,
"skip_round_audio_save": False,
"output_dir": "output",
}
# Override defaults with user-provided args
if args:
params.update(args)
await client.podcast_request(**params)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--text", default="", help="Input text Use when action in [0]")
parser.add_argument("--input_url", default="", help="Web url or file url Use when action in [0]")
parser.add_argument("--prompt_text", default="", help="Input Prompt Text must not empty when action in [4]")
parser.add_argument("--nlp_texts", default="", help="Input NLP Texts must not empty when action in [3]")
parser.add_argument("--resource_id", default="volc.service_type.10050", help="Audio Resource ID")
parser.add_argument("--encoding", default="mp3", choices=["mp3", "wav"], help="Audio format")
parser.add_argument("--input_id", default="test_podcast", help="Unique input identifier")
parser.add_argument("--speaker_info", default='{"random_order":false}', help="Podcast Speaker Info")
parser.add_argument("--use_head_music", default=False, action="store_true", help="Enable head music")
parser.add_argument("--use_tail_music", default=False, action="store_true", help="Enable tail music")
parser.add_argument("--only_nlp_text", default=False, action="store_true", help="Enable only podcast text when action in [0, 4]")
parser.add_argument("--return_audio_url", default=False, action="store_true", help="Enable return audio url that can download")
parser.add_argument("--action", default=0, type=int, choices=[0, 3, 4], help="different podcast type")
parser.add_argument("--skip_round_audio_save", default=False, action="store_true", help="skip round audio save")
parser.add_argument("--output_dir", default="output", help="Output directory")
args = parser.parse_args()
kwargs = {k: v for k, v in vars(args).items() if v is not None and not (isinstance(v, bool) and not v)}
asyncio.run(test(kwargs))
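# --- Usage sketch (not part of the commit; the input text is an assumption) ---
# A minimal example of streaming rounds through the on_round_complete callback, assuming
# VOLCENGINE_PODCAST_APPID / VOLCENGINE_PODCAST_ACCESS_TOKEN are set; with data_manager=None
# nothing is persisted, so the merged audio only lives in memory:
#
#   async def main():
#       client = VolcEnginePodcastClient()
#
#       async def on_round(round_info):
#           print(f"round {round_info['round']} ({round_info['speaker']}): "
#                 f"{round_info['duration']:.1f}s merged so far")
#
#       info = await client.podcast_request(
#           session_id=str(uuid.uuid4()),
#           data_manager=None,
#           text="A short chat about open-source video generation.",
#           action=0,
#           on_round_complete=on_round,
#       )
#       print(info)  # {"subtitles": [...], "audio_name": "<session>-merged_audio.mp3"} on success
#
#   asyncio.run(main())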
......@@ -69,6 +69,8 @@ def class_try_catch_async(func):
def data_name(x, task_id):
if x == "input_image":
x = x + ".png"
elif x == "input_video":
x = x + ".mp4"
elif x == "output_video":
x = x + ".mp4"
return f"{task_id}-{x}"
......@@ -165,7 +167,14 @@ async def preload_data(inp, inp_type, typ, val):
timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
data = await fetch_resource(val, timeout=timeout)
elif typ == "base64":
data = base64.b64decode(val)
# Decode base64 in background thread to avoid blocking event loop
data = await asyncio.to_thread(base64.b64decode, val)
# For multi-person audio directory, val should be a dict with file structure
elif typ == "directory":
data = {}
for fname, b64_data in val.items():
data[fname] = await asyncio.to_thread(base64.b64decode, b64_data)
return {"type": "directory", "data": data}
elif typ == "stream":
# no bytes data need to be saved by data_manager
data = None
......@@ -176,8 +185,13 @@ async def preload_data(inp, inp_type, typ, val):
if inp_type == "IMAGE":
data = await asyncio.to_thread(format_image_data, data)
elif inp_type == "AUDIO":
if typ != "stream":
if typ != "stream" and typ != "directory":
data = await asyncio.to_thread(format_audio_data, data)
elif inp_type == "VIDEO":
# Video data doesn't need special formatting, just validate it's not empty
if len(data) == 0:
raise ValueError("Video file is empty")
logger.info(f"load video: {len(data)} bytes")
else:
raise Exception(f"cannot parse inp_type={inp_type} data")
return data
......@@ -191,7 +205,15 @@ async def load_inputs(params, raw_inputs, types):
for inp in raw_inputs:
item = params.pop(inp)
bytes_data = await preload_data(inp, types[inp], item["type"], item["data"])
if bytes_data is not None:
# Handle multi-person audio directory
if bytes_data is not None and isinstance(bytes_data, dict) and bytes_data.get("type") == "directory":
fs = []
for fname, fdata in bytes_data["data"].items():
inputs_data[f"{inp}/{fname}"] = fdata
fs.append(f"{inp}/{fname}")
params["extra_inputs"] = {inp: fs}
elif bytes_data is not None:
inputs_data[inp] = bytes_data
else:
params[inp] = item
......@@ -202,11 +224,15 @@ def check_params(params, raw_inputs, raw_outputs, types):
stream_audio = os.getenv("STREAM_AUDIO", "0") == "1"
stream_video = os.getenv("STREAM_VIDEO", "0") == "1"
for x in raw_inputs + raw_outputs:
if x in params and "type" in params[x] and params[x]["type"] == "stream":
if types[x] == "AUDIO":
assert stream_audio, "stream audio is not supported, please set env STREAM_AUDIO=1"
elif types[x] == "VIDEO":
assert stream_video, "stream video is not supported, please set env STREAM_VIDEO=1"
if x in params and "type" in params[x]:
if params[x]["type"] == "stream":
if types[x] == "AUDIO":
assert stream_audio, "stream audio is not supported, please set env STREAM_AUDIO=1"
elif types[x] == "VIDEO":
assert stream_video, "stream video is not supported, please set env STREAM_VIDEO=1"
elif params[x]["type"] == "directory":
# Multi-person audio directory is only supported for AUDIO type
assert types[x] == "AUDIO", f"directory type is only supported for AUDIO input, got {types[x]}"
if __name__ == "__main__":
......
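
For reference, a minimal sketch of the directory-typed input handled by the hunks above (assuming the input is named input_audio as in the seko_talk pipeline config; only the type/data shape and the "<input_name>/<file_name>" storage scheme follow from preload_data and load_inputs): a multi-person audio input is submitted as a mapping of file names to base64 strings, and load_inputs records the resulting file list under params["extra_inputs"]:

    "input_audio": {
        "type": "directory",
        "data": {
            "speaker_0.wav": "<base64 audio>",
            "speaker_1.wav": "<base64 audio>"
        }
    }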
......@@ -22,8 +22,8 @@ class VolcEngineTTSClient:
def __init__(self, voices_list_file=None):
self.url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
self.appid = os.getenv("VOLCENGINE_APPID")
self.access_token = os.getenv("VOLCENGINE_ACCESS_TOKEN")
self.appid = os.getenv("VOLCENGINE_TTS_APPID")
self.access_token = os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN")
self.proxy = os.getenv("HTTPS_PROXY", None)
if self.proxy:
logger.info(f"volcengine tts use proxy: {self.proxy}")
......
......@@ -14,6 +14,8 @@ class BaseDataManager:
self.template_audios_dir = None
self.template_videos_dir = None
self.template_tasks_dir = None
self.podcast_temp_session_dir = None
self.podcast_output_dir = None
async def init(self):
pass
......@@ -188,7 +190,8 @@ class BaseDataManager:
template_dir = self.get_template_dir(template_type)
if template_dir is None:
return None
return await self.save_bytes(bytes_data, None, abs_path=os.path.join(template_dir, filename))
abs_path = os.path.join(template_dir, filename)
return await self.save_bytes(bytes_data, None, abs_path=abs_path)
@class_try_catch_async
async def presign_template_url(self, template_type, filename):
......@@ -197,6 +200,46 @@ class BaseDataManager:
return None
return await self.presign_url(None, abs_path=os.path.join(template_dir, filename))
@class_try_catch_async
async def list_podcast_temp_session_files(self, session_id):
session_dir = os.path.join(self.podcast_temp_session_dir, session_id)
return await self.list_files(base_dir=session_dir)
@class_try_catch_async
async def save_podcast_temp_session_file(self, session_id, filename, bytes_data):
fpath = os.path.join(self.podcast_temp_session_dir, session_id, filename)
await self.save_bytes(bytes_data, None, abs_path=fpath)
@class_try_catch_async
async def load_podcast_temp_session_file(self, session_id, filename):
fpath = os.path.join(self.podcast_temp_session_dir, session_id, filename)
return await self.load_bytes(None, abs_path=fpath)
@class_try_catch_async
async def delete_podcast_temp_session_file(self, session_id, filename):
fpath = os.path.join(self.podcast_temp_session_dir, session_id, filename)
return await self.delete_bytes(None, abs_path=fpath)
@class_try_catch_async
async def save_podcast_output_file(self, filename, bytes_data):
fpath = os.path.join(self.podcast_output_dir, filename)
await self.save_bytes(bytes_data, None, abs_path=fpath)
@class_try_catch_async
async def load_podcast_output_file(self, filename):
fpath = os.path.join(self.podcast_output_dir, filename)
return await self.load_bytes(None, abs_path=fpath)
@class_try_catch_async
async def delete_podcast_output_file(self, filename):
fpath = os.path.join(self.podcast_output_dir, filename)
return await self.delete_bytes(None, abs_path=fpath)
@class_try_catch_async
async def presign_podcast_output_url(self, filename):
fpath = os.path.join(self.podcast_output_dir, filename)
return await self.presign_url(None, abs_path=fpath)
# Import data manager implementations
from .local_data_manager import LocalDataManager # noqa
......
import asyncio
import os
import shutil
from loguru import logger
......@@ -24,6 +25,12 @@ class LocalDataManager(BaseDataManager):
assert os.path.exists(self.template_videos_dir), f"{self.template_videos_dir} not exists!"
assert os.path.exists(self.template_tasks_dir), f"{self.template_tasks_dir} not exists!"
# podcast temp session dir and output dir
self.podcast_temp_session_dir = os.path.join(self.local_dir, "podcast_temp_session")
self.podcast_output_dir = os.path.join(self.local_dir, "podcast_output")
os.makedirs(self.podcast_temp_session_dir, exist_ok=True)
os.makedirs(self.podcast_output_dir, exist_ok=True)
@class_try_catch_async
async def save_bytes(self, bytes_data, filename, abs_path=None):
out_path = self.fmt_path(self.local_dir, filename, abs_path)
......@@ -54,6 +61,20 @@ class LocalDataManager(BaseDataManager):
prefix = base_dir if base_dir else self.local_dir
return os.listdir(prefix)
@class_try_catch_async
async def create_podcast_temp_session_dir(self, session_id):
dir_path = os.path.join(self.podcast_temp_session_dir, session_id)
os.makedirs(dir_path, exist_ok=True)
return dir_path
@class_try_catch_async
async def clear_podcast_temp_session_dir(self, session_id):
session_dir = os.path.join(self.podcast_temp_session_dir, session_id)
if os.path.isdir(session_dir):
shutil.rmtree(session_dir)
logger.info(f"cleared podcast temp session dir {session_dir}")
return True
async def test():
import torch
......
......@@ -38,6 +38,10 @@ class S3DataManager(BaseDataManager):
self.template_videos_dir = os.path.join(template_dir, "videos")
self.template_tasks_dir = os.path.join(template_dir, "tasks")
# podcast temp session dir and output dir
self.podcast_temp_session_dir = os.path.join(self.base_path, "podcast_temp_session")
self.podcast_output_dir = os.path.join(self.base_path, "podcast_output")
async def init_presign_client(self):
# init tos client for volces.com
if "volces.com" in self.endpoint_url:
......@@ -128,12 +132,42 @@ class S3DataManager(BaseDataManager):
@class_try_catch_async
async def list_files(self, base_dir=None):
prefix = base_dir if base_dir else self.base_path
response = await self.s3_client.list_objects_v2(Bucket=self.bucket_name, Prefix=prefix)
if base_dir:
prefix = self.fmt_path(self.base_path, None, abs_path=base_dir)
else:
prefix = self.base_path
prefix = prefix + "/" if not prefix.endswith("/") else prefix
# Handle pagination for S3 list_objects_v2
files = []
if "Contents" in response:
for obj in response["Contents"]:
files.append(obj["Key"].replace(prefix + "/", ""))
continuation_token = None
page = 1
while True:
list_kwargs = {"Bucket": self.bucket_name, "Prefix": prefix, "MaxKeys": 1000}
if continuation_token:
list_kwargs["ContinuationToken"] = continuation_token
response = await self.s3_client.list_objects_v2(**list_kwargs)
if "Contents" in response:
page_files = []
for obj in response["Contents"]:
# Remove the prefix from the key to get just the filename
key = obj["Key"]
if key.startswith(prefix):
filename = key[len(prefix) :]
if filename: # Skip empty filenames (the directory itself)
page_files.append(filename)
files.extend(page_files)
else:
logger.warning(f"[S3DataManager.list_files] Page {page}: No files found in this page.")
# Check if there are more pages
if response.get("IsTruncated", False):
continuation_token = response.get("NextContinuationToken")
page += 1
else:
break
return files
@class_try_catch_async
......@@ -149,6 +183,18 @@ class S3DataManager(BaseDataManager):
else:
return None
@class_try_catch_async
async def create_podcast_temp_session_dir(self, session_id):
pass
@class_try_catch_async
async def clear_podcast_temp_session_dir(self, session_id):
session_dir = os.path.join(self.podcast_temp_session_dir, session_id)
fs = await self.list_files(base_dir=session_dir)
logger.info(f"clear podcast temp session dir {session_dir} with files: {fs}")
for f in fs:
await self.delete_bytes(f, abs_path=os.path.join(session_dir, f))
async def test():
import torch
......
import argparse
import asyncio
import base64
import json
import mimetypes
import os
import re
import tempfile
import traceback
import uuid
from contextlib import asynccontextmanager
import uvicorn
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi import Depends, FastAPI, HTTPException, Request, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, Response
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
......@@ -17,8 +19,11 @@ from fastapi.staticfiles import StaticFiles
from loguru import logger
from pydantic import BaseModel
from lightx2v.deploy.common.audio_separator import AudioSeparator
from lightx2v.deploy.common.face_detector import FaceDetector
from lightx2v.deploy.common.pipeline import Pipeline
from lightx2v.deploy.common.utils import check_params, data_name, load_inputs
from lightx2v.deploy.common.podcasts import VolcEnginePodcastClient
from lightx2v.deploy.common.utils import check_params, data_name, fetch_resource, format_image_data, load_inputs
from lightx2v.deploy.common.volcengine_tts import VolcEngineTTSClient
from lightx2v.deploy.data_manager import LocalDataManager, S3DataManager
from lightx2v.deploy.queue_manager import LocalQueueManager, RabbitMQQueueManager
......@@ -62,6 +67,9 @@ server_monitor = None
auth_manager = None
metrics_monitor = MetricMonitor()
volcengine_tts_client = None
volcengine_podcast_client = None
face_detector = None
audio_separator = None
@asynccontextmanager
......@@ -345,7 +353,7 @@ async def api_v1_task_submit(request: Request, user=Depends(verify_user_access))
# process multimodal inputs data
inputs_data = await load_inputs(params, inputs, types)
# init task
# init task (we need task_id before preprocessing to save processed files)
task_id = await task_manager.create_task(keys, workers, params, inputs, outputs, user["user_id"])
logger.info(f"Submit task: {task_id} {params}")
......@@ -450,14 +458,26 @@ async def api_v1_task_input_url(request: Request, user=Depends(verify_user_acces
try:
name = request.query_params["name"]
task_id = request.query_params["task_id"]
filename = request.query_params.get("filename", None)
task = await task_manager.query_task(task_id, user_id=user["user_id"])
assert task is not None, f"Task {task_id} not found"
assert name in task["inputs"], f"Input {name} not found in task {task_id}"
assert name not in task["params"], f"Input {name} is a stream"
if name in task["params"]:
return error_response(f"Input {name} is a stream", 400)
# eg, multi person audio directory input
if filename is not None:
extra_inputs = task["params"]["extra_inputs"][name]
name = f"{name}/{filename}"
assert name in task["inputs"], f"Extra input {name} not found in task {task_id}"
assert name in extra_inputs, f"Filename {filename} not found in extra inputs"
url = await data_manager.presign_url(task["inputs"][name])
if url is None:
url = f"./assets/task/input?task_id={task_id}&name={name}"
if filename is not None:
url += f"&filename={filename}"
return {"url": url}
except Exception as e:
......@@ -493,10 +513,20 @@ async def assets_task_input(request: Request, user=Depends(verify_user_access_fr
try:
name = request.query_params["name"]
task_id = request.query_params["task_id"]
filename = request.query_params.get("filename", None)
task = await task_manager.query_task(task_id, user_id=user["user_id"])
assert task is not None, f"Task {task_id} not found"
assert name in task["inputs"], f"Input {name} not found in task {task_id}"
assert name not in task["params"], f"Input {name} is a stream"
if name in task["params"]:
return error_response(f"Input {name} is a stream", 400)
# eg, multi person audio directory input
if filename is not None:
extra_inputs = task["params"]["extra_inputs"][name]
name = f"{name}/{filename}"
assert name in task["inputs"], f"Extra input {name} not found in task {task_id}"
assert name in extra_inputs, f"Filename {filename} not found in extra inputs"
data = await data_manager.load_bytes(task["inputs"][name])
# set correct Content-Type
......@@ -770,35 +800,61 @@ async def api_v1_template_list(request: Request):
all_audios = [] if all_audios is None else all_audios
all_videos = [] if all_videos is None else all_videos
# page info
total_images = len(all_images)
total_audios = len(all_audios)
total_videos = len(all_videos)
total_pages = (max(total_images, total_audios, total_videos) + page_size - 1) // page_size
paginated_image_templates = []
paginated_audio_templates = []
paginated_video_templates = []
# Map image base name (without extension) to image info
all_images_sorted = sorted(all_images)
image_map = {}  # base name (without extension) -> {"filename": full file name, "url": URL}
for img_name in all_images_sorted:
img_name_without_ext = img_name.rsplit(".", 1)[0] if "." in img_name else img_name
url = await data_manager.presign_template_url("images", img_name)
if url is None:
url = f"./assets/template/images/{img_name}"
image_map[img_name_without_ext] = {"filename": img_name, "url": url}
# Build a map from audio filename (without extension) to audio info
all_audios_sorted = sorted(all_audios)
audio_map = {} # filename without extension -> {"filename": full filename, "url": URL}
for audio_name in all_audios_sorted:
audio_name_without_ext = audio_name.rsplit(".", 1)[0] if "." in audio_name else audio_name
url = await data_manager.presign_template_url("audios", audio_name)
if url is None:
url = f"./assets/template/audios/{audio_name}"
audio_map[audio_name_without_ext] = {"filename": audio_name, "url": url}
# Merge audio and image templates by matching filename prefixes
# Collect all unique base names (without extension)
all_base_names = set(list(image_map.keys()) + list(audio_map.keys()))
all_base_names_sorted = sorted(all_base_names)
# Build the merged template list
merged_templates = []
for base_name in all_base_names_sorted:
template_item = {
"id": base_name, # 使用基础文件名作为ID
"image": image_map.get(base_name),
"audio": audio_map.get(base_name),
}
merged_templates.append(template_item)
# Pagination
total = len(merged_templates)
total_pages = (total + page_size - 1) // page_size if total > 0 else 1
paginated_templates = []
if page <= total_pages:
start_idx = (page - 1) * page_size
end_idx = start_idx + page_size
paginated_templates = merged_templates[start_idx:end_idx]
async def handle_media(media_type, media_names, paginated_media_templates):
media_names.sort(key=lambda x: x)
for media_name in media_names[start_idx:end_idx]:
url = await data_manager.presign_template_url(media_type, media_name)
if url is None:
url = f"./assets/template/{media_type}/{media_name}"
paginated_media_templates.append({"filename": media_name, "url": url})
await handle_media("images", all_images, paginated_image_templates)
await handle_media("audios", all_audios, paginated_audio_templates)
await handle_media("videos", all_videos, paginated_video_templates)
# For backward compatibility, still return the images and audios fields (possibly empty)
# and add the new merged field
return {
"templates": {"images": paginated_image_templates, "audios": paginated_audio_templates, "videos": paginated_video_templates},
"pagination": {"page": page, "page_size": page_size, "total": max(total_images, total_audios), "total_pages": total_pages},
"templates": {
"images": [], # 保持向后兼容,但设为空
"audios": [], # 保持向后兼容,但设为空
"videos": [], # 保持向后兼容
"merged": paginated_templates, # 新的合并列表
},
"pagination": {"page": page, "page_size": page_size, "total": total, "total_pages": total_pages},
}
except Exception as e:
traceback.print_exc()
......@@ -1027,6 +1083,337 @@ async def api_v1_tts_generate(request: TTSRequest):
return JSONResponse({"error": f"TTS generation failed: {str(e)}"}, status_code=500)
@app.websocket("/api/v1/podcast/generate")
async def api_v1_podcast_generate_ws(websocket: WebSocket):
await websocket.accept()
def ws_get_user_id():
token = websocket.query_params.get("token")
if not token:
token = websocket.headers.get("authorization") or websocket.headers.get("Authorization")
if token and token.startswith("Bearer "):
token = token[7:]
payload = auth_manager.verify_jwt_token(token)
user_id = payload["user_id"]
return user_id
async def safe_send_json(payload):
try:
await websocket.send_json(payload)
except (WebSocketDisconnect, RuntimeError) as e:
logger.warning(f"WebSocket send skipped: {e}")
try:
user_id = ws_get_user_id()
data = await websocket.receive_text()
request_data = json.loads(data)
# stop request
if request_data.get("type") == "stop":
logger.info("Received stop signal from client")
await safe_send_json({"type": "stopped"})
return
# user input prompt
input_text = request_data.get("input", "")
is_url = input_text.startswith(("http://", "https://"))
if not input_text:
await safe_send_json({"error": "输入不能为空"})
return
session_id = "session_" + str(uuid.uuid4())
params = {
"session_id": session_id,
"data_manager": data_manager,
"text": "" if is_url else input_text,
"input_url": input_text if is_url else "",
"action": 0,
"use_head_music": False,
"use_tail_music": False,
"skip_round_audio_save": False,
}
logger.info(f"WebSocket generating podcast with params: {params}")
# Push each finished round to the client in real time via a callback
async def on_round_complete(round_info):
await safe_send_json({"type": "audio_update", "data": round_info})
params["on_round_complete"] = on_round_complete
# Create a task that listens for stop signals from the client
async def listen_for_stop(podcast_task):
while True:
try:
if podcast_task.done():
return
data = await asyncio.wait_for(websocket.receive_text(), timeout=0.1)
request = json.loads(data)
if request.get("type") == "stop":
logger.warning("Stop signal received during podcast generation")
podcast_task.cancel()
return
except asyncio.TimeoutError:
continue
except Exception as e:
logger.warning(f"Stop listener ended: {e}")
return
podcast_task = asyncio.create_task(volcengine_podcast_client.podcast_request(**params))
stop_listener_task = asyncio.create_task(listen_for_stop(podcast_task))
podcast_info = None
try:
podcast_info = await podcast_task
except asyncio.CancelledError:
logger.warning("Podcast generation cancelled by user")
await safe_send_json({"type": "stopped"})
return
finally:
stop_listener_task.cancel()
if podcast_info is None:
await safe_send_json({"error": "播客生成失败,请稍后重试"})
return
audio_path = podcast_info["audio_name"]
rounds = podcast_info["subtitles"]
await task_manager.create_podcast(session_id, user_id, input_text, audio_path, rounds)
audio_url = await data_manager.presign_podcast_output_url(audio_path)
if not audio_url:
audio_url = f"/api/v1/podcast/audio?session_id={session_id}&filename={audio_path}"
logger.info(f"completed podcast generation (session: {session_id})")
await safe_send_json(
{
"type": "complete",
"data": {
"audio_url": audio_url,
"subtitles": podcast_info["subtitles"],
"session_id": session_id,
"user_id": user_id,
},
}
)
except WebSocketDisconnect:
logger.info("WebSocket disconnected")
except Exception:
logger.error(f"Error in websocket: {traceback.format_exc()}")
await safe_send_json({"error": "websocket internal error, please try again later!"})
@app.get("/api/v1/podcast/audio")
async def api_v1_podcast_audio(request: Request, user=Depends(verify_user_access_from_query)):
try:
user_id = user["user_id"]
session_id = request.query_params.get("session_id")
filename = request.query_params.get("filename")
if not session_id or not filename:
return JSONResponse({"error": "session_id and filename are required"}, status_code=400)
ext = os.path.splitext(filename)[1].lower()
assert ext == ".mp3", f"Unsupported file extension: {ext}"
# Parse the Range header, format: bytes=start-end or bytes=start-
range_header = request.headers.get("Range")
start_byte, end_byte = None, None
if range_header:
match = re.match(r"bytes=(\d+)-(\d*)", range_header)
if match:
start_byte = int(match.group(1))
end_byte = int(match.group(2)) + 1 if match.group(2) else None
podcast_data = await task_manager.query_podcast(session_id, user_id)
if podcast_data:
# generation has finished and its info was saved to the database
func = data_manager.load_podcast_output_file
filename = podcast_data["audio_path"]
func_args = (filename,)
else:
func = data_manager.load_podcast_temp_session_file
func_args = (session_id, filename)
logger.debug(f"Serving audio file from {func.__name__} with args: {func_args}, start_byte: {start_byte}, end_byte: {end_byte}")
file_bytes = await func(*func_args)
file_size = len(file_bytes)
file_bytes = file_bytes[start_byte:end_byte]
content_length = len(file_bytes)
media_type = "audio/mpeg"
status_code = 200
headers = {"Content-Length": str(content_length), "Accept-Ranges": "bytes", "Content-Type": media_type, "Content-Disposition": f'attachment; filename="{filename}"'}
if start_byte is not None and start_byte > 0:
status_code = 206 # Partial Content
headers["Content-Range"] = f"bytes {start_byte}-{start_byte + content_length - 1}/{file_size}"
return Response(content=file_bytes, media_type=media_type, status_code=status_code, headers=headers)
except Exception as e:
logger.error(f"Error serving audio: {e}")
traceback.print_exc()
return JSONResponse({"error": str(e)}, status_code=500)
@app.get("/api/v1/podcast/history")
async def api_v1_podcast_history(request: Request, user=Depends(verify_user_access)):
try:
user_id = user["user_id"]
page = int(request.query_params.get("page", 1))
page_size = int(request.query_params.get("page_size", 10))
assert page > 0 and page_size > 0, "page and page_size must be greater than 0"
status = request.query_params.get("status", None) # has_audio, no_audio
query_params = {"user_id": user_id}
if status == "has_audio":
query_params["has_audio"] = True
elif status == "no_audio":
query_params["has_audio"] = False
total_tasks = await task_manager.list_podcasts(count=True, **query_params)
total_pages = (total_tasks + page_size - 1) // page_size
page_info = {"page": page, "page_size": page_size, "total": total_tasks, "total_pages": total_pages}
if page > total_pages:
return {"sessions": [], "pagination": page_info}
query_params["offset"] = (page - 1) * page_size
query_params["limit"] = page_size
sessions = await task_manager.list_podcasts(**query_params)
return {"sessions": sessions, "pagination": page_info}
except Exception as e:
logger.error(f"Error getting podcast history: {e}")
traceback.print_exc()
return {"sessions": []}
@app.get("/api/v1/podcast/session/{session_id}/audio_url")
async def api_v1_podcast_session_audio_url(session_id: str, user=Depends(verify_user_access)):
try:
user_id = user["user_id"]
podcast_data = await task_manager.query_podcast(session_id, user_id)
if not podcast_data:
return JSONResponse({"error": "Podcast session not found"}, status_code=404)
audio_path = podcast_data["audio_path"]
audio_url = await data_manager.presign_podcast_output_url(audio_path)
if not audio_url:
audio_url = f"/api/v1/podcast/audio?session_id={session_id}&filename={audio_path}"
return {"audio_url": audio_url}
except Exception as e:
logger.error(f"Error getting podcast session audio URL: {e}")
traceback.print_exc()
return JSONResponse({"error": str(e)}, status_code=500)
class FaceDetectRequest(BaseModel):
image: str # Base64 encoded image
class AudioSeparateRequest(BaseModel):
audio: str # Base64 encoded audio
num_speakers: int | None = None # Optional: number of speakers to separate
@app.post("/api/v1/face/detect")
async def api_v1_face_detect(request: FaceDetectRequest, user=Depends(verify_user_access)):
"""Detect faces in image (only detection, no cropping - cropping is done on frontend)
Supports both base64 encoded images and URLs (blob URLs, http URLs, etc.)
"""
try:
if not face_detector:
return error_response("Face detector not initialized", 500)
# Validate input
if not request.image or not request.image.strip():
logger.error("Face detection request: image is empty")
return error_response("Image input is empty", 400)
image_bytes = None
try:
# Check if input is a remote URL (http/https); data URLs and raw base64 are handled below
if request.image.startswith(("http://", "https://")):
timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
image_bytes = await fetch_resource(request.image, timeout=timeout)
logger.debug(f"Fetched image from URL for face detection: {request.image[:100]}... (size: {len(image_bytes)} bytes)")
else:
encoded = request.image
# Data URL format: "data:image/png;base64,..."
if encoded.startswith("data:image"):
_, encoded = encoded.split(",", 1)
image_bytes = base64.b64decode(encoded)
logger.debug(f"Decoded base64 image: {request.image[:100]}... (size: {len(image_bytes)} bytes)")
# Validate image format before passing to face detector
image_bytes = await asyncio.to_thread(format_image_data, image_bytes)
except Exception as e:
logger.error(f"Failed to decode base64 image: {e}, image length: {len(request.image) if request.image else 0}")
return error_response(f"Invalid image format: {str(e)}", 400)
# Detect faces only (no cropping)
result = face_detector.detect_faces(image_bytes, return_image=False)
faces_data = []
for i, face in enumerate(result["faces"]):
faces_data.append(
{
"index": i,
"bbox": face["bbox"], # [x1, y1, x2, y2] - absolute pixel coordinates in original image
"confidence": face["confidence"],
"class_id": face["class_id"],
"class_name": face["class_name"],
# Note: face_image is not included - frontend will crop it based on bbox
}
)
return {"faces": faces_data, "total": len(faces_data)}
except Exception as e:
logger.error(f"Face detection error: {traceback.format_exc()}")
return error_response(f"Face detection failed: {str(e)}", 500)
@app.post("/api/v1/audio/separate")
async def api_v1_audio_separate(request: AudioSeparateRequest, user=Depends(verify_user_access)):
"""Separate different speakers in audio"""
try:
if not audio_separator:
return error_response("Audio separator not initialized", 500)
audio_bytes = None
try:
encoded = request.audio
if encoded.startswith("data:"):
# Remove data URL prefix (e.g., "data:audio/mpeg;base64," or "data:application/octet-stream;base64,")
_, encoded = encoded.split(",", 1)
audio_bytes = await asyncio.to_thread(base64.b64decode, encoded, validate=True)
logger.debug(f"Successfully decoded base64 audio, size: {len(audio_bytes)} bytes")
except Exception as e:
logger.error(f"Failed to decode base64 audio {request.audio[:100]}..., error: {str(e)}")
return error_response(f"Invalid base64 audio data", 400)
# Separate speakers
result = audio_separator.separate_speakers(audio_bytes, num_speakers=request.num_speakers)
# Convert audio tensors to base64 strings (without saving to file)
speakers_data = []
for speaker in result["speakers"]:
# Convert audio tensor directly to base64
audio_base64 = audio_separator.speaker_audio_to_base64(speaker["audio"], speaker["sample_rate"], format="wav")
speakers_data.append(
{
"speaker_id": speaker["speaker_id"],
"audio": audio_base64, # Base64 encoded audio
"segments": speaker["segments"],
"sample_rate": speaker["sample_rate"],
}
)
return {"speakers": speakers_data, "total": len(speakers_data), "method": result.get("method", "pyannote")}
except Exception as e:
logger.error(f"Audio separation error: {traceback.format_exc()}")
return error_response(f"Audio separation failed: {str(e)}", 500)
# Fall back all unknown routes to index.html (must be registered after all API routes)
@app.get("/{full_path:path}", response_class=HTMLResponse)
async def vue_fallback(full_path: str):
......@@ -1061,11 +1448,16 @@ if __name__ == "__main__":
parser.add_argument("--volcengine_tts_list_json", type=str, default=dft_volcengine_tts_list_json)
parser.add_argument("--ip", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int, default=8080)
parser.add_argument("--face_detector_model_path", type=str, default=None)
parser.add_argument("--audio_separator_model_path", type=str, default="")
args = parser.parse_args()
logger.info(f"args: {args}")
model_pipelines = Pipeline(args.pipeline_json)
volcengine_tts_client = VolcEngineTTSClient(args.volcengine_tts_list_json)
volcengine_podcast_client = VolcEnginePodcastClient()
face_detector = FaceDetector(model_path=args.face_detector_model_path)
audio_separator = AudioSeparator(model_path=args.audio_separator_model_path)
auth_manager = AuthManager()
if args.task_url.startswith("/"):
task_manager = LocalTaskManager(args.task_url, metrics_monitor)
......
......@@ -186,7 +186,7 @@ class AuthManager:
try:
payload = jwt.decode(token, secret_key, algorithms=[self.jwt_algorithm])
token_type = payload.get("token_type")
if token_type != expected_type:
if token_type and token_type != expected_type:
raise HTTPException(status_code=401, detail="Token type mismatch")
return payload
except jwt.ExpiredSignatureError:
......
......@@ -46,12 +46,19 @@
<link rel="dns-prefetch" href="https://cdnjs.cloudflare.com">
<link rel="preload" href="/src/style.css" as="style">
<link rel="preload" href="/src/main.js" as="script" type="module">
<link href="https://cdn.bootcdn.net/ajax/libs/font-awesome/6.4.0/css/all.min.css" rel="stylesheet">
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css" rel="stylesheet" media="print" onload="this.media='all'">
<link href="https://cdn.bootcdn.net/ajax/libs/font-awesome/7.0.1/css/all.min.css" rel="stylesheet">
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/7.0.1/css/all.min.css" rel="stylesheet" media="print" onload="this.media='all'">
<link rel='stylesheet' href='https://cdn-uicons.flaticon.com/3.0.0/uicons-solid-rounded/css/uicons-solid-rounded.css'>
<link rel='stylesheet' href='https://cdn-uicons.flaticon.com/3.0.0/uicons-bold-rounded/css/uicons-bold-rounded.css'>
<link rel='stylesheet' href='https://cdn-uicons.flaticon.com/3.0.0/uicons-bold-straight/css/uicons-bold-straight.css'>
<link rel='stylesheet' href='https://cdn-uicons.flaticon.com/3.0.0/uicons-regular-rounded/css/uicons-regular-rounded.css'>
<link rel='stylesheet' href='https://cdn-uicons.flaticon.com/3.0.0/uicons-thin-rounded/css/uicons-thin-rounded.css'>
<link rel='stylesheet' href='https://cdn-uicons.flaticon.com/3.0.0/uicons-solid-straight/css/uicons-solid-straight.css'>
<link rel='stylesheet' href='https://cdn-uicons.flaticon.com/3.0.0/uicons-solid-chubby/css/uicons-solid-chubby.css'>
<link href="https://cdn.jsdelivr.net/npm/remixicon@3.5.0/fonts/remixicon.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.0/font/bootstrap-icons.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@tabler/icons-webfont@latest/tabler-icons.min.css">
<link href="/src/style.css" rel="stylesheet">
<style>
.seo-shell {
......
<script setup>
import { onMounted, onUnmounted, ref } from 'vue'
import router from './router'
import { init, handleLoginCallback, handleClickOutside, validateToken } from './utils/other'
import { initLanguage } from './utils/i18n'
......@@ -76,7 +77,7 @@ onMounted(async () => {
localStorage.removeItem('currentUser')
isLoggedIn.value = false
console.log('Token已过期')
showAlert('请重新登录', 'warning', {
showAlert(t('pleaseRelogin'), 'warning', {
label: t('login'),
onClick: login
})
......@@ -87,7 +88,7 @@ onMounted(async () => {
}
} catch (error) {
console.error('初始化失败', error)
showAlert('初始化失败,请刷新页面重试', 'danger')
showAlert(t('initFailedPleaseRefresh'), 'danger')
isLoggedIn.value = false
} finally {
loginLoading.value = false
......
......@@ -41,8 +41,10 @@ import {
s2vAudioPreview,
getCurrentImagePreview,
getCurrentAudioPreview,
getCurrentVideoPreview,
setCurrentImagePreview,
setCurrentAudioPreview,
setCurrentVideoPreview,
updateUploadedContentStatus,
availableTaskTypes,
availableModelClasses,
......@@ -91,6 +93,13 @@ import {
downloadFile,
viewFile,
handleImageUpload,
detectFacesInImage,
faceDetecting,
audioSeparating,
cropFaceImage,
updateFaceRoleName,
toggleFaceEditing,
saveFaceRoleName,
selectTask,
selectModel,
resetForm,
......@@ -98,7 +107,14 @@ import {
triggerAudioUpload,
removeImage,
removeAudio,
removeVideo,
handleAudioUpload,
handleVideoUpload,
separateAudioTracks,
updateSeparatedAudioRole,
updateSeparatedAudioName,
toggleSeparatedAudioEditing,
saveSeparatedAudioName,
loadImageAudioTemplates,
selectImageTemplate,
selectAudioTemplate,
......@@ -178,6 +194,7 @@ import {
getUserAvatarUrl,
getCurrentImagePreviewUrl,
getCurrentAudioPreviewUrl,
getCurrentVideoPreviewUrl,
handleThumbnailError,
handleImageError,
handleImageLoad,
......@@ -298,7 +315,7 @@ const props = defineProps({
}
})
const { t, locale } = useI18n()
const { t, tm, locale } = useI18n()
const route = useRoute()
const router = useRouter()
......@@ -351,6 +368,991 @@ const audioPreviewDuration = ref(0)
const audioPreviewCurrentTime = ref(0)
const audioPreviewIsDragging = ref(false)
// Playback state for the separated audios
const separatedAudioElements = ref([]) // Array of audio elements
const separatedAudioPlaying = ref({}) // { index: boolean }
const separatedAudioDuration = ref({}) // { index: number }
const separatedAudioCurrentTime = ref({}) // { index: number }
const separatedAudioIsDragging = ref({}) // { index: boolean }
// Drag-and-drop sorting state
const draggedRoleIndex = ref(-1)
const draggedAudioIndex = ref(-1)
const dragOverRoleIndex = ref(-1)
const dragOverAudioIndex = ref(-1)
const dragPreviewElement = ref(null)
const dragOffset = ref({ x: 0, y: 0 })
// Computed: faces detected in the current form
const currentDetectedFaces = computed(() => {
const form = getCurrentForm()
return form?.detectedFaces || []
})
// Computed: separated audios for the current form
const currentSeparatedAudios = computed(() => {
const form = getCurrentForm()
const audios = form?.separatedAudios || []
// Debug log
if (audios.length > 0) {
console.log('currentSeparatedAudios computed:', audios.length, 'audios')
}
return audios
})
// Computed: current audio preview
const currentAudioPreview = computed(() => {
return getCurrentAudioPreview()
})
// Track the last separation (role count and audio URL) to avoid re-separating
const lastSeparatedFaceCount = ref(0)
const lastSeparatedAudioUrl = ref('')
// Role mode: single-role / multi-role
const isMultiRoleMode = ref(false) // false = single-role mode, true = multi-role mode (defaults to single-role)
// Watch task-type changes; reset to single-role mode when switching tasks
watch(selectedTaskId, (newTaskId, oldTaskId) => {
if (newTaskId !== oldTaskId) {
// When the task type changes, reset to single-role mode
isMultiRoleMode.value = false
// Reset separation records
lastSeparatedFaceCount.value = 0
lastSeparatedAudioUrl.value = ''
}
})
// Watch detectedFaces and the audio preview together: when both exist and there is more than one role, separate the audio automatically
// This covers all scenarios: uploading audio, applying history audio, using an audio template, reusing a task, etc.
watch([currentDetectedFaces, currentAudioPreview, selectedTaskId], ([newFaces, audioUrl, taskType], [oldFaces, oldAudioUrl, oldTaskType]) => {
// 只在 s2v 任务下处理
if (taskType !== 's2v') {
// 如果不是 s2v 任务,清空分离记录
if (oldTaskType === 's2v') {
lastSeparatedFaceCount.value = 0
lastSeparatedAudioUrl.value = ''
}
return
}
const faceCount = newFaces?.length || 0
const oldFaceCount = oldFaces?.length || 0
// 如果角色个数 <= 1清空分离的音频
if (faceCount <= 1) {
if (oldFaceCount > 1) {
const form = getCurrentForm()
if (form) {
form.separatedAudios = []
}
lastSeparatedFaceCount.value = 0
lastSeparatedAudioUrl.value = ''
}
return
}
// 如果角色个数 > 1 且有音频预览
if (faceCount > 1 && audioUrl) {
// 检查是否需要分离(避免重复分离)
const needsSeparation =
faceCount !== lastSeparatedFaceCount.value ||
audioUrl !== lastSeparatedAudioUrl.value
if (needsSeparation) {
console.log(`[自动音频分离] 检测到 ${faceCount} 个角色且有音频,开始分离...`, {
faceCount,
audioUrl: audioUrl.substring(0, 50) + '...',
lastSeparatedFaceCount: lastSeparatedFaceCount.value,
lastSeparatedAudioUrl: lastSeparatedAudioUrl.value?.substring(0, 50) + '...'
})
separateAudioTracks(audioUrl, faceCount)
.then(() => {
// 分离成功,更新记录
lastSeparatedFaceCount.value = faceCount
lastSeparatedAudioUrl.value = audioUrl
console.log(`[自动音频分离] 分离成功,角色个数: ${faceCount}`)
})
.catch(error => {
console.error('[自动音频分离] 分离失败:', error)
// 分离失败,不清空记录,允许重试
})
} else {
console.log(`[自动音频分离] 跳过重复分离,角色个数: ${faceCount},音频未变化`)
}
} else if (faceCount > 1 && !audioUrl) {
// 有多个角色但没有音频,清空分离的音频
const form = getCurrentForm()
if (form && form.separatedAudios && form.separatedAudios.length > 0) {
form.separatedAudios = []
lastSeparatedFaceCount.value = 0
lastSeparatedAudioUrl.value = ''
}
}
}, { immediate: true })
// Manually toggle the role mode
const toggleRoleMode = async () => {
if (selectedTaskId.value !== 's2v') return
const form = getCurrentForm()
if (!form) return
const newMode = !isMultiRoleMode.value
if (newMode) {
// 切换到多角色模式
isMultiRoleMode.value = true
// 如果还没有检测到角色,调用角色识别功能
if (!form.detectedFaces || form.detectedFaces.length === 0) {
const imageUrl = getCurrentImagePreviewUrl()
if (imageUrl) {
try {
faceDetecting.value = true
await detectFacesInImage(imageUrl)
} catch (error) {
console.error('Face detection failed:', error)
showAlert(t('faceDetectionFailed') + ': ' + (error.message || t('unknownError')), 'error')
} finally {
faceDetecting.value = false
}
} else {
showAlert(t('pleaseUploadImage'), 'warning')
isMultiRoleMode.value = false
return
}
}
// 如果检测后仍然只有0个或1个角色,提示用户
if (!form.detectedFaces || form.detectedFaces.length <= 1) {
showAlert(t('multiRoleModeRequires'), 'info')
return
}
// 如果有音频,自动分离
if (form.audioFile && getCurrentAudioPreview()) {
const audioDataUrl = getCurrentAudioPreview()
if (audioDataUrl && form.detectedFaces && form.detectedFaces.length > 1) {
try {
await separateAudioTracks(audioDataUrl, form.detectedFaces.length)
} catch (error) {
console.error('Audio separation failed:', error)
showAlert(t('audioSeparationFailed') + ': ' + error.message, 'error')
}
}
}
} else {
// 切换到单角色模式
isMultiRoleMode.value = false
// 清空分离的音频(单角色模式不需要分离)
form.separatedAudios = []
// 如果有多于1个角色的情况下切回单模式,提示用户
if (form.detectedFaces && form.detectedFaces.length > 1) {
showAlert(t('singleRoleModeInfo'), 'info')
}
}
}
// Handle image removal and also reset multi-role mode
const handleRemoveImage = () => {
removeImage()
// After the image is removed, switch back to single-role mode automatically
if (selectedTaskId.value === 's2v' && isMultiRoleMode.value) {
isMultiRoleMode.value = false
const form = getCurrentForm()
if (form) {
form.separatedAudios = []
}
}
}
// State for the enlarged face bbox editing modal
const showFaceEditModal = ref(false)
const editingFaceIndex = ref(-1)
const editingFaceBbox = ref([0, 0, 0, 0]) // [x1, y1, x2, y2]
const originalImageUrl = ref('')
const imageContainerRef = ref(null)
const imageLoaded = ref(false) // whether the image has finished loading
const imageNaturalSize = ref({ width: 0, height: 0 }) // natural size of the image
const isDraggingBbox = ref(false)
const dragType = ref('move') // 'move', 'resize-n', 'resize-s', 'resize-w', 'resize-e', 'resize-nw', 'resize-ne', 'resize-sw', 'resize-se'
const dragStartPos = ref({ x: 0, y: 0 })
const dragStartBbox = ref([0, 0, 0, 0]) // bbox coordinates when the drag started
const bboxOffset = ref({ x: 0, y: 0 })
const isAddingNewFace = ref(false) // whether we are adding a new role
const faceSaving = ref(false) // whether a role is being saved (used to show a loading state)
const showRoleModeInfo = ref(false) // whether to show the role-mode explanation
// Open the face edit modal
const openFaceEditModal = async (faceIndex) => {
const form = getCurrentForm()
if (!form) return
originalImageUrl.value = getCurrentImagePreviewUrl()
imageLoaded.value = false // 重置图片加载状态
imageNaturalSize.value = { width: 0, height: 0 } // 重置图片尺寸
// 如果是新增模式(faceIndex 为 -1)
if (faceIndex === -1) {
isAddingNewFace.value = true
editingFaceIndex.value = -1
showFaceEditModal.value = true
// 等待DOM更新,确保图片元素已渲染
await nextTick()
await nextTick() // 多等待一次,确保图片元素完全渲染
// 等待图片加载完成
const img = imageContainerRef.value?.querySelector('img')
if (img) {
// 如果图片已经加载完成(从缓存),立即设置状态
if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
imageNaturalSize.value = { width: img.naturalWidth, height: img.naturalHeight }
imageLoaded.value = true
} else {
// 确保图片完全加载
await new Promise((resolve) => {
if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
imageNaturalSize.value = { width: img.naturalWidth, height: img.naturalHeight }
imageLoaded.value = true
resolve()
} else {
const onLoad = () => {
// 确保图片尺寸已正确设置
if (img.naturalWidth > 0 && img.naturalHeight > 0) {
imageNaturalSize.value = { width: img.naturalWidth, height: img.naturalHeight }
imageLoaded.value = true
img.removeEventListener('load', onLoad)
img.removeEventListener('error', onError)
resolve()
}
}
const onError = () => {
imageLoaded.value = true
img.removeEventListener('load', onLoad)
img.removeEventListener('error', onError)
resolve() // 即使加载失败也继续
}
img.addEventListener('load', onLoad)
img.addEventListener('error', onError)
}
})
}
// 再次等待,确保图片尺寸已正确设置
await nextTick()
// 计算图片的原始尺寸
const imgNaturalWidth = img.naturalWidth
const imgNaturalHeight = img.naturalHeight
if (imgNaturalWidth > 0 && imgNaturalHeight > 0) {
imageNaturalSize.value = { width: imgNaturalWidth, height: imgNaturalHeight }
// 默认居中,大小为图片的 30%
const bboxSize = Math.min(imgNaturalWidth, imgNaturalHeight) * 0.3
const centerX = imgNaturalWidth / 2
const centerY = imgNaturalHeight / 2
editingFaceBbox.value = [
centerX - bboxSize / 2,
centerY - bboxSize / 2,
centerX + bboxSize / 2,
centerY + bboxSize / 2
]
// 标记图片已加载
imageLoaded.value = true
// 再次等待DOM更新,确保边界框已渲染
await nextTick()
} else {
// 如果图片尺寸无效,使用默认值
editingFaceBbox.value = [0, 0, 100, 100]
imageLoaded.value = true
await nextTick()
}
} else {
// 如果图片还没加载,使用默认值
editingFaceBbox.value = [0, 0, 100, 100]
imageLoaded.value = true
await nextTick()
}
} else {
// 编辑现有角色
if (!form.detectedFaces || !form.detectedFaces[faceIndex]) return
isAddingNewFace.value = false
const face = form.detectedFaces[faceIndex]
editingFaceIndex.value = faceIndex
editingFaceBbox.value = [...(face.bbox || [0, 0, 0, 0])]
showFaceEditModal.value = true
// 等待DOM更新,确保图片元素已渲染
await nextTick()
await nextTick() // 多等待一次,确保图片元素完全渲染
// 等待图片加载完成,确保边界框能正确显示
const img = imageContainerRef.value?.querySelector('img')
if (img) {
// 如果图片已经加载完成(从缓存),立即设置状态
if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
imageNaturalSize.value = { width: img.naturalWidth, height: img.naturalHeight }
imageLoaded.value = true
} else {
// 确保图片完全加载
await new Promise((resolve) => {
if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
imageNaturalSize.value = { width: img.naturalWidth, height: img.naturalHeight }
imageLoaded.value = true
resolve()
} else {
const onLoad = () => {
// 确保图片尺寸已正确设置
if (img.naturalWidth > 0 && img.naturalHeight > 0) {
imageNaturalSize.value = { width: img.naturalWidth, height: img.naturalHeight }
imageLoaded.value = true
img.removeEventListener('load', onLoad)
img.removeEventListener('error', onError)
resolve()
}
}
const onError = () => {
imageLoaded.value = true
img.removeEventListener('load', onLoad)
img.removeEventListener('error', onError)
resolve() // 即使加载失败也继续
}
img.addEventListener('load', onLoad)
img.addEventListener('error', onError)
}
})
}
// 再次等待,确保图片尺寸已正确设置
await nextTick()
} else {
imageLoaded.value = true
}
}
}
// Handle image load in the face edit modal
const handleFaceEditImageLoad = () => {
const img = imageContainerRef.value?.querySelector('img')
if (img && img.naturalWidth > 0 && img.naturalHeight > 0) {
imageNaturalSize.value = { width: img.naturalWidth, height: img.naturalHeight }
imageLoaded.value = true
nextTick()
}
}
// Handle image load errors in the face edit modal
const handleFaceEditImageError = () => {
imageLoaded.value = true // show the editor even if loading failed, instead of spinning forever
}
// Close the face edit modal
const closeFaceEditModal = () => {
showFaceEditModal.value = false
editingFaceIndex.value = -1
isDraggingBbox.value = false
isAddingNewFace.value = false
imageLoaded.value = false
imageNaturalSize.value = { width: 0, height: 0 }
}
// Save the bbox changes
const saveFaceBbox = async () => {
const form = getCurrentForm()
if (!form) return
// 保存当前状态(在关闭模态框之前)
const wasAddingNewFace = isAddingNewFace.value
const currentEditingIndex = editingFaceIndex.value // 保存编辑索引
const currentBbox = [...editingFaceBbox.value] // 保存边界框
const currentImageUrl = originalImageUrl.value // 保存图片URL
// 立即关闭模态框
closeFaceEditModal()
// 如果是新增模式,显示加载状态
if (wasAddingNewFace) {
faceSaving.value = true
}
try {
// 如果是新增模式
if (wasAddingNewFace) {
if (!form.detectedFaces) {
form.detectedFaces = []
}
// 创建新角色
const newFaceIndex = form.detectedFaces.length
const newFace = {
bbox: [...currentBbox],
roleName: `角色${newFaceIndex + 1}`,
roleIndex: newFaceIndex,
isEditing: false,
face_image: null
}
// 根据新的 bbox 坐标,从原始图片裁剪出新的 face_image
try {
let imageUrl = currentImageUrl
if (imageUrl.startsWith('data:image')) {
// 保持 data URL 格式,可以直接使用
} else if (!imageUrl.startsWith('http') && !imageUrl.startsWith('/')) {
imageUrl = currentImageUrl
}
// 裁剪出新的 face_image
const croppedImage = await cropFaceImage(imageUrl, newFace.bbox)
// 移除 data URL 前缀,只保留 base64 部分(与后端返回的格式一致)
const base64Data = croppedImage.split(',')[1] || croppedImage
newFace.face_image = base64Data
} catch (error) {
console.error('Failed to crop face image:', error)
}
// 添加到角色列表
form.detectedFaces.push(newFace)
// 触发响应式更新
form.detectedFaces = [...form.detectedFaces]
// 如果是在 s2v 模式下且有上传的音频,自动重新分割音频
if (selectedTaskId.value === 's2v' && getCurrentAudioPreview()) {
try {
const audioDataUrl = getCurrentAudioPreview()
await separateAudioTracks(audioDataUrl, form.detectedFaces.length)
} catch (error) {
console.error('Failed to re-separate audio after adding face:', error)
}
}
} else {
// 编辑现有角色
if (!form.detectedFaces || currentEditingIndex < 0 || !form.detectedFaces[currentEditingIndex]) {
console.error('Invalid editing index or face not found:', currentEditingIndex)
return
}
const face = form.detectedFaces[currentEditingIndex]
// editingFaceBbox.value 存储的是原始图片坐标 [x1, y1, x2, y2]
face.bbox = [...currentBbox]
// 根据新的 bbox 坐标,从原始图片裁剪出新的 face_image
try {
let imageUrl = currentImageUrl
if (imageUrl.startsWith('data:image')) {
// 保持 data URL 格式,可以直接使用
} else if (!imageUrl.startsWith('http') && !imageUrl.startsWith('/')) {
imageUrl = currentImageUrl
}
// 裁剪出新的 face_image
const croppedImage = await cropFaceImage(imageUrl, face.bbox)
// 移除 data URL 前缀,只保留 base64 部分(与后端返回的格式一致)
const base64Data = croppedImage.split(',')[1] || croppedImage
face.face_image = base64Data
} catch (error) {
console.error('Failed to crop face image:', error)
}
// 触发响应式更新
form.detectedFaces = [...form.detectedFaces]
}
} finally {
// 隐藏加载状态
faceSaving.value = false
}
}
// Remove a role
const removeFace = async (faceIndex) => {
const form = getCurrentForm()
if (!form || !form.detectedFaces || faceIndex < 0 || faceIndex >= form.detectedFaces.length) return
// 从角色列表中删除
form.detectedFaces.splice(faceIndex, 1)
// 触发响应式更新
form.detectedFaces = [...form.detectedFaces]
// 如果是在 s2v 模式下且有上传的音频,自动重新分割音频
if (selectedTaskId.value === 's2v' && getCurrentAudioPreview() && form.detectedFaces.length > 0) {
try {
const audioDataUrl = getCurrentAudioPreview()
await separateAudioTracks(audioDataUrl, form.detectedFaces.length)
} catch (error) {
console.error('Failed to re-separate audio after removing face:', error)
}
} else if (selectedTaskId.value === 's2v') {
// 如果没有角色了,清空分离的音频
s2vForm.value.separatedAudios = []
}
}
// Get the image scale ratios
const getImageScale = () => {
const container = imageContainerRef.value
if (!container) return { scaleX: 1, scaleY: 1, imgWidth: 0, imgHeight: 0 }
const img = container.querySelector('img')
if (!img || !img.complete) return { scaleX: 1, scaleY: 1, imgWidth: 0, imgHeight: 0 }
const imgRect = img.getBoundingClientRect()
const scaleX = img.naturalWidth > 0 ? imgRect.width / img.naturalWidth : 1
const scaleY = img.naturalHeight > 0 ? imgRect.height / img.naturalHeight : 1
return { scaleX, scaleY, imgWidth: imgRect.width, imgHeight: imgRect.height }
}
// Get the image offset relative to the container
const getImageOffset = () => {
const container = imageContainerRef.value
if (!container) return { offsetX: 0, offsetY: 0 }
const img = container.querySelector('img')
if (!img) return { offsetX: 0, offsetY: 0 }
const containerRect = container.getBoundingClientRect()
const imgRect = img.getBoundingClientRect()
return {
offsetX: imgRect.left - containerRect.left,
offsetY: imgRect.top - containerRect.top
}
}
// Start dragging the bbox
const startDragBbox = (event, type = 'move') => {
event.preventDefault()
event.stopPropagation()
const container = imageContainerRef.value
if (!container) return
const img = container.querySelector('img')
if (!img) return
// 获取图片的实际显示尺寸和原始尺寸
const imgRect = img.getBoundingClientRect()
const containerRect = container.getBoundingClientRect()
const displayWidth = imgRect.width
const displayHeight = imgRect.height
const naturalWidth = img.naturalWidth
const naturalHeight = img.naturalHeight
// 检查图片是否已加载(通过尺寸判断,而不是 complete 属性)
// 因为 complete 可能在图片尺寸设置之前就为 true
if (naturalWidth === 0 || naturalHeight === 0 || displayWidth === 0 || displayHeight === 0) {
console.warn('Image not ready for dragging:', { naturalWidth, naturalHeight, displayWidth, displayHeight, complete: img.complete })
return
}
// 检查 #app 是否有 transform: scale
const appElement = document.getElementById('app')
let appScale = 1
if (appElement) {
const appStyle = window.getComputedStyle(appElement)
const transform = appStyle.transform
if (transform && transform !== 'none') {
const matrix = transform.match(/matrix\(([^)]+)\)/)
if (matrix) {
const values = matrix[1].split(',').map(v => parseFloat(v.trim()))
appScale = values[0] || 1
} else {
const scaleMatch = transform.match(/scale\(([^)]+)\)/)
if (scaleMatch) {
appScale = parseFloat(scaleMatch[1])
}
}
}
}
// 计算缩放比例(补偿 #app 的缩放)
const scaleX = displayWidth / (naturalWidth * appScale)
const scaleY = displayHeight / (naturalHeight * appScale)
// 图片在容器中的偏移
const offsetX = imgRect.left - containerRect.left
const offsetY = imgRect.top - containerRect.top
// 边界框坐标
const [x1, y1, x2, y2] = editingFaceBbox.value
// 计算边界框在容器中的显示位置
const bboxRect = {
left: offsetX + x1 * scaleX,
top: offsetY + y1 * scaleY,
right: offsetX + x2 * scaleX,
bottom: offsetY + y2 * scaleY
}
// 点击位置相对于容器
const clickX = event.clientX - containerRect.left
const clickY = event.clientY - containerRect.top
// 如果是拖拽手柄(type 不是 'move'),直接开始拖拽
if (type !== 'move') {
isDraggingBbox.value = true
dragType.value = type
dragStartPos.value = { x: clickX, y: clickY }
dragStartBbox.value = [...editingFaceBbox.value]
return
}
// 检查点击是否在边界框内(移动模式)
if (clickX < bboxRect.left || clickX > bboxRect.right ||
clickY < bboxRect.top || clickY > bboxRect.bottom) {
return
}
isDraggingBbox.value = true
dragType.value = 'move'
dragStartPos.value = { x: clickX, y: clickY }
dragStartBbox.value = [...editingFaceBbox.value]
}
// Drag the bbox
const dragBbox = (event) => {
if (!isDraggingBbox.value) return
const container = imageContainerRef.value
if (!container) return
const img = container.querySelector('img')
if (!img || !img.complete) return
// 获取图片的实际显示尺寸和原始尺寸
const imgRect = img.getBoundingClientRect()
const containerRect = container.getBoundingClientRect()
const displayWidth = imgRect.width
const displayHeight = imgRect.height
const naturalWidth = img.naturalWidth
const naturalHeight = img.naturalHeight
if (naturalWidth === 0 || naturalHeight === 0) return
// 检查 #app 是否有 transform: scale
const appElement = document.getElementById('app')
let appScale = 1
if (appElement) {
const appStyle = window.getComputedStyle(appElement)
const transform = appStyle.transform
if (transform && transform !== 'none') {
const matrix = transform.match(/matrix\(([^)]+)\)/)
if (matrix) {
const values = matrix[1].split(',').map(v => parseFloat(v.trim()))
appScale = values[0] || 1
} else {
const scaleMatch = transform.match(/scale\(([^)]+)\)/)
if (scaleMatch) {
appScale = parseFloat(scaleMatch[1])
}
}
}
}
// 计算缩放比例(补偿 #app 的缩放)
// displayWidth 已经是经过 appScale 缩放后的尺寸,所以需要除以 appScale 来得到相对于原始图片的缩放比例
const scaleX = displayWidth / (naturalWidth * appScale)
const scaleY = displayHeight / (naturalHeight * appScale)
// 图片在容器中的偏移
const offsetX = imgRect.left - containerRect.left
const offsetY = imgRect.top - containerRect.top
// 鼠标当前位置相对于容器
const containerRect2 = container.getBoundingClientRect()
const currentX = event.clientX - containerRect2.left
const currentY = event.clientY - containerRect2.top
// 将坐标转换为相对于图片的位置(考虑图片在容器中的偏移)
const imgCurrentX = currentX - offsetX
const imgCurrentY = currentY - offsetY
const imgStartX = dragStartPos.value.x - offsetX
const imgStartY = dragStartPos.value.y - offsetY
const deltaX = (imgCurrentX - imgStartX) / scaleX
const deltaY = (imgCurrentY - imgStartY) / scaleY
// 获取拖拽开始时的bbox
const [startX1, startY1, startX2, startY2] = dragStartBbox.value
const startWidth = startX2 - startX1
const startHeight = startY2 - startY1
let newX1 = startX1
let newY1 = startY1
let newX2 = startX2
let newY2 = startY2
// 根据拖拽类型调整bbox
const type = dragType.value
if (type === 'move') {
// 移动模式:整体移动
newX1 = startX1 + deltaX
newY1 = startY1 + deltaY
newX2 = startX2 + deltaX
newY2 = startY2 + deltaY
} else if (type === 'resize-n') {
// 调整顶部
newY1 = Math.min(startY1 + deltaY, startY2 - 10) // 最小高度10px
newX1 = startX1
newX2 = startX2
newY2 = startY2
} else if (type === 'resize-s') {
// 调整底部
newY2 = Math.max(startY2 + deltaY, startY1 + 10) // 最小高度10px
newX1 = startX1
newY1 = startY1
newX2 = startX2
} else if (type === 'resize-w') {
// 调整左侧
newX1 = Math.min(startX1 + deltaX, startX2 - 10) // 最小宽度10px
newY1 = startY1
newX2 = startX2
newY2 = startY2
} else if (type === 'resize-e') {
// 调整右侧
newX2 = Math.max(startX2 + deltaX, startX1 + 10) // 最小宽度10px
newX1 = startX1
newY1 = startY1
newY2 = startY2
} else if (type === 'resize-nw') {
// 调整左上角
newX1 = Math.min(startX1 + deltaX, startX2 - 10)
newY1 = Math.min(startY1 + deltaY, startY2 - 10)
newX2 = startX2
newY2 = startY2
} else if (type === 'resize-ne') {
// 调整右上角
newX2 = Math.max(startX2 + deltaX, startX1 + 10)
newY1 = Math.min(startY1 + deltaY, startY2 - 10)
newX1 = startX1
newY2 = startY2
} else if (type === 'resize-sw') {
// 调整左下角
newX1 = Math.min(startX1 + deltaX, startX2 - 10)
newY2 = Math.max(startY2 + deltaY, startY1 + 10)
newX2 = startX2
newY1 = startY1
} else if (type === 'resize-se') {
// 调整右下角
newX2 = Math.max(startX2 + deltaX, startX1 + 10)
newY2 = Math.max(startY2 + deltaY, startY1 + 10)
newX1 = startX1
newY1 = startY1
}
// 边界限制:确保bbox在图片范围内
const minSize = 10 // 最小尺寸
// X方向边界限制
if (newX1 < 0) {
newX1 = 0
if (type.includes('w') || type === 'resize-nw' || type === 'resize-sw') {
// 如果是调整左边或左角,需要保持宽度
newX2 = Math.max(newX2, minSize)
}
}
if (newX2 > naturalWidth) {
newX2 = naturalWidth
if (type.includes('e') || type === 'resize-ne' || type === 'resize-se') {
// 如果是调整右边或右角,需要保持宽度
newX1 = Math.min(newX1, naturalWidth - minSize)
}
}
// Y方向边界限制
if (newY1 < 0) {
newY1 = 0
if (type.includes('n') || type === 'resize-nw' || type === 'resize-ne') {
// 如果是调整上边或上角,需要保持高度
newY2 = Math.max(newY2, minSize)
}
}
if (newY2 > naturalHeight) {
newY2 = naturalHeight
if (type.includes('s') || type === 'resize-sw' || type === 'resize-se') {
// 如果是调整下边或下角,需要保持高度
newY1 = Math.min(newY1, naturalHeight - minSize)
}
}
// 确保最小尺寸
if (newX2 - newX1 < minSize) {
if (type.includes('w') || type === 'resize-nw' || type === 'resize-sw') {
newX1 = newX2 - minSize
} else {
newX2 = newX1 + minSize
}
}
if (newY2 - newY1 < minSize) {
if (type.includes('n') || type === 'resize-nw' || type === 'resize-ne') {
newY1 = newY2 - minSize
} else {
newY2 = newY1 + minSize
}
}
// 更新边界框坐标
editingFaceBbox.value = [newX1, newY1, newX2, newY2]
}
// End the drag
const endDragBbox = () => {
isDraggingBbox.value = false
dragType.value = 'move'
}
// Compute the bbox style (shown over the enlarged image)
const getBboxStyle = computed(() => {
if (!imageContainerRef.value || editingFaceBbox.value.length !== 4 || !imageLoaded.value) {
return {}
}
const container = imageContainerRef.value
const img = container.querySelector('img')
if (!img || !img.complete || img.naturalWidth === 0 || img.naturalHeight === 0) {
return {}
}
// 获取图片的实际显示尺寸
// getBoundingClientRect() 返回的是相对于 viewport 的坐标
// 如果 #app 有 transform: scale(0.8),那么所有元素都会被缩放 0.8
const imgRect = img.getBoundingClientRect()
const containerRect = container.getBoundingClientRect()
// 图片的实际显示尺寸(已考虑所有CSS样式和可能的缩放,包括 #app 的 0.8 缩放)
const displayWidth = imgRect.width
const displayHeight = imgRect.height
// 图片的原始尺寸(naturalWidth/naturalHeight 是图片文件的真实尺寸)
const naturalWidth = img.naturalWidth
const naturalHeight = img.naturalHeight
if (naturalWidth === 0 || naturalHeight === 0) {
return {}
}
// 检查 #app 是否有 transform: scale
// 如果模态框在 #app 内部,会受到 #app 的 transform 影响
const appElement = document.getElementById('app')
let appScale = 1
if (appElement) {
const appStyle = window.getComputedStyle(appElement)
const transform = appStyle.transform
if (transform && transform !== 'none') {
// 解析 transform matrix 或 scale
const matrix = transform.match(/matrix\(([^)]+)\)/)
if (matrix) {
const values = matrix[1].split(',').map(v => parseFloat(v.trim()))
// matrix(a, b, c, d, tx, ty) 中,a 和 d 是缩放值
appScale = values[0] || 1
} else {
const scaleMatch = transform.match(/scale\(([^)]+)\)/)
if (scaleMatch) {
appScale = parseFloat(scaleMatch[1])
}
}
}
}
// 计算缩放比例
// displayWidth 已经是经过 appScale 缩放后的尺寸
// 所以相对于原始图片的实际缩放比例是 displayWidth / naturalWidth
// 但由于 #app 的缩放,边界框在模态框中的尺寸需要补偿这个缩放
// 如果 appScale = 0.8,那么边界框的尺寸应该是 displayWidth / appScale / naturalWidth = displayWidth / (naturalWidth * appScale)
const scaleX = displayWidth / (naturalWidth * appScale)
const scaleY = displayHeight / (naturalHeight * appScale)
// 图片在容器中的偏移(相对于容器)
// 如果容器是 inline-block,图片和容器可能在同一位置
// 我们需要检查容器是否包裹了图片,或者图片就是容器的唯一内容
let offsetX = imgRect.left - containerRect.left
let offsetY = imgRect.top - containerRect.top
// 如果计算出的偏移很小(可能是浮点数误差),或者容器和图片尺寸相同,说明图片填充了整个容器
// 在这种情况下,offset 应该为 0
if (Math.abs(offsetX) < 1 && Math.abs(offsetY) < 1) {
offsetX = 0
offsetY = 0
}
// 边界框坐标(原始图片坐标 [x1, y1, x2, y2])
// 这些坐标是基于原始图片尺寸的绝对像素坐标
const [x1, y1, x2, y2] = editingFaceBbox.value
// 转换为显示坐标
// bbox坐标是基于原始图片尺寸的,需要乘以缩放比例得到显示尺寸
// 注意:这里计算的是边界框在容器中的位置和尺寸
const left = offsetX + x1 * scaleX
const top = offsetY + y1 * scaleY
const width = (x2 - x1) * scaleX
const height = (y2 - y1) * scaleY
const indicatorSize = 12
// 确保边界框的尺寸计算正确(考虑 border 的影响)
// border-2 = 2px,左右各2px,所以总宽度需要包含 border
// 但由于使用了 box-sizing: border-box,所以不需要额外调整
return {
left: `${left}px`,
top: `${top}px`,
width: `${width}px`,
height: `${height}px`,
indicatorSize: indicatorSize,
boxSizing: 'border-box'
}
})
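A quick worked example of the scale compensation above (numbers are illustrative only):
// Original image 1000px wide, rendered at 400px inside an #app scaled by 0.8:
const naturalWidth = 1000, displayWidth = 400, appScale = 0.8
const scaleX = displayWidth / (naturalWidth * appScale)  // 400 / 800 = 0.5
// A bbox x1 of 500 original pixels lands at 500 * 0.5 = 250px in the modal overlay.
console.log(500 * scaleX)  // 250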
// Compute the role-name label style (shown above the bbox)
const getRoleNameLabelStyle = computed(() => {
const bboxStyle = getBboxStyle.value
if (!bboxStyle.left || !bboxStyle.top) {
return {}
}
// 获取当前编辑的人脸信息
const form = getCurrentForm()
let roleName
// 如果是新增模式
if (isAddingNewFace.value) {
// 计算新角色的序号:当前角色数量 + 1
const newRoleIndex = (form?.detectedFaces?.length || 0) + 1
roleName = `角色${newRoleIndex}`
} else {
// 编辑现有角色
const face = form?.detectedFaces?.[editingFaceIndex.value]
roleName = face?.roleName || `角色${editingFaceIndex.value + 1}`
}
// 计算标签位置:在边界框上方居中
const left = parseFloat(bboxStyle.left) || 0
const top = parseFloat(bboxStyle.top) || 0
const width = parseFloat(bboxStyle.width) || 0
// 标签在边界框上方,水平居中
const labelLeft = left + width / 2
return {
left: `${labelLeft}px`,
top: `${top - 28}px`, // 在边界框上方 28px
transform: 'translateX(-50%)', // 水平居中
roleName: roleName
}
})
// Handle task submission and scroll to the task area
const handleSubmitTask = async () => {
......@@ -463,7 +1465,7 @@ const handleTTSComplete = (audioBlob) => {
showVoiceTTSModal.value = false
// 显示成功提示
showAlert('语音合成完成,已自动添加到音频素材', 'success')
showAlert(t('ttsCompleted'), 'success')
}
// 跳转到项目页面
......@@ -608,62 +1610,204 @@ const updateScreenSize = () => {
let resizeHandler = null
// Route watching and URL synchronization
watch(() => route.query, (newQuery) => {
// Flag: currently updating the URL, to avoid update loops
let isUpdatingUrl = false
// Flag: currently restoring state from the route
let isRestoringFromRoute = false
// Pending route params (when availableTaskTypes has not finished loading yet)
let pendingRouteRestore = null
// Restore component state from route query params
const restoreFromRoute = (newQuery, oldQuery) => {
// 如果正在更新 URL,跳过处理,避免循环更新
if (isUpdatingUrl) {
return
}
// 如果 availableTaskTypes 还没有加载完成,保存参数等待处理
if (availableTaskTypes.value.length === 0) {
pendingRouteRestore = { newQuery, oldQuery }
return
}
// 标记正在从路由恢复状态
isRestoringFromRoute = true
// 同步URL参数到组件状态
// 首次加载时(oldQuery 为 undefined),或者参数真正变化时才更新
const isInitialLoad = !oldQuery
// 处理任务类型
if (newQuery.taskType) {
// 根据URL参数设置任务类型
const taskType = newQuery.taskType
if (availableTaskTypes.value.some(type => type.value === taskType)) {
selectTask(taskType)
const shouldUpdate = isInitialLoad || (newQuery.taskType !== oldQuery?.taskType)
// availableTaskTypes 是字符串数组,不是对象数组
if (shouldUpdate && availableTaskTypes.value.includes(taskType)) {
// 如果当前任务类型不匹配,执行 selectTask
// 在首次加载时,即使值已经匹配,也执行 selectTask 以确保所有相关状态正确设置
if (selectedTaskId.value !== taskType || isInitialLoad) {
selectTask(taskType)
// 等待 selectTask 完成后再处理 model(因为 selectTask 可能会改变 availableModelClasses)
// 使用 nextTick 确保 selectTask 的副作用已完成
nextTick(() => {
// 再次等待,确保 availableModelClasses 已更新
setTimeout(() => {
// 处理模型(在任务类型设置后)
if (newQuery.model) {
const model = newQuery.model
const shouldUpdateModel = isInitialLoad || (newQuery.model !== oldQuery?.model)
// availableModelClasses 是字符串数组,不是对象数组
// 在首次加载时,即使值已经匹配,也执行 selectModel 以确保所有相关状态正确设置
if (shouldUpdateModel && availableModelClasses.value.includes(model) && (selectedModel.value !== model || isInitialLoad)) {
selectModel(model)
}
}
}, 100)
})
}
} else if (!shouldUpdate && newQuery.model) {
// 如果任务类型没有变化,但需要更新模型
const model = newQuery.model
const shouldUpdateModel = isInitialLoad || (newQuery.model !== oldQuery?.model)
// availableModelClasses 是字符串数组,不是对象数组
// 在首次加载时,即使值已经匹配,也执行 selectModel 以确保所有相关状态正确设置
if (shouldUpdateModel && availableModelClasses.value.includes(model) && (selectedModel.value !== model || isInitialLoad)) {
selectModel(model)
}
}
}
if (newQuery.model) {
// 根据URL参数设置模型
} else if (newQuery.model && selectedTaskId.value) {
// 如果没有任务类型参数,但任务类型已经设置,直接处理模型
const model = newQuery.model
if (availableModelClasses.value.some(m => m.value === model)) {
const shouldUpdate = isInitialLoad || (newQuery.model !== oldQuery?.model)
// availableModelClasses 是字符串数组,不是对象数组
// 在首次加载时,即使值已经匹配,也执行 selectModel 以确保所有相关状态正确设置
if (shouldUpdate && availableModelClasses.value.includes(model) && (selectedModel.value !== model || isInitialLoad)) {
selectModel(model)
}
}
if (newQuery.expanded === 'true') {
// 处理 expanded 状态
const shouldBeExpanded = newQuery.expanded === 'true'
if (shouldBeExpanded && !isCreationAreaExpanded.value) {
// 展开创建区域
expandCreationArea()
} else if (!shouldBeExpanded && isCreationAreaExpanded.value && (isInitialLoad || oldQuery?.expanded === 'true')) {
// 如果 URL 中 expanded 从 'true' 变为其他值,收缩创建区域
contractCreationArea()
}
// 恢复状态完成,使用 setTimeout 确保所有状态更新完成后再重置标志
setTimeout(() => {
isRestoringFromRoute = false
}, 200)
}
// Watch availableTaskTypes; once it is loaded, process any pending route restore
watch(availableTaskTypes, (newVal) => {
if (newVal && newVal.length > 0 && pendingRouteRestore) {
// availableTaskTypes 加载完成,处理待处理的路由恢复
const { newQuery, oldQuery } = pendingRouteRestore
pendingRouteRestore = null
restoreFromRoute(newQuery, oldQuery)
}
}, { immediate: true })
watch(() => route.query, (newQuery, oldQuery) => {
restoreFromRoute(newQuery, oldQuery)
// Note: share-data import has moved to the createSimilar function in Share.vue
// It no longer needs to be handled here
}, { immediate: true })
// Watch component state changes and sync them to the URL
watch([selectedTaskId, isCreationAreaExpanded, selectedModel], () => {
watch([selectedTaskId, isCreationAreaExpanded, selectedModel], (newVals, oldVals) => {
// 如果正在更新 URL 或正在从路由恢复状态,跳过处理,避免循环更新
if (isUpdatingUrl || isRestoringFromRoute) {
return
}
// 检查任务类型是否变化
const taskTypeChanged = oldVals && oldVals[0] !== newVals[0]
// 如果任务类型变化,检查当前模型是否属于新任务类型
if (taskTypeChanged && selectedTaskId.value && selectedModel.value) {
const isModelValid = availableModelClasses.value.includes(selectedModel.value)
// 如果模型不属于新任务类型,延迟更新路由,等待模型更新完成
if (!isModelValid) {
setTimeout(() => {
// 再次检查,确保模型已经更新
if (!isUpdatingUrl && !isRestoringFromRoute) {
updateRouteFromState()
}
}, 150)
return
}
}
updateRouteFromState()
}, { deep: true })
// Update the route from component state
const updateRouteFromState = () => {
// 如果正在更新 URL 或正在从路由恢复状态,跳过处理,避免循环更新
if (isUpdatingUrl || isRestoringFromRoute) {
return
}
// 获取当前查询参数,保留其他参数(如分享相关的参数)
const currentQuery = { ...route.query }
const query = {}
// 只更新我们关心的参数
if (selectedTaskId.value) {
query.taskType = selectedTaskId.value
} else {
// 如果任务类型被清除,也从 URL 中移除
delete currentQuery.taskType
}
if (isCreationAreaExpanded.value) {
query.expanded = 'true'
} else {
// 如果创作区域收缩,从 URL 中移除 expanded 参数
delete currentQuery.expanded
}
if (selectedModel.value) {
query.model = selectedModel.value
} else {
// 如果模型被清除,也从 URL 中移除
delete currentQuery.model
}
// 更新URL但不触发路由监听
router.replace({ query })
})
// 合并查询参数,保留其他参数
const finalQuery = { ...currentQuery, ...query }
// 检查是否需要更新 URL(避免不必要的更新)
const needsUpdate =
finalQuery.taskType !== route.query.taskType ||
finalQuery.expanded !== route.query.expanded ||
finalQuery.model !== route.query.model
if (needsUpdate) {
isUpdatingUrl = true
// 更新URL但不触发路由监听(使用 replace 而不是 push,避免历史记录堆积)
router.replace({ query: finalQuery }).finally(() => {
// 使用 nextTick 确保路由更新完成后再重置标志
nextTick(() => {
isUpdatingUrl = false
})
})
}
}
// Initialize on component mount
onMounted(async () => {
// 确保URL参数正确同步
const query = route.query
if (query.taskType) {
selectTask(query.taskType)
}
if (query.model) {
selectModel(query.model)
}
if (query.expanded === 'true') {
expandCreationArea()
}
// Note: the route.query watcher already restores URL params with immediate: true
// No need to handle it again here, to avoid running it twice
// Extra initialization logic can be added here if needed
// Initialize screen size
updateScreenSize()
......@@ -675,6 +1819,10 @@ onMounted(async () => {
}
window.addEventListener('resize', resizeHandler)
// Add global mouse listeners for bbox dragging
document.addEventListener('mousemove', dragBbox)
document.addEventListener('mouseup', endDragBbox)
// Load featured template data
await loadFeaturedTemplates(true)
// Pick random featured templates
......@@ -727,9 +1875,9 @@ const handleImageDrop = (e) => {
}
handleImageUpload(event)
showAlert('图片拖拽上传成功', 'success')
showAlert(t('imageDragSuccess'), 'success')
} else {
showAlert('请拖拽图片文件', 'warning')
showAlert(t('pleaseDragImage'), 'warning')
}
}
......@@ -755,9 +1903,51 @@ const handleAudioDrop = (e) => {
}
handleAudioUpload(event)
showAlert('音频/视频拖拽上传成功', 'success')
showAlert(t('audioDragSuccess'), 'success')
} else {
showAlert('请拖拽音频或视频文件', 'warning')
showAlert(t('pleaseDragAudio'), 'warning')
}
}
// Trigger video upload
const triggerVideoUpload = () => {
// Use nextTick to make sure the DOM has updated
nextTick(() => {
const videoInput = document.querySelector('input[type="file"][data-role="video-input"]')
if (videoInput) {
videoInput.click()
} else {
console.warn('视频输入框未找到,请确保已选择 animate 任务类型')
}
})
}
// Handle drag-and-drop video upload
const handleVideoDrop = (e) => {
e.preventDefault()
e.stopPropagation()
isDragOver.value = false
const files = Array.from(e.dataTransfer.files)
const videoFile = files.find(file => file.type.startsWith('video/'))
if (videoFile) {
// Create a FileList to simulate the change event of input[type="file"]
const dataTransfer = new DataTransfer()
dataTransfer.items.add(videoFile)
const fileList = dataTransfer.files
// Build a mock change event
const event = {
target: {
files: fileList
}
}
handleVideoUpload(event)
showAlert(t('videoDragSuccess'), 'success')
} else {
showAlert(t('pleaseDragVideo'), 'warning')
}
}
......@@ -842,6 +2032,317 @@ watch(() => getCurrentAudioPreview(), (newPreview) => {
}
})
// Watch changes to the separated audios and reset player state
watch(() => s2vForm.value.separatedAudios, (newAudios, oldAudios) => {
// 如果音频列表发生变化(重新分割),清理旧的音频元素和状态
if (newAudios && newAudios.length > 0 && oldAudios && oldAudios.length > 0) {
// 检查是否是重新分割(音频数量或内容发生变化)
const isReseparation = newAudios.length !== oldAudios.length ||
newAudios.some((audio, index) => {
const oldAudio = oldAudios[index]
return !oldAudio || audio.audioDataUrl !== oldAudio.audioDataUrl
})
if (isReseparation) {
// 停止所有正在播放的音频
separatedAudioElements.value.forEach((audioElement, index) => {
if (audioElement) {
audioElement.pause()
separatedAudioElements.value[index] = null
}
})
// 清理所有状态
separatedAudioElements.value = []
separatedAudioPlaying.value = {}
separatedAudioDuration.value = {}
separatedAudioCurrentTime.value = {}
separatedAudioIsDragging.value = {}
// 等待 DOM 更新后重新加载音频
nextTick(() => {
// 音频元素会在模板中自动重新创建和加载
})
}
} else if (!newAudios || newAudios.length === 0) {
// 如果音频列表被清空,清理所有状态
separatedAudioElements.value.forEach((audioElement, index) => {
if (audioElement) {
audioElement.pause()
separatedAudioElements.value[index] = null
}
})
separatedAudioElements.value = []
separatedAudioPlaying.value = {}
separatedAudioDuration.value = {}
separatedAudioCurrentTime.value = {}
separatedAudioIsDragging.value = {}
}
}, { deep: true })
// Player controls for the separated audios
const toggleSeparatedAudioPlayback = (index) => {
const audioElement = separatedAudioElements.value[index]
if (!audioElement) return
if (audioElement.paused) {
audioElement.play().catch(error => {
console.log('播放失败:', error)
})
} else {
audioElement.pause()
}
}
const getSeparatedAudioPlaying = (index) => {
return separatedAudioPlaying.value[index] || false
}
const getSeparatedAudioDuration = (index) => {
return separatedAudioDuration.value[index] || 0
}
const getSeparatedAudioCurrentTime = (index) => {
return separatedAudioCurrentTime.value[index] || 0
}
const onSeparatedAudioLoaded = (index) => {
const audioElement = separatedAudioElements.value[index]
if (audioElement) {
separatedAudioDuration.value[index] = audioElement.duration || 0
}
}
const onSeparatedAudioTimeUpdate = (index) => {
const audioElement = separatedAudioElements.value[index]
if (audioElement && !separatedAudioIsDragging.value[index]) {
separatedAudioCurrentTime.value[index] = audioElement.currentTime || 0
}
}
const onSeparatedAudioProgressChange = (index, event) => {
if (separatedAudioDuration.value[index] > 0 && separatedAudioElements.value[index] && event.target) {
const newTime = parseFloat(event.target.value)
separatedAudioCurrentTime.value[index] = newTime
separatedAudioElements.value[index].currentTime = newTime
}
}
const onSeparatedAudioProgressEnd = (index, event) => {
const audioElement = separatedAudioElements.value[index]
if (audioElement && separatedAudioDuration.value[index] > 0 && event.target) {
const newTime = parseFloat(event.target.value)
audioElement.currentTime = newTime
separatedAudioCurrentTime.value[index] = newTime
}
separatedAudioIsDragging.value[index] = false
}
const onSeparatedAudioEnded = (index) => {
separatedAudioPlaying.value[index] = false
separatedAudioCurrentTime.value[index] = 0
}
const handleSeparatedAudioError = (index) => {
console.error(`分离后的音频 ${index} 加载失败`)
separatedAudioPlaying.value[index] = false
}
// Drag-and-drop sorting - roles
const onRoleDragStart = (event, index) => {
draggedRoleIndex.value = index
event.dataTransfer.effectAllowed = 'move'
event.dataTransfer.setData('text/html', event.target.outerHTML)
// 创建拖拽预览
const target = event.currentTarget
const rect = target.getBoundingClientRect()
dragOffset.value = {
x: event.clientX - rect.left,
y: event.clientY - rect.top
}
// 创建拖拽预览图片
const dragImage = target.cloneNode(true)
// 设置固定尺寸,确保预览正确显示
dragImage.style.width = `${rect.width}px`
dragImage.style.height = `${rect.height}px`
dragImage.style.position = 'fixed'
dragImage.style.top = '-9999px'
dragImage.style.left = '-9999px'
dragImage.style.opacity = '0.9'
dragImage.style.transform = 'rotate(2deg)'
dragImage.style.pointerEvents = 'none'
dragImage.style.zIndex = '10000'
dragImage.style.boxShadow = '0 8px 24px rgba(0,0,0,0.3)'
dragImage.style.backgroundColor = 'transparent'
// 立即添加到 DOM
document.body.appendChild(dragImage)
// 强制重排,确保元素已渲染
void dragImage.offsetHeight
// 同步设置拖拽图片(必须在 dragstart 事件中同步调用)
try {
event.dataTransfer.setDragImage(dragImage, dragOffset.value.x, dragOffset.value.y)
} catch (e) {
console.warn('Failed to set drag image:', e)
}
// 延迟移除预览元素
setTimeout(() => {
if (dragImage.parentNode) {
dragImage.parentNode.removeChild(dragImage)
}
}, 0)
}
const onRoleDragOver = (event, index) => {
event.preventDefault()
event.dataTransfer.dropEffect = 'move'
if (draggedRoleIndex.value !== index) {
dragOverRoleIndex.value = index
}
}
const onRoleDragLeave = () => {
dragOverRoleIndex.value = -1
}
const onRoleDrop = (event, targetIndex) => {
event.preventDefault()
if (draggedRoleIndex.value === -1 || draggedRoleIndex.value === targetIndex) {
draggedRoleIndex.value = -1
dragOverRoleIndex.value = -1
return
}
const form = getCurrentForm()
if (!form || !form.detectedFaces) return
// 保存原始状态
const originalFaces = [...form.detectedFaces]
// 重新排序角色(只改变角色顺序,不影响音频顺序)
const faces = [...form.detectedFaces]
const draggedFace = faces[draggedRoleIndex.value]
faces.splice(draggedRoleIndex.value, 1)
faces.splice(targetIndex, 0, draggedFace)
form.detectedFaces = faces
// 更新音频的 roleIndex 和 roleName,以匹配新的角色位置
// 但不改变音频的显示顺序
if (s2vForm.value.separatedAudios && s2vForm.value.separatedAudios.length > 0) {
s2vForm.value.separatedAudios.forEach((audio) => {
// 找到这个音频原来对应的角色
const originalRoleIndex = audio.roleIndex !== undefined ? audio.roleIndex : -1
if (originalRoleIndex >= 0 && originalRoleIndex < originalFaces.length) {
const originalFace = originalFaces[originalRoleIndex]
// 找到这个角色在新列表中的位置
const newRoleIndex = faces.findIndex(f => f === originalFace)
if (newRoleIndex >= 0) {
audio.roleIndex = newRoleIndex
audio.roleName = faces[newRoleIndex].roleName || `角色${newRoleIndex + 1}`
}
}
})
// 触发响应式更新
s2vForm.value.separatedAudios = [...s2vForm.value.separatedAudios]
}
draggedRoleIndex.value = -1
dragOverRoleIndex.value = -1
}
// 拖拽排序函数 - 音频
const onAudioDragStart = (event, index) => {
draggedAudioIndex.value = index
event.dataTransfer.effectAllowed = 'move'
event.dataTransfer.setData('text/html', event.target.outerHTML)
// 创建拖拽预览
const target = event.currentTarget
const rect = target.getBoundingClientRect()
dragOffset.value = {
x: event.clientX - rect.left,
y: event.clientY - rect.top
}
// 创建拖拽预览图片
const dragImage = target.cloneNode(true)
// 设置固定尺寸,确保预览正确显示
dragImage.style.width = `${rect.width}px`
dragImage.style.height = `${rect.height}px`
dragImage.style.position = 'fixed'
dragImage.style.top = '-9999px'
dragImage.style.left = '-9999px'
dragImage.style.opacity = '0.9'
dragImage.style.transform = 'rotate(2deg)'
dragImage.style.pointerEvents = 'none'
dragImage.style.zIndex = '10000'
dragImage.style.boxShadow = '0 8px 24px rgba(0,0,0,0.3)'
dragImage.style.backgroundColor = 'transparent'
// 立即添加到 DOM
document.body.appendChild(dragImage)
// 强制重排,确保元素已渲染
void dragImage.offsetHeight
// 同步设置拖拽图片(必须在 dragstart 事件中同步调用)
try {
event.dataTransfer.setDragImage(dragImage, dragOffset.value.x, dragOffset.value.y)
} catch (e) {
console.warn('Failed to set drag image:', e)
}
// 延迟移除预览元素
setTimeout(() => {
if (dragImage.parentNode) {
dragImage.parentNode.removeChild(dragImage)
}
}, 0)
}
const onAudioDragOver = (event, index) => {
event.preventDefault()
event.dataTransfer.dropEffect = 'move'
if (draggedAudioIndex.value !== index) {
dragOverAudioIndex.value = index
}
}
const onAudioDragLeave = () => {
dragOverAudioIndex.value = -1
}
const onAudioDrop = (event, targetIndex) => {
event.preventDefault()
if (draggedAudioIndex.value === -1 || draggedAudioIndex.value === targetIndex) {
draggedAudioIndex.value = -1
dragOverAudioIndex.value = -1
return
}
if (!s2vForm.value.separatedAudios) return
// 重新排序音频(只改变音频顺序,不影响角色顺序)
const audios = [...s2vForm.value.separatedAudios]
const draggedAudio = audios[draggedAudioIndex.value]
audios.splice(draggedAudioIndex.value, 1)
audios.splice(targetIndex, 0, draggedAudio)
// 音频的 roleIndex 和 roleName 保持不变,因为它们仍然对应原来的角色
// 不需要更新 roleIndex,因为角色顺序没有改变
s2vForm.value.separatedAudios = audios
draggedAudioIndex.value = -1
dragOverAudioIndex.value = -1
}
// 组件卸载时清理
onUnmounted(() => {
if (resizeHandler) {
......@@ -852,6 +2353,18 @@ onUnmounted(() => {
audioPreviewElement.value.pause()
audioPreviewElement.value = null
}
// 停止并清理所有分离后的音频播放器
separatedAudioElements.value.forEach((audioElement, index) => {
if (audioElement) {
audioElement.pause()
separatedAudioElements.value[index] = null
}
})
separatedAudioElements.value = []
separatedAudioPlaying.value = {}
separatedAudioDuration.value = {}
separatedAudioCurrentTime.value = {}
separatedAudioIsDragging.value = {}
})
</script>
......@@ -984,9 +2497,9 @@ onUnmounted(() => {
</button>
</div>
<div v-if="selectedTaskId === 'i2v' || selectedTaskId === 's2v'" class="upload-section">
<div v-if="selectedTaskId === 'i2v' || selectedTaskId === 's2v' || selectedTaskId === 'animate'" class="upload-section">
<!-- 上传图片 - Apple 风格 -->
<div v-if="selectedTaskId === 'i2v' || selectedTaskId === 's2v'">
<div v-if="selectedTaskId === 'i2v' || selectedTaskId === 's2v' || selectedTaskId === 'animate'">
<!-- 图片标签 -->
<div class="flex justify-between items-center mb-3">
<label class="text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight">
......@@ -1030,26 +2543,34 @@ onUnmounted(() => {
</div>
</div>
<!-- 图片预览 - Apple 风格 -->
<div v-if="getCurrentImagePreview()" class="relative w-full min-h-[220px] flex items-center justify-center group">
<!-- 图片预览区域 - 只显示主图 -->
<div v-if="getCurrentImagePreview()" class="flex items-center justify-center w-full min-h-[220px]">
<!-- 主图预览 - Apple 风格 -->
<div class="relative w-auto max-w-full min-h-[220px] flex items-center justify-center group">
<img :src="getCurrentImagePreviewUrl()" alt="t('previewImage')"
class="max-w-full max-h-[220px] w-auto h-auto object-contain rounded-xl transition-all duration-200">
<!-- 删除按钮 - Apple 风格 -->
<!-- 删除按钮 - Apple 风格 -->
<div
class="absolute inset-x-0 bottom-4 flex items-center justify-center opacity-100 md:opacity-0 md:group-hover:opacity-100 transition-opacity duration-200">
<div class="flex gap-3">
<button @click.stop="removeImage"
<button @click.stop="handleRemoveImage"
class="w-11 h-11 flex items-center justify-center bg-white/95 dark:bg-[#2c2c2e]/95 backdrop-blur-[20px] border border-black/8 dark:border-white/8 text-red-500 dark:text-red-400 rounded-full transition-all duration-200 hover:scale-110 hover:shadow-[0_4px_12px_rgba(239,68,68,0.2)] dark:hover:shadow-[0_4px_12px_rgba(248,113,113,0.3)] active:scale-100"
:title="t('deleteImage')">
:title="t('deleteImage')">
<i class="fas fa-trash text-base"></i>
</button>
</div>
</div>
</div>
<input type="file" ref="imageInput" @change="handleImageUpload" accept="image/*"
style="display: none;">
</div>
<!-- 角色检测加载提示 -->
<div v-if="faceDetecting" class="mt-3 flex items-center justify-center gap-2 text-sm text-[#86868b] dark:text-[#98989d] tracking-tight">
<i class="fas fa-spinner fa-spin text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]"></i>
<span>{{ t('detectingCharacters') }}</span>
</div>
</div>
<!-- 上传音频 - Apple 风格 -->
......@@ -1082,10 +2603,19 @@ onUnmounted(() => {
<button @click.stop="showVoiceTTSModal = true"
class="w-12 h-12 flex items-center justify-center bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] border border-black/8 dark:border-white/8 text-white rounded-full transition-all duration-200 hover:scale-110 hover:shadow-[0_4px_12px_rgba(0,0,0,0.1)] dark:hover:shadow-[0_4px_12px_rgba(0,0,0,0.3)] active:scale-100"
:title="t('textToSpeech')">
<i class="fas fa-volume-up text-base"></i>
<i class="fi fi-bs-text text-lg"></i>
</button>
<span class="text-xs text-[#86868b] dark:text-[#98989d] tracking-tight">{{ t('textToSpeech') }}</span>
</div>
<div class="flex flex-col items-center gap-2">
<button @click.stop="router.push('/podcast_generate')"
class="w-12 h-12 flex items-center justify-center bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] border border-black/8 dark:border-white/8 text-white rounded-full transition-all duration-200 hover:scale-110 hover:shadow-[0_4px_12px_rgba(0,0,0,0.1)] dark:hover:shadow-[0_4px_12px_rgba(0,0,0,0.3)] active:scale-100"
:title="t('podcast.dualPersonPodcast')">
<!-- Podcast entry icon -->
<i class="fi fi-bs-signal-stream text-xl"></i>
</button>
<span class="text-xs text-[#86868b] dark:text-[#98989d] tracking-tight">{{ t('podcast.dualPersonPodcast') }}</span>
</div>
<div class="flex flex-col items-center gap-2">
<button
class="w-12 h-12 flex items-center justify-center bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] text-white rounded-full transition-all duration-200 hover:scale-110 hover:shadow-[0_4px_12px_rgba(var(--brand-primary-rgb),0.3)] dark:hover:shadow-[0_4px_12px_rgba(var(--brand-primary-light-rgb),0.4)] active:scale-100"
......@@ -1109,7 +2639,7 @@ onUnmounted(() => {
class="w-12 h-12 flex items-center justify-center rounded-full transition-all duration-200 hover:scale-110 active:scale-100"
:class="isRecording ? 'bg-red-500 dark:bg-red-400 text-white shadow-[0_4px_12px_rgba(239,68,68,0.3)] dark:shadow-[0_4px_12px_rgba(248,113,113,0.4)]' : 'bg-white dark:bg-[#3a3a3c] border border-black/8 dark:border-white/8 text-[#1d1d1f] dark:text-[#f5f5f7] hover:shadow-[0_4px_12px_rgba(0,0,0,0.1)] dark:hover:shadow-[0_4px_12px_rgba(0,0,0,0.3)]'"
:title="isRecording ? t('stopRecording') : t('recordAudio')">
<i class="fas fa-microphone text-base" :class="{ 'animate-pulse': isRecording }"></i>
<i class="fas fa-microphone-alt text-base" :class="{ 'animate-pulse': isRecording }"></i>
</button>
<span class="text-xs text-[#86868b] dark:text-[#98989d] tracking-tight">{{ isRecording ? formatRecordingDuration(recordingDuration) : t('recordAudio') }}</span>
</div>
......@@ -1117,7 +2647,7 @@ onUnmounted(() => {
</div>
</div>
<!-- 音频预览 - Apple 风格(播放器卡片样式) -->
<!-- 音频预览 - 原始音频播放器 -->
<div v-if="getCurrentAudioPreview()" class="relative w-full min-h-[220px] flex items-center justify-center">
<div class="bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-xl transition-all duration-200 hover:bg-white dark:hover:bg-[#3a3a3c] hover:border-black/12 dark:hover:border-white/12 hover:shadow-[0_4px_12px_rgba(0,0,0,0.08)] dark:hover:shadow-[0_4px_12px_rgba(0,0,0,0.2)] w-full p-4">
<div class="relative flex items-center mb-3">
......@@ -1186,42 +2716,460 @@ onUnmounted(() => {
></audio>
</div>
<input type="file" ref="audioInput" @change="handleAudioUpload" accept="audio/*,video/*" data-role="audio-input"
<input type="file" ref="audioInput" @change="handleAudioUpload" accept="audio/*,audio/mp4,audio/x-m4a,video/*" data-role="audio-input"
style="display: none;">
</div>
<!-- 音频分割加载提示 -->
<div v-if="audioSeparating" class="mt-3 flex items-center justify-center gap-2 text-sm text-[#86868b] dark:text-[#98989d] tracking-tight">
<i class="fas fa-spinner fa-spin text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]"></i>
<span>{{t('splitingAudio')}}</span>
</div>
</div>
</div>
<!-- 提示词输入区域 - Apple 风格 -->
<!-- 上传视频 - Apple 风格(用于 animate 任务类型) -->
<div v-if="selectedTaskId === 'animate'">
<!-- 视频标签 -->
<div class="flex justify-between items-center mb-3">
<label class="text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] flex items-center tracking-tight">
{{ t('prompt') }}
<button @click="showPromptModal = true; promptModalTab = 'templates'"
class="ml-2 text-xs text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] transition-colors"
:title="t('promptTemplates')">
<i class="fas fa-lightbulb"></i>
</button>
<label class="text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight">
{{ t('video') || '视频' }}
</label>
<div class="text-xs text-[#86868b] dark:text-[#98989d] tracking-tight">
{{ getCurrentForm().prompt?.length || 0 }} / 1000
</div>
</div>
<div class="relative">
<textarea v-model="getCurrentForm().prompt"
class="relative w-full bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-2xl px-5 py-4 text-[15px] text-[#1d1d1f] dark:text-[#f5f5f7] transition-all duration-200 resize-none main-scrollbar placeholder-[#86868b] dark:placeholder-[#98989d] tracking-tight hover:bg-white dark:hover:bg-[#3a3a3c] hover:border-black/12 dark:hover:border-white/12 focus:outline-none focus:border-[color:var(--brand-primary)]/50 dark:focus:border-[color:var(--brand-primary-light)]/60 focus:shadow-[0_4px_16px_rgba(var(--brand-primary-rgb),0.12)] dark:focus:shadow-[0_4px_16px_rgba(var(--brand-primary-light-rgb),0.2)]"
:placeholder="getPromptPlaceholder()"
rows="2"
maxlength="1000"
required></textarea>
<!-- 上传视频区域 - Apple 风格 -->
<div class="relative bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-2xl p-2 min-h-[220px] transition-all duration-200 hover:bg-white dark:hover:bg-[#3a3a3c] hover:border-black/12 dark:hover:border-white/12 hover:shadow-[0_4px_16px_rgba(0,0,0,0.1)] dark:hover:shadow-[0_4px_16px_rgba(0,0,0,0.3)]"
@drop="handleVideoDrop"
@dragover="handleDragOver"
@dragenter="handleDragEnter"
@dragleave="handleDragLeave"
:class="{
'border-[color:var(--brand-primary)] dark:border-[color:var(--brand-primary-light)] bg-[color:var(--brand-primary)]/5 dark:bg-[color:var(--brand-primary-light)]/10': isDragOver,
'p-8': !getCurrentVideoPreview()
}"
>
<!-- 默认上传界面 - Apple 风格 -->
<div v-if="!getCurrentVideoPreview()" class="flex flex-col items-center justify-center h-full">
<p class="text-base font-semibold text-[#1d1d1f] dark:text-[#f5f5f7] mb-2 tracking-tight">{{ t('uploadVideo')}}</p>
<p class="text-xs text-[#86868b] dark:text-[#98989d] mb-6 tracking-tight">{{ t('supportedVideoFormats') }}</p>
<div class="flex items-center justify-center gap-4">
<div class="flex flex-col items-center gap-2">
<button
class="w-12 h-12 flex items-center justify-center bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] text-white rounded-full transition-all duration-200 hover:scale-110 hover:shadow-[0_4px_12px_rgba(var(--brand-primary-rgb),0.3)] dark:hover:shadow-[0_4px_12px_rgba(var(--brand-primary-light-rgb),0.4)] active:scale-100"
@click="triggerVideoUpload"
:title="t('uploadVideo') || '上传视频'">
<i class="fas fa-upload text-base"></i>
</button>
<span class="text-xs text-[#86868b] dark:text-[#98989d] tracking-tight">{{ t('upload') }}</span>
</div>
</div>
</div>
<!-- 视频预览区域 -->
<div v-if="getCurrentVideoPreview()" class="relative w-full min-h-[220px] flex items-center justify-center">
<div class="bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-xl transition-all duration-200 hover:bg-white dark:hover:bg-[#3a3a3c] hover:border-black/12 dark:hover:border-white/12 hover:shadow-[0_4px_12px_rgba(0,0,0,0.08)] dark:hover:shadow-[0_4px_12px_rgba(0,0,0,0.2)] w-full p-4">
<div class="relative flex items-center mb-3">
<!-- 视频预览 -->
<div class="flex-1 min-w-0">
<video
:src="getCurrentVideoPreviewUrl()"
class="w-full max-h-[180px] rounded-lg object-contain"
controls
preload="metadata"
></video>
</div>
<!-- 删除按钮 -->
<button @click.stop="removeVideo"
class="ml-3 w-9 h-9 flex items-center justify-center bg-white/80 dark:bg-[#2c2c2e]/80 border border-black/8 dark:border-white/8 text-red-500 dark:text-red-400 rounded-full transition-all duration-200 hover:scale-110 hover:shadow-[0_4px_12px_rgba(239,68,68,0.2)] dark:hover:shadow-[0_4px_12px_rgba(248,113,113,0.3)] active:scale-100 flex-shrink-0"
:title="t('deleteVideo') || '删除视频'">
<i class="fas fa-trash text-sm"></i>
</button>
</div>
</div>
</div>
<input type="file" ref="videoInput" @change="handleVideoUpload" accept="video/*" data-role="video-input"
style="display: none;">
</div>
</div>
</div>
<div class="flex justify-between items-center mt-3">
<button @click="clearPrompt"
class="flex items-center text-sm rounded-lg px-3 py-1.5 transition-all duration-200 text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] hover:bg-black/4 dark:hover:bg-white/6 group tracking-tight">
<i class="fas fa-sync-alt text-sm mr-2 group-hover:rotate-180 transition-transform duration-300"></i>
{{ t('clear') }}
<!-- 角色和音频配对区域 -->
<div v-if="selectedTaskId === 's2v'" class="mt-8">
<!-- 模式切换开关 - 始终显示 -->
<div class="flex justify-center items-center mb-4">
<div class="flex items-center gap-3">
<!-- 开关按钮 -->
<button
@click="toggleRoleMode"
class="relative w-14 h-7 rounded-full transition-all duration-300 focus:outline-none focus:ring-2 focus:ring-[color:var(--brand-primary)]/20 dark:focus:ring-[color:var(--brand-primary-light)]/20"
:class="isMultiRoleMode ? 'bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)]' : 'bg-[#86868b]/30 dark:bg-[#98989d]/30'"
:title="isMultiRoleMode ? '切换到单角色模式' : '切换到多角色模式'"
>
<!-- 滑动圆点 -->
<span
class="absolute top-0.5 left-0.5 w-6 h-6 bg-white rounded-full shadow-md transition-transform duration-300 flex items-center justify-center"
:class="{ 'translate-x-7': isMultiRoleMode, 'translate-x-0': !isMultiRoleMode }"
>
<i :class="isMultiRoleMode ? 'fas fa-users text-[8px] text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]' : 'fas fa-user text-[8px] text-[#86868b] dark:text-[#98989d]'"></i>
</span>
</button>
<span class="text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight" :class="{ 'text-[#86868b] dark:text-[#98989d]': isMultiRoleMode }">{{ isMultiRoleMode ? '多角色模式' : '单角色模式' }}</span>
<!-- Info 图标按钮 -->
<button
@click="showRoleModeInfo = true"
class="w-5 h-5 flex items-center justify-center text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] transition-colors duration-200 rounded-full hover:bg-[#86868b]/10 dark:hover:bg-[#98989d]/10"
:title="t('roleModeInfo.title')"
>
<i class="fas fa-info-circle text-xs"></i>
</button>
</div>
</div>
<!-- 角色模式说明弹窗 - Apple 风格 -->
<div v-if="showRoleModeInfo"
class="fixed inset-0 bg-black/50 dark:bg-black/60 backdrop-blur-sm z-[70] flex items-center justify-center p-4"
@click="showRoleModeInfo = false">
<div class="w-full max-w-md bg-white/95 dark:bg-[#1e1e1e]/95 backdrop-blur-[40px] backdrop-saturate-[180%] border border-black/10 dark:border-white/10 rounded-3xl shadow-[0_20px_60px_rgba(0,0,0,0.2)] dark:shadow-[0_20px_60px_rgba(0,0,0,0.6)] overflow-hidden"
@click.stop>
<!-- 弹窗头部 -->
<div class="flex items-center justify-between px-6 py-4 border-b border-black/8 dark:border-white/8 bg-white/50 dark:bg-[#1e1e1e]/50 backdrop-blur-[20px]">
<h3 class="text-lg font-semibold text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight">
{{ t('roleModeInfo.title') }}
</h3>
<button @click="showRoleModeInfo = false"
class="w-8 h-8 flex items-center justify-center bg-white/80 dark:bg-[#2c2c2e]/80 border border-black/8 dark:border-white/8 text-[#86868b] dark:text-[#98989d] hover:text-red-500 dark:hover:text-red-400 hover:bg-white dark:hover:bg-[#3a3a3c] rounded-full transition-all duration-200 hover:scale-110 active:scale-100"
:title="t('close')">
<i class="fas fa-times text-sm"></i>
</button>
</div>
<!-- 弹窗内容 -->
<div class="p-6 space-y-6">
<!-- 单角色模式说明 -->
<div class="space-y-3">
<h4 class="text-base font-semibold text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight flex items-center gap-2">
<i class="fas fa-user text-sm text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]"></i>
{{ t('roleModeInfo.singleMode.title') }}
</h4>
<ul class="space-y-2 pl-6">
<li v-for="(point, index) in tm('roleModeInfo.singleMode.points')" :key="index"
class="text-sm text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight leading-relaxed flex items-start gap-2">
<span class="text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)] mt-1.5 flex-shrink-0"></span>
<span>{{ point }}</span>
</li>
</ul>
</div>
<!-- 多角色模式说明 -->
<div class="space-y-3">
<h4 class="text-base font-semibold text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight flex items-center gap-2">
<i class="fas fa-users text-sm text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]"></i>
{{ t('roleModeInfo.multiMode.title') }}
</h4>
<ul class="space-y-2 pl-6">
<li v-for="(point, index) in tm('roleModeInfo.multiMode.points')" :key="index"
class="text-sm text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight leading-relaxed flex items-center gap-2">
<span class="text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)] flex-shrink-0"></span>
<span>{{ point }}</span>
</li>
</ul>
</div>
</div>
</div>
</div>
<!-- 保存角色加载提示 -->
<div v-if="faceSaving" class="flex items-center justify-center gap-2 text-sm text-[#86868b] dark:text-[#98989d] tracking-tight mb-4">
<i class="fas fa-spinner fa-spin text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]"></i>
<span>正在保存角色并更新音频...</span>
</div>
<!-- 角色和音频配对区域 - 每行一个配对(仅在多角色模式且有角色时显示) -->
<div v-if="isMultiRoleMode && currentDetectedFaces && currentDetectedFaces.length > 0" class="flex flex-col items-center space-y-3">
<div
v-for="(face, index) in currentDetectedFaces"
:key="index"
class="flex items-stretch gap-4"
:class="{
'border-[color:var(--brand-primary)]/50 dark:border-[color:var(--brand-primary-light)]/50': dragOverRoleIndex === index || dragOverAudioIndex === index
}"
>
<!-- 左侧:角色卡片 -->
<div
class="w-85 bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-xl p-3 transition-all duration-200 hover:bg-white dark:hover:bg-[#3a3a3c] hover:border-black/12 dark:hover:border-white/12 hover:shadow-[0_4px_12px_rgba(0,0,0,0.08)] dark:hover:shadow-[0_4px_12px_rgba(0,0,0,0.2)]"
:class="{
'border-[color:var(--brand-primary)]/50 dark:border-[color:var(--brand-primary-light)]/50 bg-[color:var(--brand-primary)]/5 dark:bg-[color:var(--brand-primary-light)]/10': dragOverRoleIndex === index,
'opacity-40 scale-95 shadow-lg': draggedRoleIndex === index,
'transform translate-y-0': draggedRoleIndex !== index
}"
@dragover.prevent="onRoleDragOver($event, index)"
@dragleave="onRoleDragLeave"
@drop="onRoleDrop($event, index)"
>
<!-- 角色区域 - 可拖拽 -->
<div
class="flex items-center justify-between gap-2 h-full w-full transition-all duration-200"
:class="{
'opacity-50 scale-95': draggedRoleIndex === index,
'opacity-100': draggedRoleIndex !== index
}"
:draggable="true"
@dragstart="onRoleDragStart($event, index)"
@dragend="draggedRoleIndex = -1; dragOverRoleIndex = -1"
>
<!-- 左侧:拖拽手柄和角色名 -->
<div class="flex items-center gap-2 flex-1 min-w-0">
<!-- 拖拽手柄 -->
<div class="cursor-move text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] transition-colors">
<i class="fas fa-grip-vertical text-sm"></i>
</div>
<!-- 角色名显示/编辑 -->
<div class="flex items-center">
<!-- 编辑模式 -->
<input
v-if="face.isEditing"
type="text"
:value="face.roleName"
:data-face-index="index"
@input="updateFaceRoleName(index, $event.target.value)"
@blur="saveFaceRoleName(index, $event.target.value)"
@keyup.enter="saveFaceRoleName(index, $event.target.value)"
@keyup.esc="toggleFaceEditing(index)"
:ref="(el) => { if (el && face.isEditing) { nextTick(() => el.focus()); } }"
class="w-24 px-2 py-1.5 text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] bg-white/80 dark:bg-[#2c2c2e]/80 border border-[color:var(--brand-primary)]/50 dark:border-[color:var(--brand-primary-light)]/60 rounded-lg focus:outline-none focus:ring-2 focus:ring-[color:var(--brand-primary)]/20 dark:focus:ring-[color:var(--brand-primary-light)]/20 transition-all duration-200"
:placeholder="`角色${index + 1}`"
@click.stop>
<!-- 显示模式 - 可点击编辑 -->
<span
v-else
@click.stop="toggleFaceEditing(index)"
class="w-24 px-2 py-1.5 text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] truncate tracking-tight cursor-text hover:bg-[color:var(--brand-primary)]/10 dark:hover:bg-[color:var(--brand-primary-light)]/15 hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] rounded transition-colors duration-200"
>
{{ face.roleName || `角色${index + 1}` }}
</span>
</div>
</div>
<!-- 右侧头像编辑按钮和删除按钮 -->
<div class="flex items-center gap-2 flex-shrink-0">
<!-- 角色头像容器 - 相对定位用于放置编辑按钮 -->
<div class="relative flex-shrink-0">
<!-- 角色头像 - 可点击 -->
<div
@click.stop="openFaceEditModal(index)"
class="flex-shrink-0 w-14 h-14 rounded-lg overflow-hidden border border-black/8 dark:border-white/8 bg-black/5 dark:bg-white/5 cursor-pointer hover:border-[color:var(--brand-primary)]/50 dark:hover:border-[color:var(--brand-primary-light)]/50 transition-all duration-200 hover:scale-105"
>
<img v-if="face.face_image"
:src="'data:image/png;base64,' + face.face_image"
alt="Face"
class="w-full h-full object-cover"
@error="(e) => { console.error('Face image load error:', index, e); e.target.style.display = 'none'; }">
<div v-else class="w-full h-full flex items-center justify-center text-[#86868b] dark:text-[#98989d] text-xs">
<i class="fas fa-image"></i>
</div>
</div>
<!-- 编辑按钮 - 放在头像右上角 -->
<button
v-if="!face.isEditing"
@click.stop="openFaceEditModal(index)"
class="absolute -top-1 -right-1 w-5 h-5 flex items-center justify-center bg-white/95 dark:bg-[#2c2c2e]/95 backdrop-blur-[10px] border border-black/8 dark:border-white/8 text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] rounded-full transition-all duration-200 hover:scale-110 shadow-sm"
:title="t('edit') || '编辑'">
<i class="fas fa-edit text-xs"></i>
</button>
<!-- 保存按钮 -->
<button
v-else
@click.stop="() => {
const inputEl = document.querySelector(`input[data-face-index='${index}']`);
const newRoleName = inputEl?.value || face.roleName;
saveFaceRoleName(index, newRoleName);
}"
class="absolute -top-1 -right-1 w-5 h-5 flex items-center justify-center bg-[color:var(--brand-primary)]/90 dark:bg-[color:var(--brand-primary-light)]/90 text-white rounded-full transition-all duration-200 hover:scale-110 shadow-sm"
:title="t('save') || '保存'">
<i class="fas fa-check text-xs"></i>
</button>
</div>
<!-- 删除按钮 -->
<button
@click.stop="removeFace(index)"
class="flex-shrink-0 w-6 h-6 flex items-center justify-center text-red-500 dark:text-red-400 hover:text-red-600 dark:hover:text-red-300 hover:bg-red-50 dark:hover:bg-red-900/20 rounded transition-all duration-200"
:title="t('delete') || '删除'">
<i class="fas fa-trash text-xs"></i>
</button>
</div>
</div>
</div>
<!-- 中间链接符号 -->
<div class="flex items-center justify-center flex-shrink-0">
<div class="w-8 h-8 flex items-center justify-center text-[#86868b] dark:text-[#98989d]">
<i class="fas fa-link text-lg"></i>
</div>
</div>
<!-- 右侧音频卡片 -->
<div
v-if="currentSeparatedAudios && currentSeparatedAudios.length > index"
class="w-85 bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-xl p-3 transition-all duration-200 hover:bg-white dark:hover:bg-[#3a3a3c] hover:border-black/12 dark:hover:border-white/12 hover:shadow-[0_4px_12px_rgba(0,0,0,0.08)] dark:hover:shadow-[0_4px_12px_rgba(0,0,0,0.2)]"
:class="{
'border-[color:var(--brand-primary)]/50 dark:border-[color:var(--brand-primary-light)]/50 bg-[color:var(--brand-primary)]/5 dark:bg-[color:var(--brand-primary-light)]/10': dragOverAudioIndex === index,
'opacity-40 scale-95 shadow-lg': draggedAudioIndex === index,
'transform translate-y-0': draggedAudioIndex !== index
}"
@dragover.prevent="onAudioDragOver($event, index)"
@dragleave="onAudioDragLeave"
@drop="onAudioDrop($event, index)"
>
<!-- 音频区域 - 可拖拽 -->
<div
class="flex items-center gap-2 h-full transition-all duration-200"
:class="{
'opacity-50 scale-95': draggedAudioIndex === index,
'opacity-100': draggedAudioIndex !== index
}"
:draggable="true"
@dragstart="onAudioDragStart($event, index)"
@dragend="draggedAudioIndex = -1; dragOverAudioIndex = -1"
>
<!-- 拖拽手柄 -->
<div class="cursor-move text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] transition-colors">
<i class="fas fa-grip-vertical text-sm"></i>
</div>
<!-- 音色名显示/编辑 -->
<div class="flex items-center">
<!-- 编辑模式 -->
<input
v-if="currentSeparatedAudios[index].isEditing"
type="text"
:value="currentSeparatedAudios[index].audioName"
@input="updateSeparatedAudioName(index, $event.target.value)"
@blur="saveSeparatedAudioName(index, $event.target.value)"
@keyup.enter="saveSeparatedAudioName(index, $event.target.value)"
@keyup.esc="toggleSeparatedAudioEditing(index)"
:ref="(el) => { if (el && currentSeparatedAudios[index].isEditing) { nextTick(() => el.focus()); } }"
class="w-24 px-2 py-1 text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] bg-white/80 dark:bg-[#2c2c2e]/80 border border-[color:var(--brand-primary)]/50 dark:border-[color:var(--brand-primary-light)]/60 rounded-lg focus:outline-none focus:ring-2 focus:ring-[color:var(--brand-primary)]/20 dark:focus:ring-[color:var(--brand-primary-light)]/20 transition-all duration-200"
:placeholder="`音色${index + 1}`"
@click.stop>
<!-- 显示模式 - 可点击编辑 -->
<span
v-else
@click.stop="toggleSeparatedAudioEditing(index)"
class="w-24 px-2 py-1 text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] tracking-tight truncate cursor-text hover:bg-[color:var(--brand-primary)]/10 dark:hover:bg-[color:var(--brand-primary-light)]/15 hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] rounded transition-colors duration-200"
>
{{ currentSeparatedAudios[index].audioName || `音色${index + 1}` }}
</span>
</div>
<!-- 音频播放器 -->
<div class="flex items-center gap-2 justify-center flex-shrink-0">
<!-- 播放/暂停按钮 -->
<button
@click="toggleSeparatedAudioPlayback(index)"
class="flex-shrink-0 w-10 h-10 bg-[color:var(--brand-primary)]/90 dark:bg-[color:var(--brand-primary-light)]/90 rounded-full flex items-center justify-center text-white cursor-pointer hover:scale-110 transition-all duration-200 shadow-[0_2px_8px_rgba(var(--brand-primary-rgb),0.3)] dark:shadow-[0_2px_8px_rgba(var(--brand-primary-light-rgb),0.4)]"
>
<i :class="getSeparatedAudioPlaying(index) ? 'fas fa-pause' : 'fas fa-play'" class="text-xs ml-0.5"></i>
</button>
<!-- 右侧时长和进度条 -->
<div class="flex flex-col justify-center" style="gap: 2px;">
<!-- 音频时长 - 显示在进度条上方 -->
<div class="text-xs font-medium text-[#86868b] dark:text-[#98989d] tracking-tight text-center" style="width: 128px;">
{{ formatAudioPreviewTime(getSeparatedAudioCurrentTime(index)) }} / {{ formatAudioPreviewTime(getSeparatedAudioDuration(index)) }}
</div>
<!-- 进度条 -->
<div class="w-32" v-if="getSeparatedAudioDuration(index) > 0">
<input
type="range"
:min="0"
:max="getSeparatedAudioDuration(index)"
:value="getSeparatedAudioCurrentTime(index)"
@input="(e) => onSeparatedAudioProgressChange(index, e)"
@change="(e) => onSeparatedAudioProgressChange(index, e)"
@mousedown="separatedAudioIsDragging[index] = true"
@mouseup="(e) => onSeparatedAudioProgressEnd(index, e)"
class="w-full h-1 bg-black/6 dark:bg-white/15 rounded-full appearance-none cursor-pointer [&::-webkit-slider-thumb]:appearance-none [&::-webkit-slider-thumb]:w-3 [&::-webkit-slider-thumb]:h-3 [&::-webkit-slider-thumb]:bg-[color:var(--brand-primary)] dark:[&::-webkit-slider-thumb]:bg-[color:var(--brand-primary-light)] [&::-webkit-slider-thumb]:rounded-full [&::-webkit-slider-thumb]:cursor-pointer"
/>
</div>
</div>
<!-- 隐藏的音频元素 -->
<audio
:ref="el => { if (el) separatedAudioElements[index] = el }"
:src="currentSeparatedAudios[index].audioDataUrl"
@loadedmetadata="() => onSeparatedAudioLoaded(index)"
@timeupdate="() => onSeparatedAudioTimeUpdate(index)"
@ended="() => onSeparatedAudioEnded(index)"
@play="() => separatedAudioPlaying[index] = true"
@pause="() => separatedAudioPlaying[index] = false"
@error="() => handleSeparatedAudioError(index)"
class="hidden"
></audio>
</div>
</div>
</div>
<!-- 音频占位符如果没有对应的分离音频 -->
<div
v-else
class="w-85 bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-xl p-3 flex items-center justify-center text-sm text-[#86868b] dark:text-[#98989d] tracking-tight"
>
<span>{{ t('waitingForMultipleRolesAudio') }}</span>
</div>
</div>
</div>
<!-- 新增角色按钮 -->
<div v-if="selectedTaskId === 's2v' && getCurrentImagePreview() && isMultiRoleMode" class="flex justify-center mt-4">
<button
@click="openFaceEditModal(-1)"
class="w-8 h-8 flex items-center justify-center bg-[#86868b]/20 dark:bg-[#98989d]/20 text-[#86868b] dark:text-[#98989d] rounded-full transition-all duration-200 hover:bg-[#86868b]/30 dark:hover:bg-[#98989d]/30 hover:scale-110 active:scale-100"
:title="t('addRole') || '新增角色'"
>
<i class="fas fa-plus text-sm"></i>
</button>
</div>
</div>
<!-- 提示词输入区域 - Apple 风格animate 任务类型不显示 -->
<div v-if="selectedTaskId !== 'animate'">
<div class="mt-8 space-y-3 flex justify-between items-center mb-3">
<label class="text-sm font-medium text-[#1d1d1f] dark:text-[#f5f5f7] flex items-center tracking-tight">
{{ t('prompt') }}
<button @click="showPromptModal = true; promptModalTab = 'templates'"
class="ml-2 text-xs text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] transition-colors"
:title="t('promptTemplates')">
<i class="fas fa-lightbulb text-lg"></i>
</button>
</label>
<div class="text-xs text-[#86868b] dark:text-[#98989d] tracking-tight">
{{ getCurrentForm().prompt?.length || 0 }} / 1000
</div>
</div>
<div class="relative">
<textarea v-model="getCurrentForm().prompt"
class="relative w-full bg-white/80 dark:bg-[#2c2c2e]/80 backdrop-blur-[20px] border border-black/8 dark:border-white/8 rounded-2xl px-5 py-4 text-[15px] text-[#1d1d1f] dark:text-[#f5f5f7] transition-all duration-200 resize-none main-scrollbar placeholder-[#86868b] dark:placeholder-[#98989d] tracking-tight hover:bg-white dark:hover:bg-[#3a3a3c] hover:border-black/12 dark:hover:border-white/12 focus:outline-none focus:border-[color:var(--brand-primary)]/50 dark:focus:border-[color:var(--brand-primary-light)]/60 focus:shadow-[0_4px_16px_rgba(var(--brand-primary-rgb),0.12)] dark:focus:shadow-[0_4px_16px_rgba(var(--brand-primary-light-rgb),0.2)]"
:placeholder="getPromptPlaceholder()"
rows="2"
maxlength="1000"
required></textarea>
</div>
<div class="flex justify-between items-center mt-3">
<button @click="clearPrompt"
class="flex items-center text-sm rounded-lg px-3 py-1.5 transition-all duration-200 text-[#86868b] dark:text-[#98989d] hover:text-[color:var(--brand-primary)] dark:hover:text-[color:var(--brand-primary-light)] hover:bg-black/4 dark:hover:bg-white/6 group tracking-tight">
<i class="fas fa-sync-alt text-sm mr-2 group-hover:rotate-180 transition-transform duration-300"></i>
{{ t('clear') }}
</button>
</div>
</div>
<!-- 提交按钮 - Apple 极简风格 -->
<div class="flex justify-center mt-8">
......@@ -1231,7 +3179,7 @@ onUnmounted(() => {
<i v-if="submitting" class="fas fa-spinner fa-spin text-lg mr-2 text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]"></i>
<i v-else-if="templateLoading" class="fas fa-spinner fa-spin text-lg mr-2 text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)]"></i>
<i v-else class="fi fi-sr-cursor-finger-click text-lg text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)] transition-all duration-200 pointer-events-none"></i>
<i v-else class="fi fi-sr-select text-lg text-[color:var(--brand-primary)] dark:text-[color:var(--brand-primary-light)] transition-all duration-200 pointer-events-none"></i>
<span class="pl-2 text-base font-semibold transition-all duration-200 pointer-events-none">{{ submitting ? t('submitting') : templateLoading ? '模板加载中...' : t('generateVideo') }}</span>
</button>
</div>
......@@ -1319,7 +3267,7 @@ onUnmounted(() => {
<!-- 视频预览 -->
<video v-if="item?.outputs?.output_video"
:src="getTemplateFileUrl(item.outputs.output_video,'videos')"
:poster="getTemplateFileUrl(item.inputs.input_image,'images')"
:poster="item?.inputs?.input_image ? getTemplateFileUrl(item.inputs.input_image,'images') : undefined"
class="w-full h-auto object-contain group-hover:scale-[1.02] transition-transform duration-200"
preload="auto" playsinline webkit-playsinline
@mouseenter="playVideo($event)" @mouseleave="pauseVideo($event)"
......@@ -1327,12 +3275,16 @@ onUnmounted(() => {
@ended="handleMasonryVideoEnded($event)"
@error="handleMasonryVideoError($event)"></video>
<!-- 图片缩略图 -->
<img v-else
<img v-else-if="item?.inputs?.input_image"
:src="getTemplateFileUrl(item.inputs.input_image,'images')"
:alt="item.params?.prompt || '模板图片'"
class="w-full h-auto object-contain group-hover:scale-[1.02] transition-transform duration-200"
@load="handleMasonryImageLoaded"
@error="handleMasonryImageError" />
<!-- 如果没有图片显示占位符 -->
<div v-else class="w-full h-[200px] flex items-center justify-center bg-[#f5f5f7] dark:bg-[#1c1c1e]">
<i class="fas fa-image text-3xl text-[#86868b]/30 dark:text-[#98989d]/30"></i>
</div>
<!-- 移动端播放按钮 - Apple 风格 -->
<button v-if="item?.outputs?.output_video"
@click.stop="toggleVideoPlay($event)"
......@@ -1368,6 +3320,195 @@ onUnmounted(() => {
</div>
</div>
<!-- 脸部编辑模态框 - 显示放大图片和可拖拽的边界框 -->
<div v-if="showFaceEditModal"
class="fixed inset-0 z-[9999] flex items-center justify-center bg-black/60 dark:bg-black/80 backdrop-blur-sm"
@click="closeFaceEditModal">
<div
@click.stop
class="relative bg-white/95 dark:bg-[#2c2c2e]/95 backdrop-blur-[20px] rounded-2xl p-6 max-w-4xl max-h-[90vh] overflow-auto shadow-[0_12px_32px_rgba(0,0,0,0.6)] dark:shadow-[0_12px_32px_rgba(0,0,0,0.8)]">
<!-- 关闭按钮 -->
<button
@click="closeFaceEditModal"
class="absolute top-4 right-4 w-10 h-10 flex items-center justify-center bg-white/80 dark:bg-[#2c2c2e]/80 border border-black/8 dark:border-white/8 text-[#86868b] dark:text-[#98989d] hover:text-[#1d1d1f] dark:hover:text-[#f5f5f7] hover:bg-white dark:hover:bg-[#3a3a3c] rounded-full transition-all duration-200 z-10"
:title="t('close') || '关闭'">
<i class="fas fa-times text-sm"></i>
</button>
<!-- 标题 -->
<h3 class="text-xl font-semibold text-[#1d1d1f] dark:text-[#f5f5f7] mb-4 tracking-tight">
{{ isAddingNewFace ? (t('addNewRole') || '新增角色') : (t('adjustFaceBox') || '调整人脸边界框') }}
</h3>
<!-- 图片容器 -->
<div
ref="imageContainerRef"
class="relative inline-block max-w-full"
:style="imageNaturalSize.width > 0 && imageNaturalSize.height > 0 && !imageLoaded ? {
width: `${Math.min(imageNaturalSize.width, 800)}px`,
height: `${Math.min(imageNaturalSize.height, 600)}px`,
aspectRatio: `${imageNaturalSize.width} / ${imageNaturalSize.height}`
} : {}">
<!-- 占位符 - 图片加载前显示 -->
<div
v-show="!imageLoaded"
class="absolute inset-0 w-full h-full min-w-[400px] min-h-[300px] bg-[#f5f5f7] dark:bg-[#1e1e1e] rounded-xl flex items-center justify-center z-10">
<div class="flex flex-col items-center gap-3">
<i class="fas fa-spinner fa-spin text-2xl text-[#86868b] dark:text-[#98989d]"></i>
<span class="text-sm text-[#86868b] dark:text-[#98989d]">{{ t('loading') || '加载中...' }}</span>
</div>
</div>
<!-- 实际图片 - 始终渲染但加载完成后才显示 -->
<img
:src="originalImageUrl"
alt="Face Edit"
class="max-w-full max-h-[70vh] h-auto object-contain rounded-xl"
:class="{ 'opacity-0': !imageLoaded }"
@load="handleFaceEditImageLoad"
@error="handleFaceEditImageError">
<!-- 遮罩层 - 框外区域变暗 -->
<svg
v-if="editingFaceBbox.length === 4 && getBboxStyle.left"
class="absolute inset-0 pointer-events-none z-[5]"
:style="{
left: 0,
top: 0,
width: '100%',
height: '100%'
}">
<defs>
<mask id="bbox-mask">
<rect width="100%" height="100%" fill="white"/>
<rect
:x="getBboxStyle.left"
:y="getBboxStyle.top"
:width="getBboxStyle.width"
:height="getBboxStyle.height"
fill="black"/>
</mask>
</defs>
<rect
width="100%"
height="100%"
fill="rgba(0,0,0,0.5)"
mask="url(#bbox-mask)"/>
</svg>
<!-- 角色名字标签 - 显示在边界框上方 -->
<div
v-if="editingFaceBbox.length === 4 && getBboxStyle.left && getRoleNameLabelStyle.roleName"
:style="{
left: getRoleNameLabelStyle.left,
top: getRoleNameLabelStyle.top,
transform: getRoleNameLabelStyle.transform
}"
class="absolute px-2 py-1 text-xs font-medium text-white bg-[color:var(--brand-primary)]/90 dark:bg-[color:var(--brand-primary-light)]/90 rounded-md shadow-lg whitespace-nowrap pointer-events-none z-10">
{{ getRoleNameLabelStyle.roleName }}
</div>
<!-- 边界框 -->
<div
v-if="editingFaceBbox.length === 4 && getBboxStyle.left"
:style="getBboxStyle"
@mousedown="(e) => startDragBbox(e, 'move')"
class="absolute border-2 border-[color:var(--brand-primary)] dark:border-[color:var(--brand-primary-light)] cursor-move bg-transparent hover:bg-[color:var(--brand-primary)]/5 dark:hover:bg-[color:var(--brand-primary-light)]/5 transition-colors duration-200"
:class="{ 'ring-2 ring-[color:var(--brand-primary)]/50 dark:ring-[color:var(--brand-primary-light)]/50 bg-[color:var(--brand-primary)]/10 dark:bg-[color:var(--brand-primary-light)]/10': isDraggingBbox }"
style="box-sizing: border-box;">
<!-- 四个角的拖拽手柄 -->
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-nw')"
:style="{
width: `${getBboxStyle.indicatorSize || 16}px`,
height: `${getBboxStyle.indicatorSize || 16}px`,
left: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
top: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] rounded-full border-2 border-white dark:border-[#2c2c2e] shadow-lg cursor-nw-resize hover:scale-110 transition-transform z-20"></div>
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-ne')"
:style="{
width: `${getBboxStyle.indicatorSize || 16}px`,
height: `${getBboxStyle.indicatorSize || 16}px`,
right: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
top: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] rounded-full border-2 border-white dark:border-[#2c2c2e] shadow-lg cursor-ne-resize hover:scale-110 transition-transform z-20"></div>
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-sw')"
:style="{
width: `${getBboxStyle.indicatorSize || 16}px`,
height: `${getBboxStyle.indicatorSize || 16}px`,
left: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
bottom: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] rounded-full border-2 border-white dark:border-[#2c2c2e] shadow-lg cursor-sw-resize hover:scale-110 transition-transform z-20"></div>
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-se')"
:style="{
width: `${getBboxStyle.indicatorSize || 16}px`,
height: `${getBboxStyle.indicatorSize || 16}px`,
right: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
bottom: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] rounded-full border-2 border-white dark:border-[#2c2c2e] shadow-lg cursor-se-resize hover:scale-110 transition-transform z-20"></div>
<!-- 四个边的拖拽手柄 -->
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-n')"
:style="{
width: 'calc(100% + 16px)',
height: `${getBboxStyle.indicatorSize || 16}px`,
left: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
top: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute cursor-n-resize hover:bg-[color:var(--brand-primary)]/20 dark:hover:bg-[color:var(--brand-primary-light)]/20 transition-colors rounded-t z-10"></div>
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-s')"
:style="{
width: 'calc(100% + 16px)',
height: `${getBboxStyle.indicatorSize || 16}px`,
left: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
bottom: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute cursor-s-resize hover:bg-[color:var(--brand-primary)]/20 dark:hover:bg-[color:var(--brand-primary-light)]/20 transition-colors rounded-b z-10"></div>
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-w')"
:style="{
width: `${getBboxStyle.indicatorSize || 16}px`,
height: 'calc(100% + 16px)',
left: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
top: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute cursor-w-resize hover:bg-[color:var(--brand-primary)]/20 dark:hover:bg-[color:var(--brand-primary-light)]/20 transition-colors rounded-l z-10"></div>
<div
@mousedown.stop="(e) => startDragBbox(e, 'resize-e')"
:style="{
width: `${getBboxStyle.indicatorSize || 16}px`,
height: 'calc(100% + 16px)',
right: `${-(getBboxStyle.indicatorSize || 16) / 2}px`,
top: `${-(getBboxStyle.indicatorSize || 16) / 2}px`
}"
class="absolute cursor-e-resize hover:bg-[color:var(--brand-primary)]/20 dark:hover:bg-[color:var(--brand-primary-light)]/20 transition-colors rounded-r z-10"></div>
</div>
</div>
<!-- 操作按钮 -->
<div class="flex items-center justify-end gap-3 mt-6">
<button
@click="closeFaceEditModal"
class="px-4 py-2 text-sm font-medium text-[#86868b] dark:text-[#98989d] hover:text-[#1d1d1f] dark:hover:text-[#f5f5f7] hover:bg-black/4 dark:hover:bg-white/6 rounded-lg transition-all duration-200 tracking-tight">
{{ t('cancel') || '取消' }}
</button>
<button
@click="saveFaceBbox"
class="px-4 py-2 text-sm font-medium text-white bg-[color:var(--brand-primary)] dark:bg-[color:var(--brand-primary-light)] hover:opacity-90 rounded-lg transition-all duration-200 tracking-tight">
{{ t('save') || '保存' }}
</button>
</div>
</div>
</div>
</template>
......
......@@ -220,7 +220,7 @@ onMounted(() => {
<!-- 视频预览 -->
<video v-if="item?.outputs?.output_video"
:src="getTemplateFileUrl(item.outputs.output_video,'videos')"
:poster="getTemplateFileUrl(item.inputs.input_image,'images')"
:poster="item?.inputs?.input_image ? getTemplateFileUrl(item.inputs.input_image,'images') : undefined"
class="w-full h-auto object-contain group-hover:scale-[1.02] transition-transform duration-200"
preload="auto" playsinline webkit-playsinline
@mouseenter="playVideo($event)" @mouseleave="pauseVideo($event)"
......@@ -228,7 +228,7 @@ onMounted(() => {
@ended="onVideoEnded($event)"
@error="onVideoError($event)"></video>
<!-- 图片缩略图 -->
<img v-else
<img v-else-if="item?.inputs?.input_image"
:src="getTemplateFileUrl(item.inputs.input_image,'images')"
:alt="item.params?.prompt || '模板图片'"
class="w-full h-auto object-contain group-hover:scale-[1.02] transition-transform duration-200"
......