Commit 1b0743a5 authored by sandy, committed by GitHub

Support sekotalk multiperson (#321)


Co-authored-by: PengGao <peng.gaoc@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yang Yong (雍洋) <yongyang1030@163.com>
parent 6b7a3cad
import gc
import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
import torchaudio as ta
import torchvision.transforms.functional as TF
from PIL import Image

@@ -28,6 +31,9 @@ from lightx2v.utils.profiler import *
from lightx2v.utils.registry_factory import RUNNER_REGISTER
from lightx2v.utils.utils import find_torch_model_path, load_weights, vae_to_comfyui_image_inplace
warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio")
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision.io")
def get_optimal_patched_size_with_sp(patched_h, patched_w, sp_size):
    assert sp_size > 0 and (sp_size & (sp_size - 1)) == 0, "sp_size must be a power of 2"
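The assertion uses the standard bit trick for detecting powers of two: n & (n - 1) clears the lowest set bit, so the result is zero exactly when n has a single bit set. A quick illustration:

# Editor's illustration of the power-of-two check (not part of the diff).
for n in [1, 2, 3, 4, 6, 8]:
    print(n, n > 0 and (n & (n - 1)) == 0)
# -> 1 True, 2 True, 3 False, 4 True, 6 False, 8 True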
@@ -192,7 +198,7 @@ def resize_image(img, resize_mode="adaptive", bucket_shape=None, fixed_area=None

class AudioSegment:
    """Data class for audio segment information"""

    audio_array: torch.Tensor
    start_frame: int
    end_frame: int
@@ -248,34 +254,55 @@ class AudioProcessor:
        self.target_fps = target_fps
        self.audio_frame_rate = audio_sr // target_fps

    def load_audio(self, audio_path: str):
        """Load and resample audio"""
        audio_array, ori_sr = ta.load(audio_path)
        audio_array = ta.functional.resample(audio_array.mean(0), orig_freq=ori_sr, new_freq=self.audio_sr)
        return audio_array
    def load_multi_person_audio(self, audio_paths: List[str]):
        audio_arrays = []
        max_len = 0
        for audio_path in audio_paths:
            audio_array = self.load_audio(audio_path)
            audio_arrays.append(audio_array)
            max_len = max(max_len, audio_array.numel())

        num_files = len(audio_arrays)
        padded = torch.zeros(num_files, max_len, dtype=torch.float32)
        for i, arr in enumerate(audio_arrays):
            length = arr.numel()
            padded[i, :length] = arr
        return padded
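A minimal sketch of the padding contract above, using two hypothetical mono tracks of unequal length rather than real files:

# Editor's sketch: load_multi_person_audio zero-pads every track to the
# longest one, producing an (N, T) batch. Tensors here are hypothetical.
import torch

track_a = torch.randn(16000)   # 1.0 s at 16 kHz
track_b = torch.randn(24000)   # 1.5 s at 16 kHz

padded = torch.zeros(2, 24000, dtype=torch.float32)
padded[0, : track_a.numel()] = track_a
padded[1, : track_b.numel()] = track_b
assert padded.shape == (2, 24000)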
    def get_audio_range(self, start_frame: int, end_frame: int) -> Tuple[int, int]:
        """Calculate audio range for given frame range"""
        return round(start_frame * self.audio_frame_rate), round(end_frame * self.audio_frame_rate)

    def segment_audio(self, audio_array: torch.Tensor, expected_frames: int, max_num_frames: int, prev_frame_length: int = 5) -> List[AudioSegment]:
        """
        Segment audio based on frame requirements.
        audio_array is an (N, T) tensor.
        """
        segments = []
        segments_idx = self.init_segments_idx(expected_frames, max_num_frames, prev_frame_length)
        audio_start, audio_end = self.get_audio_range(0, expected_frames)
        audio_array_ori = audio_array[:, audio_start:audio_end]
        for idx, (start_idx, end_idx) in enumerate(segments_idx):
            audio_start, audio_end = self.get_audio_range(start_idx, end_idx)
            audio_array = audio_array_ori[:, audio_start:audio_end]
            if idx < len(segments_idx) - 1:
                end_idx = segments_idx[idx + 1][0]
            else:  # for the last segment
                if audio_array.shape[1] < audio_end - audio_start:
                    padding_len = audio_end - audio_start - audio_array.shape[1]
                    audio_array = F.pad(audio_array, (0, padding_len))
                    # Adjust end_idx to account for the frames added by padding
                    end_idx = end_idx - padding_len // self.audio_frame_rate
            segments.append(AudioSegment(audio_array, start_idx, end_idx))
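To make the last-segment bookkeeping concrete, here is the padding arithmetic with illustrative numbers (audio_sr=16000, target_fps=16, so audio_frame_rate is 1000 samples per frame):

# Editor's worked example of the padding branch; all numbers illustrative.
audio_frame_rate = 16000 // 16          # 1000 samples per video frame
audio_start, audio_end = 60000, 81000   # window for frames [60, 81)
available = 18500                       # samples actually left in the track

padding_len = (audio_end - audio_start) - available   # 2500 samples padded
end_idx = 81 - padding_len // audio_frame_rate        # 81 - 2 = 79
# The segment is zero-padded to full length, and end_idx is pulled back
# so the padded tail is not counted as real video frames.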
@@ -311,23 +338,108 @@ class WanAudioRunner(WanRunner):  # type:ignore
        self.scheduler = EulerScheduler(self.config)

    def read_audio_input(self):
        """Read audio input - handles both single and multi-person scenarios"""
        audio_sr = self.config.get("audio_sr", 16000)
        target_fps = self.config.get("target_fps", 16)
        self._audio_processor = AudioProcessor(audio_sr, target_fps)

        # Get audio files from person objects or legacy format
        audio_files = self._get_audio_files_from_config()
        if not audio_files:
            return 0, [], 0

        # Load audio based on single or multi-person mode
        if len(audio_files) == 1:
            audio_array = self._audio_processor.load_audio(audio_files[0])
            audio_array = audio_array.unsqueeze(0)  # Add batch dimension for consistency
        else:
            audio_array = self._audio_processor.load_multi_person_audio(audio_files)
        self.config.audio_num = audio_array.size(0)

        video_duration = self.config.get("video_duration", 5)
        audio_len = int(audio_array.shape[1] / audio_sr * target_fps)
        expected_frames = min(max(1, int(video_duration * target_fps)), audio_len)

        # Segment audio
        audio_segments = self._audio_processor.segment_audio(audio_array, expected_frames, self.config.get("target_video_length", 81), self.prev_frame_length)
        return audio_array.size(0), audio_segments, expected_frames
    def _get_audio_files_from_config(self):
        talk_objects = self.config.get("talk_objects")
        if talk_objects:
            audio_files = []
            for idx, person in enumerate(talk_objects):
                audio_path = person.get("audio")
                if audio_path and Path(audio_path).is_file():
                    audio_files.append(str(audio_path))
                else:
                    logger.warning(f"Person {idx} audio file {audio_path} does not exist or is not specified")
            if audio_files:
                logger.info(f"Loaded {len(audio_files)} audio files from talk_objects")
                return audio_files

        audio_path = self.config.get("audio_path")
        if audio_path:
            return [audio_path]

        logger.error("Neither audio_path nor talk_objects is specified in the config")
        return []
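For reference, a hypothetical sketch of the config shape this helper reads; the field names match the code above, but the paths are made up:

# Editor's sketch of an assumed multi-person config (paths hypothetical):
config = {
    "talk_objects": [
        {"audio": "/data/person0.wav", "mask": "/data/person0_mask.png"},
        {"audio": "/data/person1.wav", "mask": "/data/person1_mask.png"},
    ],
    # Legacy single-person fallback, used when talk_objects is absent:
    "audio_path": "/data/single_speaker.wav",
}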
    def read_person_mask(self):
        mask_files = self._get_mask_files_from_config()
        if not mask_files:
            return None

        mask_latents = []
        for mask_file in mask_files:
            mask_latent = self._process_single_mask(mask_file)
            mask_latents.append(mask_latent)
        mask_latents = torch.cat(mask_latents, dim=0)
        return mask_latents
    def _get_mask_files_from_config(self):
        talk_objects = self.config.get("talk_objects")
        if talk_objects:
            mask_files = []
            for idx, person in enumerate(talk_objects):
                mask_path = person.get("mask")
                if mask_path and Path(mask_path).is_file():
                    mask_files.append(str(mask_path))
                elif mask_path:
                    logger.warning(f"Person {idx} mask file {mask_path} does not exist")
            if mask_files:
                logger.info(f"Loaded {len(mask_files)} mask files from talk_objects")
                return mask_files

        logger.info("config talk_objects is not specified")
        return None
    def _process_single_mask(self, mask_file):
        mask_img = Image.open(mask_file).convert("RGB")
        mask_img = TF.to_tensor(mask_img).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
        if mask_img.shape[1] == 3:  # RGB three-channel image
            mask_img = mask_img[:, :1]  # keep only the first channel
        mask_img, h, w = resize_image(
            mask_img,
            resize_mode=self.config.get("resize_mode", "adaptive"),
            bucket_shape=self.config.get("bucket_shape", None),
            fixed_area=self.config.get("fixed_area", None),
            fixed_shape=self.config.get("fixed_shape", None),
        )
        mask_latent = torch.nn.functional.interpolate(
            mask_img,  # (1, 1, H, W)
            size=(h // 16, w // 16),
            mode="bicubic",
        )
        mask_latent = (mask_latent > 0).to(torch.int8)
        return mask_latent
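A self-contained sketch of the binarize-and-downsample step, using a synthetic mask in place of a file; the 16x spatial reduction mirrors the interpolate call above:

# Editor's sketch with a synthetic mask (no file I/O, CPU only).
import torch
import torch.nn.functional as F

mask = torch.full((1, 1, 256, 256), -1.0)   # normalized to [-1, 1]
mask[:, :, 64:192, 64:192] = 1.0            # a person region

latent = F.interpolate(mask, size=(256 // 16, 256 // 16), mode="bicubic")
latent = (latent > 0).to(torch.int8)        # binary {0, 1} mask latent
assert latent.shape == (1, 1, 16, 16)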
    def read_image_input(self, img_path):
        if isinstance(img_path, Image.Image):
@@ -389,7 +501,14 @@ class WanAudioRunner(WanRunner):  # type:ignore
        img = self.read_image_input(self.config["image_path"])
        clip_encoder_out = self.run_image_encoder(img) if self.config.get("use_image_encoder", True) else None
        vae_encode_out = self.run_vae_encoder(img)

        audio_num, audio_segments, expected_frames = self.read_audio_input()
        person_mask_latens = self.read_person_mask()
        self.config.person_num = 0
        if person_mask_latens is not None:
            assert audio_num == person_mask_latens.size(0), "audio_num and person_mask_latens.size(0) must be the same"
            self.config.person_num = person_mask_latens.size(0)
        text_encoder_output = self.run_text_encoder(prompt, None)
        torch.cuda.empty_cache()
        gc.collect()
@@ -401,6 +520,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
            },
            "audio_segments": audio_segments,
            "expected_frames": expected_frames,
            "person_mask_latens": person_mask_latens,
        }
    def prepare_prev_latents(self, prev_video: Optional[torch.Tensor], prev_frame_length: int) -> Optional[Dict[str, torch.Tensor]]:
@@ -474,15 +594,16 @@ class WanAudioRunner(WanRunner):  # type:ignore
        self.prev_video = None
        if self.config.get("return_video", False):
            self.gen_video_final = torch.zeros((self.inputs["expected_frames"], self.config.tgt_h, self.config.tgt_w, 3), dtype=torch.float32, device="cpu")
            self.cut_audio_final = torch.zeros((self.inputs["expected_frames"] * self._audio_processor.audio_frame_rate), dtype=torch.float32, device="cpu")
        else:
            self.gen_video_final = None
            self.cut_audio_final = None

    @ProfilingContext4DebugL1("Init run segment")
    def init_run_segment(self, segment_idx, audio_array=None):
        self.segment_idx = segment_idx
        if audio_array is not None:
            end_idx = audio_array.shape[1] // self._audio_processor.audio_frame_rate - self.prev_frame_length
            self.segment = AudioSegment(audio_array, 0, end_idx)
        else:
            self.segment = self.inputs["audio_segments"][segment_idx]
@@ -494,8 +615,12 @@ class WanAudioRunner(WanRunner):  # type:ignore
        if (self.config.get("lazy_load", False) or self.config.get("unload_modules", False)) and not hasattr(self, "audio_encoder"):
            self.audio_encoder = self.load_audio_encoder()

        features_list = []
        for i in range(self.segment.audio_array.shape[0]):
            feat = self.audio_encoder.infer(self.segment.audio_array[i])
            feat = self.audio_adapter.forward_audio_proj(feat, self.model.scheduler.latents.shape[1])
            features_list.append(feat.squeeze(0))
        audio_features = torch.stack(features_list, dim=0)
        self.inputs["audio_encoder_output"] = audio_features
        self.inputs["previmg_encoder_output"] = self.prepare_prev_latents(self.prev_video, prev_frame_length=self.prev_frame_length)
@@ -509,8 +634,8 @@ class WanAudioRunner(WanRunner):  # type:ignore
        self.gen_video = torch.clamp(self.gen_video, -1, 1).to(torch.float)
        useful_length = self.segment.end_frame - self.segment.start_frame
        video_seg = self.gen_video[:, :, :useful_length].cpu()
        audio_seg = self.segment.audio_array[:, : useful_length * self._audio_processor.audio_frame_rate]
        audio_seg = audio_seg.sum(dim=0)  # Mix the per-person audio tracks down to a single track
        video_seg = vae_to_comfyui_image_inplace(video_seg)

        # [Warning] Need to check whether video segment interpolation works...
@@ -527,7 +652,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
            self.va_recorder.pub_livestream(video_seg, audio_seg)
        elif self.config.get("return_video", False):
            self.gen_video_final[self.segment.start_frame : self.segment.end_frame].copy_(video_seg)
            self.cut_audio_final[self.segment.start_frame * self._audio_processor.audio_frame_rate : self.segment.end_frame * self._audio_processor.audio_frame_rate].copy_(audio_seg)

        # Update prev_video for next iteration
        self.prev_video = self.gen_video
@@ -637,7 +762,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
    @ProfilingContext4DebugL1("Process after vae decoder")
    def process_images_after_vae_decoder(self, save_video=False):
        if self.config.get("return_video", False):
            audio_waveform = self.cut_audio_final.unsqueeze(0).unsqueeze(0)
            comfyui_audio = {"waveform": audio_waveform, "sample_rate": self._audio_processor.audio_sr}
            return {"video": self.gen_video_final, "audio": comfyui_audio}
        return {"video": None, "audio": None}
...
#!/usr/bin/env python
"""Example script to run the LightX2V server."""

import argparse
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from lightx2v.server.main import run_server


def main():
    parser = argparse.ArgumentParser(description="Run LightX2V inference server")
    parser.add_argument("--model_path", type=str, required=True, help="Path to model")
    parser.add_argument("--model_cls", type=str, required=True, help="Model class name")
    parser.add_argument("--config_json", type=str, help="Path to model config JSON file")
    parser.add_argument("--task", type=str, default="i2v", help="Task type (i2v, etc.)")
    parser.add_argument("--nproc_per_node", type=int, default=1, help="Number of processes per node (GPUs to use)")
    parser.add_argument("--port", type=int, default=8000, help="Server port")
    parser.add_argument("--host", type=str, default="127.0.0.1", help="Server host")
    args = parser.parse_args()
    run_server(args)


if __name__ == "__main__":
    main()
@@ -32,6 +32,8 @@ def get_default_config():
        "tgt_w": None,
        "target_shape": None,
        "return_video": False,
        "audio_num": None,
        "person_num": None,
    }
    return default_config
@@ -74,6 +76,20 @@ def set_config(args):
        logger.warning(f"`num_frames - 1` has to be divisible by {config.vae_stride[0]}. Rounding to the nearest number.")
        config.target_video_length = config.target_video_length // config.vae_stride[0] * config.vae_stride[0] + 1

    if config.audio_path:
        if os.path.isdir(config.audio_path):
            logger.info(f"audio_path is a directory, loading config.json from {config.audio_path}")
            audio_config_path = os.path.join(config.audio_path, "config.json")
            assert os.path.exists(audio_config_path), "config.json not found in audio_path"
            with open(audio_config_path, "r") as f:
                audio_config = json.load(f)
            for talk_object in audio_config["talk_objects"]:
                talk_object["audio"] = os.path.join(config.audio_path, talk_object["audio"])
                talk_object["mask"] = os.path.join(config.audio_path, talk_object["mask"])
            config.update(audio_config)
        else:
            logger.info(f"audio_path is a file: {config.audio_path}")

    assert not (config.save_video_path and config.return_video), "save_video_path and return_video cannot be set at the same time"
    return config
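When audio_path points at a directory, set_config expects a config.json inside it whose talk_objects entries give audio and mask paths relative to that directory. A hypothetical example of the layout and the path joining (file names are illustrative):

# Editor's sketch of an assumed <audio_path>/config.json:
# {
#     "talk_objects": [
#         {"audio": "person0.mp3", "mask": "person0_mask.png"},
#         {"audio": "person1.mp3", "mask": "person1_mask.png"}
#     ]
# }
# set_config then joins each relative path onto audio_path:
import os

audio_path = "/data/multi_person"  # illustrative
entry = {"audio": "person0.mp3", "mask": "person0_mask.png"}
print(os.path.join(audio_path, entry["audio"]))  # /data/multi_person/person0.mp3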
...
#!/bin/bash
lightx2v_path=/path/to/LightX2V
model_path=/path/to/model
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/multi_person/02_base_fp8.json \
--prompt "The video features a man and a woman standing by a bench in the park, their expressions tense and voices raised as they argue. The man gestures with both hands, his arms swinging slightly as if to emphasize each heated word, while the woman stands with her hands on her waist, her brows furrowed in frustration. The background is a wide expanse of sunlit grass, the golden light contrasting with the sharp energy of their quarrel. Their voices seem to clash in the air, and the rhythm of their hand movements and body postures interweaves with the rising tension, creating a vivid scene of confrontation." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/seko_talk_multi_person_00.mp4
#!/bin/bash
lightx2v_path=/path/to/LightX2V
model_path=/path/to/SekoTalk-Distill
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/multi_person/02_base_fp8.json \
--prompt "The video features a man and a woman standing by a bench in the park, their expressions tense and voices raised as they argue. The man gestures with both hands, his arms swinging slightly as if to emphasize each heated word, while the woman stands with her hands on her waist, her brows furrowed in frustration. The background is a wide expanse of sunlit grass, the golden light contrasting with the sharp energy of their quarrel. Their voices seem to clash in the air, and the rhythm of their hand movements and body postures interweaves with the rising tension, creating a vivid scene of confrontation." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/multi_person/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/multi_person \
--save_video_path ${lightx2v_path}/save_results/seko_talk_multi_person.mp4
#!/bin/bash
lightx2v_path=/path/to/LightX2V
model_path=/path/to/SekoTalk-Distill
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/multi_person/02_base_fp8.json \
--prompt "The video features a man and a woman standing by a bench in the park, their expressions tense and voices raised as they argue. The man gestures with both hands, his arms swinging slightly as if to emphasize each heated word, while the woman stands with her hands on her waist, her brows furrowed in frustration. The background is a wide expanse of sunlit grass, the golden light contrasting with the sharp energy of their quarrel. Their voices seem to clash in the air, and the rhythm of their hand movements and body postures interweaves with the rising tension, creating a vivid scene of confrontation." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/multi_person/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/multi_person \
--save_video_path ${lightx2v_path}/save_results/seko_talk_multi_person_single.mp4
#!/bin/bash
lightx2v_path=/path/to/LightX2V
model_path=/path/to/SekoTalk-Distill
export CUDA_VISIBLE_DEVICES=0,1,2,3
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/multi_person/03_dist.json \
--prompt "The video features a man and a woman standing by a bench in the park, their expressions tense and voices raised as they argue. The man gestures with both hands, his arms swinging slightly as if to emphasize each heated word, while the woman stands with her hands on her waist, her brows furrowed in frustration. The background is a wide expanse of sunlit grass, the golden light contrasting with the sharp energy of their quarrel. Their voices seem to clash in the air, and the rhythm of their hand movements and body postures interweaves with the rising tension, creating a vivid scene of confrontation." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--audio_path ${lightx2v_path}/assets/inputs/audio/multi_person \
--save_video_path ${lightx2v_path}/save_results/seko_talk_multi_person_dist.mp4
#!/bin/bash
lightx2v_path=/path/to/LightX2V
model_path=/path/to/SekoTalk-Distill
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/multi_person/15_base_compile.json \
--prompt "The video features a man and a woman standing by a bench in the park, their expressions tense and voices raised as they argue. The man gestures with both hands, his arms swinging slightly as if to emphasize each heated word, while the woman stands with her hands on her waist, her brows furrowed in frustration. The background is a wide expanse of sunlit grass, the golden light contrasting with the sharp energy of their quarrel. Their voices seem to clash in the air, and the rhythm of their hand movements and body postures interweaves with the rising tension, creating a vivid scene of confrontation." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/multi_person/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/multi_person \
--save_video_path ${lightx2v_path}/save_results/seko_talk_multi_person.mp4
@@ -20,5 +20,5 @@ python -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ python -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ torchrun --nproc-per-node 8 -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ torchrun --nproc-per-node 8 -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ python -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ python -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ torchrun --nproc-per-node 4 -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ python -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ python -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ torchrun --nproc-per-node 4 -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ torchrun --nproc-per-node 4 -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4

@@ -20,5 +20,5 @@ torchrun --nproc-per-node 8 -m lightx2v.infer \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4