Unverified commit f67c46e4 authored by sandy, committed by GitHub

[Feat] Add f2v for sekotalk (#562)


Co-authored-by: Yang Yong (雍洋) <yongyang1030@163.com>
parent f4ab64f4
configs/seko_talk/seko_talk_28_f2v.json
{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 12,
    "audio_sr": 16000,
    "target_video_length": 81,
    "prev_frame_length": 1,
    "resize_mode": "adaptive",
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
    "sample_guide_scale": 1.0,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
    "use_31_block": false,
    "f2v_process": true,
    "lora_configs": [
        {
            "path": "lightx2v_I2V_14B_480p_cfg_step_distill_rank32_bf16.safetensors",
            "strength": 1.0
        }
    ]
}
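For orientation, here is a minimal sketch of how this config is consumed; the standalone loading code is hypothetical (in lightx2v the file is passed via `--config_json` and exposed to the runner as `self.config`), but the keys match the file above and the `dict.get` pattern mirrors the `self.config.get("f2v_process", False)` checks in the diffs below.

```python
import json

# Hypothetical standalone loader for illustration only.
with open("configs/seko_talk/seko_talk_28_f2v.json") as f:
    config = json.load(f)

# The new flag defaults to False, so existing configs are unaffected.
if config.get("f2v_process", False):
    print("f2v mode: seed generation from the reference frame")
print("target_video_length:", config["target_video_length"],
      "target_fps:", config["target_fps"])
```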
@@ -59,7 +59,10 @@ class WanAudioPreInfer(WanPreInfer):
         y = weights.patch_embedding.apply(y.unsqueeze(0))
         y = y.flatten(2).transpose(1, 2).contiguous()
-        x = torch.cat([x, y], dim=1).squeeze(0)
+        if not self.config.get("f2v_process", False):
+            x = torch.cat([x, y], dim=1).squeeze(0)
+        else:
+            x = x.squeeze(0)
         #### for r2v: zero temporal component corresponding to ref embeddings
         # self.freqs[grid_sizes_t:, : self.rope_t_dim] = 0
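The change above makes appending the reference-image tokens `y` conditional: in f2v mode the reference embedding is no longer concatenated onto the video token sequence `x`. A toy shape walk-through of the flatten/transpose/cat pattern, with made-up dimensions (the real ones come from the patch embedding and grid sizes):

```python
import torch

dim, t, h, w = 8, 1, 4, 4
y = torch.randn(1, dim, t, h, w)               # patch-embedding output (toy)
y = y.flatten(2).transpose(1, 2).contiguous()  # -> [1, t*h*w, dim] = [1, 16, 8]

x = torch.randn(1, 32, dim)                    # video tokens (toy)
for f2v_process in (False, True):
    out = torch.cat([x, y], dim=1).squeeze(0) if not f2v_process else x.squeeze(0)
    print(f2v_process, out.shape)              # False -> [48, 8], True -> [32, 8]
```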
@@ -526,6 +526,8 @@ class WanAudioRunner(WanRunner):  # type:ignore
     @ProfilingContext4DebugL2("Run Encoders")
     def _run_input_encoder_local_s2v(self):
         img, latent_shape, target_shape = self.read_image_input(self.input_info.image_path)
+        if self.config.get("f2v_process", False):
+            self.ref_img = img
         self.input_info.latent_shape = latent_shape  # Important: set latent_shape in input_info
         self.input_info.target_shape = target_shape  # Important: set target_shape in input_info
         clip_encoder_out = self.run_image_encoder(img) if self.config.get("use_image_encoder", True) else None
@@ -558,7 +560,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
         if prev_video is not None:
             # Extract and process last frames
             last_frames = prev_video[:, :, -prev_frame_length:].clone().to(AI_DEVICE)
-            if self.config["model_cls"] != "wan2.2_audio":
+            if self.config["model_cls"] != "wan2.2_audio" and not self.config.get("f2v_process", False):
                 last_frames = self.frame_preprocessor.process_prev_frames(last_frames)
             prev_frames[:, :, :prev_frame_length] = last_frames
             prev_len = (prev_frame_length - 1) // 4 + 1
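One bookkeeping detail worth spelling out: `prev_len` converts pixel frames to latent frames, consistent with the 4x temporal compression used by Wan-style video VAEs. With `prev_frame_length: 1` from the config above, the single seeded frame maps to a single latent frame. A quick check:

```python
# (prev_frame_length - 1) // 4 + 1: pixel frames -> latent frames under
# 4x temporal compression (the first frame occupies its own latent slot).
for prev_frame_length in (1, 4, 5, 8, 9):
    print(prev_frame_length, "->", (prev_frame_length - 1) // 4 + 1)
# 1 -> 1, 4 -> 1, 5 -> 2, 8 -> 2, 9 -> 3
```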
@@ -620,7 +622,10 @@ class WanAudioRunner(WanRunner):  # type:ignore
     def init_run(self):
         super().init_run()
         self.scheduler.set_audio_adapter(self.audio_adapter)
-        self.prev_video = None
+        if self.config.get("f2v_process", False):
+            self.prev_video = self.ref_img.unsqueeze(2)
+        else:
+            self.prev_video = None
         if self.input_info.return_result_tensor:
             self.gen_video_final = torch.zeros((self.inputs["expected_frames"], self.input_info.target_shape[0], self.input_info.target_shape[1], 3), dtype=torch.float32, device="cpu")
             self.cut_audio_final = torch.zeros((self.inputs["expected_frames"] * self._audio_processor.audio_frame_rate), dtype=torch.float32, device="cpu")
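This is where f2v diverges from plain s2v at startup: instead of beginning with no history, the runner seeds `prev_video` with the reference image stored earlier in `_run_input_encoder_local_s2v`. `unsqueeze(2)` inserts a singleton temporal axis, turning a `[B, C, H, W]` image into a one-frame `[B, C, 1, H, W]` clip that the continuation logic above then treats as the previous segment. A toy check (dimensions are illustrative):

```python
import torch

ref_img = torch.randn(1, 3, 480, 832)  # [B, C, H, W] reference image (toy size)
prev_video = ref_img.unsqueeze(2)      # -> [B, C, T=1, H, W]
print(prev_video.shape)                # torch.Size([1, 3, 1, 480, 832])
```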
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
# The negative prompt below is the standard Chinese list used with Wan-family
# models: garish colors, overexposed, static, blurry details, subtitles, style,
# artwork, painting, still image, overall gray, worst quality, low quality,
# JPEG artifacts, ugly, mutilated, extra fingers, poorly drawn hands, poorly
# drawn face, deformed, disfigured, malformed limbs, fused fingers, motionless
# frame, cluttered background, three legs, many people in background, walking
# backwards.
python -m lightx2v.infer \
    --model_cls seko_talk \
    --task s2v \
    --model_path "$model_path" \
    --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_28_f2v.json \
    --prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
    --audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
    --save_result_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4