update seko (#262)

* update seko talk
* update
* Update configs
* update

---------

Co-authored-by: gushiqiao <975033167@qq.com>

update seko (#262)
* update seko talk
* update
* Update configs
* update

---------

Co-authored-by: gushiqiao <975033167@qq.com>
a8aea27f · Yang Yong(雍洋) · GitHub · aaf5f643 · aaf5f643 · aaf5f643
Commit a8aea27f authored Aug 29, 2025 by Yang Yong(雍洋) Committed by GitHub Aug 29, 2025
20 changed files
--- a/configs/audio_driven/wan_i2v_audio_adaptive_resize.json
+++ b/configs/audio_driven/wan_i2v_audio_adaptive_resize.json
-{
-  "infer_steps": 4,
-  "target_fps": 16,
-  "video_duration": 16,
-  "audio_sr": 16000,
-  "target_video_length": 81,
-  "target_height": 720,
-  "target_width": 1280,
-  "self_attn_1_type": "flash_attn3",
-  "cross_attn_1_type": "flash_attn3",
-  "cross_attn_2_type": "flash_attn3",
-  "seed": 42,
-  "sample_guide_scale": 1,
-  "sample_shift": 5,
-  "enable_cfg": false,
-  "cpu_offload": false,
-  "adaptive_resize": true,
-  "use_31_block": false
-}
--- a/configs/audio_driven/wan_i2v_audio_offload.json
+++ b/configs/audio_driven/wan_i2v_audio_offload.json
-{
-  "infer_steps": 4,
-  "target_fps": 16,
-  "video_duration": 12,
-  "audio_sr": 16000,
-  "target_video_length": 81,
-  "target_height": 720,
-  "target_width": 1280,
-  "self_attn_1_type": "flash_attn3",
-  "cross_attn_1_type": "flash_attn3",
-  "cross_attn_2_type": "flash_attn3",
-  "seed": 42,
-  "sample_guide_scale": 1,
-  "sample_shift": 5,
-  "enable_cfg": false,
-  "adaptive_resize": true,
-  "use_31_block": false,
-  "cpu_offload": true,
-  "offload_granularity": "block",
-  "offload_ratio_val": 1,
-  "t5_cpu_offload": true,
-  "t5_offload_granularity": "block",
-  "use_tiling_vae": true,
-  "audio_encoder_cpu_offload": true,
-  "audio_adapter_cpu_offload": false
-}
--- a/configs/audio_driven/wan_i2v_audio_offload_4090.json
+++ b/configs/audio_driven/wan_i2v_audio_offload_4090.json
-{
-  "infer_steps": 4,
-  "target_fps": 16,
-  "video_duration": 120,
-  "audio_sr": 16000,
-  "target_video_length": 81,
-  "target_height": 720,
-  "target_width": 1280,
-  "self_attn_1_type": "sage_attn2",
-  "cross_attn_1_type": "sage_attn2",
-  "cross_attn_2_type": "sage_attn2",
-  "seed": 42,
-  "sample_guide_scale": 1,
-  "sample_shift": 5,
-  "enable_cfg": false,
-  "adaptive_resize": true,
-  "use_31_block": false,
-  "cpu_offload": true,
-  "offload_granularity": "block",
-  "offload_ratio_val": 1,
-  "t5_cpu_offload": true,
-  "t5_offload_granularity": "model",
-  "t5_quantized": true,
-  "t5_quant_scheme": "fp8-q8f",
-  "audio_encoder_cpu_offload": false,
-  "audio_adapter_cpu_offload": false,
-  "adapter_quantized": true,
-  "adapter_quant_scheme": "fp8",
-  "vae_cpu_offload": false,
-  "use_tiling_vae": true,
-  "mm_config": {
-        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
-  }
-}
--- a/configs/audio_driven/wan_i2v_audio_offload_quant.json
+++ b/configs/audio_driven/wan_i2v_audio_offload_quant.json
-{
-  "infer_steps": 4,
-  "target_fps": 16,
-  "video_duration": 120,
-  "audio_sr": 16000,
-  "target_video_length": 81,
-  "target_height": 720,
-  "target_width": 1280,
-  "self_attn_1_type": "sage_attn2",
-  "cross_attn_1_type": "sage_attn2",
-  "cross_attn_2_type": "sage_attn2",
-  "seed": 42,
-  "sample_guide_scale": 1,
-  "sample_shift": 5,
-  "enable_cfg": false,
-  "adaptive_resize": true,
-  "use_31_block": false,
-  "cpu_offload": true,
-  "offload_granularity": "block",
-  "offload_ratio_val": 1,
-  "t5_cpu_offload": true,
-  "t5_offload_granularity": "model",
-  "t5_quantized": true,
-  "t5_quant_scheme": "fp8",
-  "audio_encoder_cpu_offload": false,
-  "audio_adapter_cpu_offload": false,
-  "adapter_quantized": true,
-  "adapter_quant_scheme": "fp8",
-  "vae_cpu_offload": false,
-  "use_tiling_vae": true,
-  "mm_config": {
-        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
-  }
-}
--- a/configs/audio_driven/wan_i2v_audio.json
+++ b/configs/audio_driven/wan_i2v_audio.json
--- a/configs/audio_driven/wan_i2v_audio_quant.json
+++ b/configs/audio_driven/wan_i2v_audio_quant.json
--- a/configs/audio_driven/wan_i2v_audio_dist.json
+++ b/configs/audio_driven/wan_i2v_audio_dist.json
--- a/configs/audio_driven/wan_i2v_audio_quant_dist.json
+++ b/configs/audio_driven/wan_i2v_audio_quant_dist.json
--- a/configs/seko_talk/seko_talk_05_offload_fp8_4090.json
+++ b/configs/seko_talk/seko_talk_05_offload_fp8_4090.json
+{
+    "infer_steps": 4,
+    "target_fps": 16,
+    "video_duration": 120,
+    "audio_sr": 16000,
+    "target_video_length": 81,
+    "target_height": 720,
+    "target_width": 1280,
+    "self_attn_1_type": "sage_attn2",
+    "cross_attn_1_type": "sage_attn2",
+    "cross_attn_2_type": "sage_attn2",
+    "seed": 42,
+    "sample_guide_scale": 1,
+    "sample_shift": 5,
+    "enable_cfg": false,
+    "adaptive_resize": true,
+    "use_31_block": false,
+    "cpu_offload": true,
+    "offload_granularity": "block",
+    "offload_ratio_val": 1,
+    "t5_cpu_offload": true,
+    "t5_offload_granularity": "model",
+    "t5_quantized": true,
+    "t5_quant_scheme": "fp8-q8f",
+    "audio_encoder_cpu_offload": false,
+    "audio_adapter_cpu_offload": false,
+    "adapter_quantized": true,
+    "adapter_quant_scheme": "fp8",
+    "vae_cpu_offload": false,
+    "use_tiling_vae": true,
+    "mm_config": {
+        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
+    }
+}
--- a/configs/seko_talk/seko_talk_06_offload_fp8_H100.json
+++ b/configs/seko_talk/seko_talk_06_offload_fp8_H100.json
+{
+    "infer_steps": 4,
+    "target_fps": 16,
+    "video_duration": 120,
+    "audio_sr": 16000,
+    "target_video_length": 81,
+    "target_height": 720,
+    "target_width": 1280,
+    "self_attn_1_type": "sage_attn2",
+    "cross_attn_1_type": "sage_attn2",
+    "cross_attn_2_type": "sage_attn2",
+    "seed": 42,
+    "sample_guide_scale": 1,
+    "sample_shift": 5,
+    "enable_cfg": false,
+    "adaptive_resize": true,
+    "use_31_block": false,
+    "cpu_offload": true,
+    "offload_granularity": "block",
+    "offload_ratio_val": 1,
+    "t5_cpu_offload": true,
+    "t5_offload_granularity": "model",
+    "t5_quantized": true,
+    "t5_quant_scheme": "fp8",
+    "audio_encoder_cpu_offload": false,
+    "audio_adapter_cpu_offload": false,
+    "adapter_quantized": true,
+    "adapter_quant_scheme": "fp8",
+    "vae_cpu_offload": false,
+    "use_tiling_vae": true,
+    "mm_config": {
+        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
+    }
+}
--- a/configs/audio_driven/wan_i2v_audio_dist_offload.json
+++ b/configs/audio_driven/wan_i2v_audio_dist_offload.json
--- a/configs/audio_driven/wan22_ti2v_i2v_audio.json
+++ b/configs/audio_driven/wan22_ti2v_i2v_audio.json
--- a/lightx2v/common/ops/attn/sage_attn.py
+++ b/lightx2v/common/ops/attn/sage_attn.py
@@ -52,7 +52,7 @@ class SageAttn2Weight(AttnWeightTemplate):
            )
            x = torch.cat((x1, x2), dim=1)
            x = x.view(max_seqlen_q, -1)
-        elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df", "wan2.1_audio", "wan2.2", "wan2.1_vace"]:
+        elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df", "seko_talk", "wan2.2", "wan2.1_vace"]:
            x = sageattn(
                q.unsqueeze(0),
                k.unsqueeze(0),

--- a/lightx2v/infer.py
+++ b/lightx2v/infer.py
@@ -48,7 +48,7 @@ def main():
            "wan2.1_skyreels_v2_df",
            "wan2.1_vace",
            "cogvideox",
-            "wan2.1_audio",
+            "seko_talk",
            "wan2.2_moe",
            "wan2.2",
            "wan2.2_moe_audio",

--- a/lightx2v/models/runners/wan/wan_audio_runner.py
+++ b/lightx2v/models/runners/wan/wan_audio_runner.py
@@ -233,7 +233,7 @@ class AudioProcessor:
        return segments
-@RUNNER_REGISTER("wan2.1_audio")
+@RUNNER_REGISTER("seko_talk")
 class WanAudioRunner(WanRunner):  # type:ignore
    def __init__(self, config):
        super().__init__(config)

--- a/scripts/wan/run_wan_i2v_audio_offload.sh
+++ b/scripts/wan/run_wan_i2v_audio_offload.sh
 #!/bin/bash
-# set path and first
+lightx2v_path=/path/to/Lightx2v
-lightx2v_path=
+model_path=/path/to/SekoTalk-Distill
-model_path=
 export CUDA_VISIBLE_DEVICES=0
@@ -16,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
 export SENSITIVE_LAYER_DTYPE=None
 python -m lightx2v.infer \
--model_cls wan2.1_audio \
+--model_cls seko_talk \
 --task i2v \
 --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_offload.json \
+--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_01_base.json \
 --prompt  "The video features a old lady is saying something and knitting a sweater." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
 --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
 --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/wan/run_wan_i2v_audio.sh
+++ b/scripts/wan/run_wan_i2v_audio.sh
 #!/bin/bash
-# set path and first
+lightx2v_path=/path/to/Lightx2v
-lightx2v_path=
+model_path=/path/to/SekoTalk-Distill-fp8
-model_path=
 export CUDA_VISIBLE_DEVICES=0
@@ -15,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
 export SENSITIVE_LAYER_DTYPE=None
 python -m lightx2v.infer \
--model_cls wan2.1_audio \
+--model_cls seko_talk \
 --task i2v \
 --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio.json \
+--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_02_fp8.json \
 --prompt  "The video features a old lady is saying something and knitting a sweater." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
 --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
 --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_audio_driven/run_wan_i2v_audio_dist.sh
+++ b/scripts/seko_audio_driven/run_wan_i2v_audio_dist.sh
 #!/bin/bash
-lightx2v_path=
+lightx2v_path=/path/to/Lightx2v
-model_path=
+model_path=/path/to/SekoTalk-Distill
 export CUDA_VISIBLE_DEVICES=0,1,2,3
@@ -14,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
 export SENSITIVE_LAYER_DTYPE=None
 torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \
+--model_cls seko_talk \
 --task i2v \
 --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_dist.json \
+--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_03_dist.json \
 --prompt  "The video features a old lady is saying something and knitting a sweater." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
 --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
 --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_audio_driven/run_wan_i2v_audio_quant_dist.sh
+++ b/scripts/seko_audio_driven/run_wan_i2v_audio_quant_dist.sh
 #!/bin/bash
-lightx2v_path=
+lightx2v_path=/path/to/Lightx2v
-model_path=
+model_path=/path/to/SekoTalk-Distill-fp8
 export CUDA_VISIBLE_DEVICES=0,1,2,3
@@ -14,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
 export SENSITIVE_LAYER_DTYPE=None
 torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \
+--model_cls seko_talk \
 --task i2v \
 --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_quant_dist.json \
+--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_04_fp8_dist.json \
 --prompt  "The video features a old lady is saying something and knitting a sweater." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
 --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
 --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_05_offload_fp8_4090.sh
+++ b/scripts/seko_talk/run_seko_talk_05_offload_fp8_4090.sh
+#!/bin/bash
+lightx2v_path=/path/to/Lightx2v
+model_path=/path/to/SekoTalk-Distill-fp8
+export CUDA_VISIBLE_DEVICES=0
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export ENABLE_GRAPH_MODE=false
+export SENSITIVE_LAYER_DTYPE=None
+python -m lightx2v.infer \
+--model_cls seko_talk \
+--task i2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_05_offload_fp8_4090.json \
+--prompt  "The video features a old lady is saying something and knitting a sweater." \
+--negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
+--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4