Commit 3652b385 authored by Yang Yong (雍洋), committed by GitHub

update seko model filename & update seko scripts (#257)



* update seko file names

* update

* [Update] update seko-models configs

* Add seko scripts

---------
Co-authored-by: gushiqiao <975033167@qq.com>
parent 84e756e9
 {
     "infer_steps": 4,
     "target_fps": 16,
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
     "target_height": 720,
     "target_width": 1280,
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
     "seed": 42,
-    "sample_guide_scale": 1,
+    "sample_guide_scale": 1.0,
     "sample_shift": 5,
     "enable_cfg": false,
     "cpu_offload": false,
-    "adaptive_resize": true,
-    "use_31_block": false
+    "use_31_block": false,
+    "adaptive_resize": true
 }
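Note (not part of the commit): a minimal sketch of loading and sanity-checking a config like the one above before inference. The required-key list and the helper name are assumptions made for illustration; the path is the one the single-GPU script below passes via --config_json.

import json

# Keys the audio-driven i2v configs in this commit appear to carry; this list is
# an assumption for illustration, not a schema defined by the repository.
REQUIRED_KEYS = {
    "infer_steps", "target_fps", "video_duration", "audio_sr",
    "target_video_length", "target_height", "target_width",
    "seed", "sample_guide_scale", "sample_shift", "enable_cfg",
}

def check_config(path: str) -> dict:
    with open(path) as f:
        cfg = json.load(f)
    missing = REQUIRED_KEYS - cfg.keys()
    if missing:
        raise KeyError(f"{path} is missing keys: {sorted(missing)}")
    return cfg

cfg = check_config("configs/audio_driven/wan_i2v_audio.json")
print(cfg["target_width"], cfg["target_height"], cfg["infer_steps"])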
{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 120,
    "audio_sr": 16000,
    "target_video_length": 81,
    "target_height": 720,
    "target_width": 1280,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
    "seed": 42,
    "sample_guide_scale": 1,
    "sample_shift": 5,
    "enable_cfg": false,
    "adaptive_resize": true,
    "use_31_block": false,
    "cpu_offload": true,
    "offload_granularity": "block",
    "offload_ratio_val": 1,
    "t5_cpu_offload": true,
    "t5_offload_granularity": "model",
    "t5_quantized": true,
    "t5_quant_scheme": "fp8",
    "audio_encoder_cpu_offload": false,
    "audio_adapter_cpu_offload": false,
    "adapter_quantized": true,
    "adapter_quant_scheme": "fp8",
    "vae_cpu_offload": false,
    "use_tiling_vae": true,
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
    }
}
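The new config above combines CPU offload with fp8 quantization of the T5 encoder, the audio adapter, and the DiT linear layers (via mm_config). Below is a hedged sketch that only reports those switches from such a JSON file; the helper name, its output format, and the placeholder path are made up for illustration.

import json

def summarize_memory_options(path: str) -> None:
    # Print the offload/quantization switches used by the audio-driven configs.
    with open(path) as f:
        cfg = json.load(f)
    print("dit cpu_offload:    ", cfg.get("cpu_offload", False),
          "granularity:", cfg.get("offload_granularity", "-"))
    print("t5 cpu_offload:     ", cfg.get("t5_cpu_offload", False),
          "quant:", cfg.get("t5_quant_scheme") if cfg.get("t5_quantized") else "none")
    print("audio adapter quant:", cfg.get("adapter_quant_scheme") if cfg.get("adapter_quantized") else "none")
    print("vae cpu_offload:    ", cfg.get("vae_cpu_offload", False),
          "tiling:", cfg.get("use_tiling_vae", False))
    print("linear mm_type:     ", cfg.get("mm_config", {}).get("mm_type", "-"))

summarize_memory_options("path/to/config.json")  # placeholder path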
 {
     "infer_steps": 4,
     "target_fps": 16,
-    "video_duration": 16,
+    "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
     "target_height": 720,
@@ -10,15 +10,17 @@
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
     "seed": 42,
-    "sample_guide_scale": 1,
+    "sample_guide_scale": 1.0,
     "sample_shift": 5,
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "dit_quantized_ckpt": "/path/to/Wan2.1-R2V721-Audio-14B-720P/fp8",
+    "adaptive_resize": true,
     "mm_config": {
         "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
     },
     "adapter_quantized": true,
-    "adapter_quant_scheme": "fp8"
+    "adapter_quant_scheme": "fp8",
+    "t5_quantized": true,
+    "t5_quant_scheme": "fp8"
 }
{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 12,
    "audio_sr": 16000,
    "target_video_length": 81,
    "target_height": 720,
    "target_width": 1280,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
    "seed": 42,
    "sample_guide_scale": 1.0,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
    "use_31_block": false,
    "adaptive_resize": true,
    "parallel": {
        "seq_p_size": 4,
        "seq_p_attn_type": "ulysses"
    },
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
    },
    "adapter_quantized": true,
    "adapter_quant_scheme": "fp8",
    "t5_quantized": true,
    "t5_quant_scheme": "fp8"
}
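A minimal sketch (not part of the commit): when a config carries a "parallel" block like the one above, the ulysses sequence-parallel degree has to be compatible with the number of processes torchrun starts (the distributed scripts below use --nproc-per-node 4). The helper below is hypothetical and only checks that relationship via the WORLD_SIZE variable torchrun sets.

import json
import os

def check_parallel_degree(config_path: str) -> None:
    with open(config_path) as f:
        cfg = json.load(f)
    seq_p = cfg.get("parallel", {}).get("seq_p_size", 1)
    world = int(os.environ.get("WORLD_SIZE", "1"))  # set by torchrun
    if world % seq_p != 0:
        raise ValueError(f"seq_p_size={seq_p} does not divide WORLD_SIZE={world}")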
@@ -534,7 +534,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
         return base_model

     def load_audio_encoder(self):
-        audio_encoder_path = os.path.join(self.config["model_path"], "audio_encoder")
+        audio_encoder_path = os.path.join(self.config["model_path"], "TencentGameMate-chinese-hubert-large")
         audio_encoder_offload = self.config.get("audio_encoder_cpu_offload", self.config.get("cpu_offload", False))
         model = SekoAudioEncoderModel(audio_encoder_path, self.config["audio_sr"], audio_encoder_offload)
         return model
@@ -561,13 +561,13 @@ class WanAudioRunner(WanRunner):  # type:ignore
         audio_adapter.to(device)
         if self.config.get("adapter_quantized", False):
             if self.config.get("adapter_quant_scheme", None) in ["fp8", "fp8-q8f"]:
-                model_name = "audio_adapter_fp8.safetensors"
+                model_name = "audio_adapter_model_fp8.safetensors"
             elif self.config.get("adapter_quant_scheme", None) == "int8":
-                model_name = "audio_adapter_int8.safetensors"
+                model_name = "audio_adapter_model_int8.safetensors"
             else:
                 raise ValueError(f"Unsupported quant_scheme: {self.config.get('adapter_quant_scheme', None)}")
         else:
-            model_name = "audio_adapter.safetensors"
+            model_name = "audio_adapter_model.safetensors"
         weights_dict = load_weights(os.path.join(self.config["model_path"], model_name), cpu_offload=audio_adapter_offload)
         audio_adapter.load_state_dict(weights_dict, strict=False)
...
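For reference, the filename selection introduced by the diff above can be restated as a stand-alone function; this function is not part of the repository, it only mirrors the new "audio_adapter_model*" naming.

from typing import Optional

def adapter_weight_filename(quantized: bool, scheme: Optional[str]) -> str:
    # Mirrors the renamed adapter checkpoints chosen in load_audio_adapter.
    if not quantized:
        return "audio_adapter_model.safetensors"
    if scheme in ("fp8", "fp8-q8f"):
        return "audio_adapter_model_fp8.safetensors"
    if scheme == "int8":
        return "audio_adapter_model_int8.safetensors"
    raise ValueError(f"Unsupported quant_scheme: {scheme}")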
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0,1,2,3
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_quant.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0,1,2,3
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_quant_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
@@ -10,7 +10,6 @@ export CUDA_VISIBLE_DEVICES=0
 source ${lightx2v_path}/scripts/base/base.sh
 export ENABLE_GRAPH_MODE=false
-export TORCH_CUDA_ARCH_LIST="9.0"
 # Start API server with distributed inference service
 python -m lightx2v.api_server \
...
@@ -9,7 +9,6 @@ export CUDA_VISIBLE_DEVICES=0
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -9,7 +9,6 @@ model_path=
 source ${lightx2v_path}/scripts/base/base.sh
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -10,7 +10,6 @@ export CUDA_VISIBLE_DEVICES=0
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -9,7 +9,6 @@ export CUDA_VISIBLE_DEVICES=0
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -4,7 +4,7 @@ from safetensors.torch import save_file
 from lightx2v.utils.quant_utils import FloatQuantizer

-model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter.safetensors"
+model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter_model.safetensors"
 state_dict = {}
 with safetensors.safe_open(model_path, framework="pt", device="cpu") as f:
@@ -13,7 +13,7 @@ with safetensors.safe_open(model_path, framework="pt", device="cpu") as f:
 new_state_dict = {}
-new_model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter_fp8.safetensors"
+new_model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter_model_fp8.safetensors"
 for key in state_dict.keys():
     if key.startswith("ca") and ".to" in key and "weight" in key and "to_kv" not in key:
...
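The script above converts the audio adapter weights to fp8 using lightx2v.utils.quant_utils.FloatQuantizer. As a rough illustration of the idea in plain PyTorch, the sketch below does symmetric per-output-channel fp8 scaling in the spirit of the "W-fp8-channel-sym" mm_type used in the configs; it is an assumption about the general approach, not the library's exact scheme.

import torch

def quantize_fp8_per_channel(weight: torch.Tensor):
    # Symmetric per-output-channel scaling into float8_e4m3fn (PyTorch >= 2.1).
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = weight.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / finfo.max
    q = (weight / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return q, scale.squeeze(1).float()  # keep the scales next to the fp8 tensor

w = torch.randn(1024, 1024)
q, s = quantize_fp8_per_channel(w)
print(q.dtype, s.shape)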