Add audio input files and update pre-commit config for larger files (#283)

6de0a3b4 · Yang Yong(雍洋) · GitHub · 8de61521 · 6de0a3b4 · 6de0a3b4
Commit 6de0a3b4 authored Sep 02, 2025 by Yang Yong(雍洋) Committed by GitHub Sep 02, 2025
20 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,6 +16,7 @@ repos:
      - id: check-yaml
      - id: check-toml
      - id: check-added-large-files
+        args: ['--maxkb=3000']  # Allow files up to 3MB
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: debug-statements
--- a/configs/offload/disk/wan_i2v_audio_phase_lazy_load_720p.json
+++ b/configs/offload/disk/wan_i2v_audio_phase_lazy_load_720p.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "target_height": 720,

--- a/configs/seko_talk/seko_talk_01_base.json
+++ b/configs/seko_talk/seko_talk_01_base.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "adaptive",

--- a/configs/seko_talk/seko_talk_02_fp8.json
+++ b/configs/seko_talk/seko_talk_02_fp8.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "adaptive",

--- a/configs/seko_talk/seko_talk_03_dist.json
+++ b/configs/seko_talk/seko_talk_03_dist.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "adaptive",
@@ -15,7 +15,7 @@
    "cpu_offload": false,
    "use_31_block": false,
    "parallel": {
-        "seq_p_size": 4,
+        "seq_p_size": 8,
        "seq_p_attn_type": "ulysses"
    }
 }
--- a/configs/seko_talk/seko_talk_04_fp8_dist.json
+++ b/configs/seko_talk/seko_talk_04_fp8_dist.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "adaptive",
@@ -15,7 +15,7 @@
    "cpu_offload": false,
    "use_31_block": false,
    "parallel": {
-        "seq_p_size": 4,
+        "seq_p_size": 8,
        "seq_p_attn_type": "ulysses"
    },
    "mm_config": {

--- a/configs/seko_talk/seko_talk_08_5B_base.json
+++ b/configs/seko_talk/seko_talk_08_5B_base.json
 {
    "infer_steps": 4,
    "target_fps": 24,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 121,
    "resize_mode": "adaptive",
    "text_len": 512,
    "num_channels_latents": 48,
-    "vae_stride": [4, 16, 16],
+    "vae_stride": [
+        4,
+        16,
+        16
+    ],
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
@@ -22,8 +26,8 @@
    "use_31_block": false,
    "lora_configs": [
        {
-        "path": "/mnt/aigc/rtxiang/pretrain/qianhai_weights/lora_model.safetensors",
+            "path": "/mnt/aigc/rtxiang/pretrain/qianhai_weights/lora_model.safetensors",
-        "strength": 0.125
+            "strength": 0.125
        }
    ]
 }
--- a/configs/seko_talk/seko_talk_09_base_fixed_min_area.json
+++ b/configs/seko_talk/seko_talk_09_base_fixed_min_area.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "fixed_min_area",

--- a/configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
+++ b/configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "fixed_min_area",

--- a/configs/seko_talk/seko_talk_11_fp8_dist_fixed_shape.json
+++ b/configs/seko_talk/seko_talk_11_fp8_dist_fixed_shape.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "fixed_shape",
-    "fixed_shape": [240, 320],
+    "fixed_shape": [
+        240,
+        320
+    ],
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",

--- a/configs/seko_talk/seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.json
+++ b/configs/seko_talk/seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.json
 {
    "infer_steps": 4,
    "target_fps": 16,
-    "video_duration": 12,
+    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 17,
    "prev_frame_length": 1,
    "resize_mode": "fixed_shape",
-    "fixed_shape": [480, 480],
+    "fixed_shape": [
+        480,
+        480
+    ],
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",

--- a/scripts/seko_talk/run_seko_talk_01_base.sh
+++ b/scripts/seko_talk/run_seko_talk_01_base.sh
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_01_base.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_02_fp8.sh
+++ b/scripts/seko_talk/run_seko_talk_02_fp8.sh
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_02_fp8.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_03_dist.sh
+++ b/scripts/seko_talk/run_seko_talk_03_dist.sh
@@ -3,7 +3,7 @@
 lightx2v_path=/path/to/Lightx2v
 model_path=/path/to/SekoTalk-Distill
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
@@ -13,13 +13,13 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
 export SENSITIVE_LAYER_DTYPE=None
-torchrun --nproc-per-node 4 -m lightx2v.infer \
+torchrun --nproc-per-node 8 -m lightx2v.infer \
 --model_cls seko_talk \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_03_dist.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_04_fp8_dist.sh
+++ b/scripts/seko_talk/run_seko_talk_04_fp8_dist.sh
@@ -3,7 +3,7 @@
 lightx2v_path=/path/to/Lightx2v
 model_path=/path/to/SekoTalk-Distill-fp8
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
@@ -13,13 +13,13 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
 export SENSITIVE_LAYER_DTYPE=None
-torchrun --nproc-per-node 4 -m lightx2v.infer \
+torchrun --nproc-per-node 8 -m lightx2v.infer \
 --model_cls seko_talk \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_04_fp8_dist.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_05_offload_fp8_4090.sh
+++ b/scripts/seko_talk/run_seko_talk_05_offload_fp8_4090.sh
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_05_offload_fp8_4090.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_06_offload_fp8_H100.sh
+++ b/scripts/seko_talk/run_seko_talk_06_offload_fp8_H100.sh
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_06_offload_fp8_H100.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_07_dist_offload.sh
+++ b/scripts/seko_talk/run_seko_talk_07_dist_offload.sh
@@ -18,8 +18,8 @@ torchrun --nproc-per-node 4 -m lightx2v.infer \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_07_dist_offload.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_08_5B_base.sh
+++ b/scripts/seko_talk/run_seko_talk_08_5B_base.sh
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_08_5B_base.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
--- a/scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
+++ b/scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
 --task i2v \
 --model_path $model_path \
 --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_09_base_fixed_min_area.json \
---prompt  "The video features a old lady is saying something and knitting a sweater." \
+--prompt  "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
---image_path ${lightx2v_path}/assets/inputs/audio/15.png \
+--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
---audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4