Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
xuwx1
LightX2V
Commits
6de0a3b4
Commit
6de0a3b4
authored
Sep 02, 2025
by
Yang Yong(雍洋)
Committed by
GitHub
Sep 02, 2025
Browse files
Add audio input files and update pre-commit config for larger files (#283)
parent
8de61521
Changes
24
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
59 additions
and
48 deletions
+59
-48
.pre-commit-config.yaml
.pre-commit-config.yaml
+1
-0
configs/offload/disk/wan_i2v_audio_phase_lazy_load_720p.json
configs/offload/disk/wan_i2v_audio_phase_lazy_load_720p.json
+1
-1
configs/seko_talk/seko_talk_01_base.json
configs/seko_talk/seko_talk_01_base.json
+1
-1
configs/seko_talk/seko_talk_02_fp8.json
configs/seko_talk/seko_talk_02_fp8.json
+1
-1
configs/seko_talk/seko_talk_03_dist.json
configs/seko_talk/seko_talk_03_dist.json
+2
-2
configs/seko_talk/seko_talk_04_fp8_dist.json
configs/seko_talk/seko_talk_04_fp8_dist.json
+2
-2
configs/seko_talk/seko_talk_08_5B_base.json
configs/seko_talk/seko_talk_08_5B_base.json
+8
-4
configs/seko_talk/seko_talk_09_base_fixed_min_area.json
configs/seko_talk/seko_talk_09_base_fixed_min_area.json
+1
-1
configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
+1
-1
configs/seko_talk/seko_talk_11_fp8_dist_fixed_shape.json
configs/seko_talk/seko_talk_11_fp8_dist_fixed_shape.json
+5
-2
configs/seko_talk/seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.json
configs/seko_talk/seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.json
+5
-2
scripts/seko_talk/run_seko_talk_01_base.sh
scripts/seko_talk/run_seko_talk_01_base.sh
+3
-3
scripts/seko_talk/run_seko_talk_02_fp8.sh
scripts/seko_talk/run_seko_talk_02_fp8.sh
+3
-3
scripts/seko_talk/run_seko_talk_03_dist.sh
scripts/seko_talk/run_seko_talk_03_dist.sh
+5
-5
scripts/seko_talk/run_seko_talk_04_fp8_dist.sh
scripts/seko_talk/run_seko_talk_04_fp8_dist.sh
+5
-5
scripts/seko_talk/run_seko_talk_05_offload_fp8_4090.sh
scripts/seko_talk/run_seko_talk_05_offload_fp8_4090.sh
+3
-3
scripts/seko_talk/run_seko_talk_06_offload_fp8_H100.sh
scripts/seko_talk/run_seko_talk_06_offload_fp8_H100.sh
+3
-3
scripts/seko_talk/run_seko_talk_07_dist_offload.sh
scripts/seko_talk/run_seko_talk_07_dist_offload.sh
+3
-3
scripts/seko_talk/run_seko_talk_08_5B_base.sh
scripts/seko_talk/run_seko_talk_08_5B_base.sh
+3
-3
scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
+3
-3
No files found.
.pre-commit-config.yaml
View file @
6de0a3b4
...
...
@@ -16,6 +16,7 @@ repos:
  - id: check-yaml
  - id: check-toml
  - id: check-added-large-files
+   args: ['--maxkb=3000']  # Allow files up to 3MB
  - id: check-case-conflict
  - id: check-merge-conflict
  - id: debug-statements
configs/offload/disk/wan_i2v_audio_phase_lazy_load_720p.json
View file @
6de0a3b4
  {
      "infer_steps": 4,
      "target_fps": 16,
-     "video_duration": 12,
+     "video_duration": 15,
      "audio_sr": 16000,
      "target_video_length": 81,
      "target_height": 720,
...
...
configs/seko_talk/seko_talk_01_base.json
View file @
6de0a3b4
  {
      "infer_steps": 4,
      "target_fps": 16,
-     "video_duration": 12,
+     "video_duration": 15,
      "audio_sr": 16000,
      "target_video_length": 81,
      "resize_mode": "adaptive",
...
...
configs/seko_talk/seko_talk_02_fp8.json
View file @
6de0a3b4
  {
      "infer_steps": 4,
      "target_fps": 16,
-     "video_duration": 12,
+     "video_duration": 15,
      "audio_sr": 16000,
      "target_video_length": 81,
      "resize_mode": "adaptive",
...
...
configs/seko_talk/seko_talk_03_dist.json
View file @
6de0a3b4
  {
      "infer_steps": 4,
      "target_fps": 16,
-     "video_duration": 12,
+     "video_duration": 15,
      "audio_sr": 16000,
      "target_video_length": 81,
      "resize_mode": "adaptive",
...
...
@@ -15,7 +15,7 @@
      "cpu_offload": false,
      "use_31_block": false,
      "parallel": {
-         "seq_p_size": 4,
+         "seq_p_size": 8,
          "seq_p_attn_type": "ulysses"
      }
  }
configs/seko_talk/seko_talk_04_fp8_dist.json
View file @
6de0a3b4
  {
      "infer_steps": 4,
      "target_fps": 16,
-     "video_duration": 12,
+     "video_duration": 15,
      "audio_sr": 16000,
      "target_video_length": 81,
      "resize_mode": "adaptive",
...
...
@@ -15,7 +15,7 @@
      "cpu_offload": false,
      "use_31_block": false,
      "parallel": {
-         "seq_p_size": 4,
+         "seq_p_size": 8,
          "seq_p_attn_type": "ulysses"
      },
      "mm_config": {
...
...
configs/seko_talk/seko_talk_08_5B_base.json
View file @
6de0a3b4
{
"infer_steps"
:
4
,
"target_fps"
:
24
,
"video_duration"
:
1
2
,
"video_duration"
:
1
5
,
"audio_sr"
:
16000
,
"target_video_length"
:
121
,
"resize_mode"
:
"adaptive"
,
"text_len"
:
512
,
"num_channels_latents"
:
48
,
"vae_stride"
:
[
4
,
16
,
16
],
"vae_stride"
:
[
4
,
16
,
16
],
"self_attn_1_type"
:
"flash_attn3"
,
"cross_attn_1_type"
:
"flash_attn3"
,
"cross_attn_2_type"
:
"flash_attn3"
,
...
...
configs/seko_talk/seko_talk_09_base_fixed_min_area.json
View file @
6de0a3b4
  {
      "infer_steps": 4,
      "target_fps": 16,
-     "video_duration": 12,
+     "video_duration": 15,
      "audio_sr": 16000,
      "target_video_length": 81,
      "resize_mode": "fixed_min_area",
...
...
configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
View file @
6de0a3b4
  {
      "infer_steps": 4,
      "target_fps": 16,
-     "video_duration": 12,
+     "video_duration": 15,
      "audio_sr": 16000,
      "target_video_length": 81,
      "resize_mode": "fixed_min_area",
...
...
configs/seko_talk/seko_talk_11_fp8_dist_fixed_shape.json
View file @
6de0a3b4
{
"infer_steps"
:
4
,
"target_fps"
:
16
,
"video_duration"
:
1
2
,
"video_duration"
:
1
5
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"resize_mode"
:
"fixed_shape"
,
"fixed_shape"
:
[
240
,
320
],
"fixed_shape"
:
[
240
,
320
],
"self_attn_1_type"
:
"sage_attn2"
,
"cross_attn_1_type"
:
"sage_attn2"
,
"cross_attn_2_type"
:
"sage_attn2"
,
...
...
configs/seko_talk/seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.json
View file @
6de0a3b4
{
"infer_steps"
:
4
,
"target_fps"
:
16
,
"video_duration"
:
1
2
,
"video_duration"
:
1
5
,
"audio_sr"
:
16000
,
"target_video_length"
:
17
,
"prev_frame_length"
:
1
,
"resize_mode"
:
"fixed_shape"
,
"fixed_shape"
:
[
480
,
480
],
"fixed_shape"
:
[
480
,
480
],
"self_attn_1_type"
:
"flash_attn3"
,
"cross_attn_1_type"
:
"flash_attn3"
,
"cross_attn_2_type"
:
"flash_attn3"
,
...
...
scripts/seko_talk/run_seko_talk_01_base.sh
View file @
6de0a3b4
...
...
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
  --task i2v \
  --model_path $model_path \
  --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_01_base.json \
- --prompt "The video features a old lady is saying something and knitting a sweater." \
+ --prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
  --negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
- --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
- --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+ --image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
+ --audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
  --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_02_fp8.sh
View file @
6de0a3b4
...
...
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_02_fp8.json
\
--prompt
"The video features a
old lady is saying something and knitting a sweater
."
\
--prompt
"The video features a
male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze
."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.wav
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_03_dist.sh
View file @
6de0a3b4
...
...
@@ -3,7 +3,7 @@
  lightx2v_path=/path/to/Lightx2v
  model_path=/path/to/SekoTalk-Distill
- export CUDA_VISIBLE_DEVICES=0,1,2,3
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  # set environment variables
  source ${lightx2v_path}/scripts/base/base.sh
...
...
@@ -13,13 +13,13 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
  export ENABLE_GRAPH_MODE=false
  export SENSITIVE_LAYER_DTYPE=None
- torchrun --nproc-per-node 4 -m lightx2v.infer \
+ torchrun --nproc-per-node 8 -m lightx2v.infer \
  --model_cls seko_talk \
  --task i2v \
  --model_path $model_path \
  --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_03_dist.json \
- --prompt "The video features a old lady is saying something and knitting a sweater." \
+ --prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
  --negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
- --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
- --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
+ --image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
+ --audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
  --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_04_fp8_dist.sh
View file @
6de0a3b4
...
...
@@ -3,7 +3,7 @@
lightx2v_path
=
/path/to/Lightx2v
model_path
=
/path/to/SekoTalk-Distill-fp8
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3
,4,5,6,7
# set environment variables
source
${
lightx2v_path
}
/scripts/base/base.sh
...
...
@@ -13,13 +13,13 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export
ENABLE_GRAPH_MODE
=
false
export
SENSITIVE_LAYER_DTYPE
=
None
torchrun
--nproc-per-node
4
-m
lightx2v.infer
\
torchrun
--nproc-per-node
8
-m
lightx2v.infer
\
--model_cls
seko_talk
\
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_04_fp8_dist.json
\
--prompt
"The video features a
old lady is saying something and knitting a sweater
."
\
--prompt
"The video features a
male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze
."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.wav
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_05_offload_fp8_4090.sh
View file @
6de0a3b4
...
...
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_05_offload_fp8_4090.json
\
--prompt
"The video features a
old lady is saying something and knitting a sweater
."
\
--prompt
"The video features a
male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze
."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.wav
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_06_offload_fp8_H100.sh
View file @
6de0a3b4
...
...
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_06_offload_fp8_H100.json
\
--prompt
"The video features a
old lady is saying something and knitting a sweater
."
\
--prompt
"The video features a
male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze
."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.wav
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_07_dist_offload.sh
View file @
6de0a3b4
...
...
@@ -18,8 +18,8 @@ torchrun --nproc-per-node 4 -m lightx2v.infer \
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_07_dist_offload.json
\
--prompt
"The video features a
old lady is saying something and knitting a sweater
."
\
--prompt
"The video features a
male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze
."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.wav
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_08_5B_base.sh
View file @
6de0a3b4
...
...
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_08_5B_base.json
\
--prompt
"The video features a
old lady is saying something and knitting a sweater
."
\
--prompt
"The video features a
male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze
."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.wav
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
View file @
6de0a3b4
...
...
@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_09_base_fixed_min_area.json
\
--prompt
"The video features a
old lady is saying something and knitting a sweater
."
\
--prompt
"The video features a
male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze
."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
15
.wav
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/
seko_input
.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment