Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
xuwx1
LightX2V
Commits
af5105c7
Commit
af5105c7
authored
Sep 01, 2025
by
Yang Yong(雍洋)
Committed by
GitHub
Sep 01, 2025
Browse files
Support resize_mode for SekoTalk model (#269)
parent
cf6ce7c7
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
139 additions
and
39 deletions
+139
-39
configs/seko_talk/seko_talk_01_base.json
configs/seko_talk/seko_talk_01_base.json
+2
-4
configs/seko_talk/seko_talk_02_fp8.json
configs/seko_talk/seko_talk_02_fp8.json
+1
-3
configs/seko_talk/seko_talk_03_dist.json
configs/seko_talk/seko_talk_03_dist.json
+1
-3
configs/seko_talk/seko_talk_04_fp8_dist.json
configs/seko_talk/seko_talk_04_fp8_dist.json
+1
-3
configs/seko_talk/seko_talk_05_offload_fp8_4090.json
configs/seko_talk/seko_talk_05_offload_fp8_4090.json
+1
-3
configs/seko_talk/seko_talk_06_offload_fp8_H100.json
configs/seko_talk/seko_talk_06_offload_fp8_H100.json
+1
-3
configs/seko_talk/seko_talk_07_dist_offload.json
configs/seko_talk/seko_talk_07_dist_offload.json
+1
-3
configs/seko_talk/seko_talk_08_5B_base.json
configs/seko_talk/seko_talk_08_5B_base.json
+1
-3
configs/seko_talk/seko_talk_09_base_fixed_min_area.json
configs/seko_talk/seko_talk_09_base_fixed_min_area.json
+17
-0
configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
+28
-0
lightx2v/models/runners/wan/wan_audio_runner.py
lightx2v/models/runners/wan/wan_audio_runner.py
+35
-14
scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
+25
-0
scripts/seko_talk/run_seko_talk_10_fp8_dist_fixed_min_area.sh
...pts/seko_talk/run_seko_talk_10_fp8_dist_fixed_min_area.sh
+25
-0
No files found.
configs/seko_talk/seko_talk_01_base.json
View file @
af5105c7
...
...
@@ -4,8 +4,7 @@
"video_duration"
:
12
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
720
,
"target_width"
:
1280
,
"resize_mode"
:
"adaptive"
,
"self_attn_1_type"
:
"flash_attn3"
,
"cross_attn_1_type"
:
"flash_attn3"
,
"cross_attn_2_type"
:
"flash_attn3"
,
...
...
@@ -14,6 +13,5 @@
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
,
"adaptive_resize"
:
true
"use_31_block"
:
false
}
configs/seko_talk/seko_talk_02_fp8.json
View file @
af5105c7
...
...
@@ -4,8 +4,7 @@
"video_duration"
:
12
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
720
,
"target_width"
:
1280
,
"resize_mode"
:
"adaptive"
,
"self_attn_1_type"
:
"sage_attn2"
,
"cross_attn_1_type"
:
"sage_attn2"
,
"cross_attn_2_type"
:
"sage_attn2"
,
...
...
@@ -15,7 +14,6 @@
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
,
"adaptive_resize"
:
true
,
"mm_config"
:
{
"mm_type"
:
"W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
...
...
configs/seko_talk/seko_talk_03_dist.json
View file @
af5105c7
...
...
@@ -4,8 +4,7 @@
"video_duration"
:
12
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
720
,
"target_width"
:
1280
,
"resize_mode"
:
"adaptive"
,
"self_attn_1_type"
:
"flash_attn3"
,
"cross_attn_1_type"
:
"flash_attn3"
,
"cross_attn_2_type"
:
"flash_attn3"
,
...
...
@@ -15,7 +14,6 @@
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
,
"adaptive_resize"
:
true
,
"parallel"
:
{
"seq_p_size"
:
4
,
"seq_p_attn_type"
:
"ulysses"
...
...
configs/seko_talk/seko_talk_04_fp8_dist.json
View file @
af5105c7
...
...
@@ -4,8 +4,7 @@
"video_duration"
:
12
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
720
,
"target_width"
:
1280
,
"resize_mode"
:
"adaptive"
,
"self_attn_1_type"
:
"sage_attn2"
,
"cross_attn_1_type"
:
"sage_attn2"
,
"cross_attn_2_type"
:
"sage_attn2"
,
...
...
@@ -15,7 +14,6 @@
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
,
"adaptive_resize"
:
true
,
"parallel"
:
{
"seq_p_size"
:
4
,
"seq_p_attn_type"
:
"ulysses"
...
...
configs/seko_talk/seko_talk_05_offload_fp8_4090.json
View file @
af5105c7
...
...
@@ -4,8 +4,7 @@
"video_duration"
:
120
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
720
,
"target_width"
:
1280
,
"resize_mode"
:
"adaptive"
,
"self_attn_1_type"
:
"sage_attn2"
,
"cross_attn_1_type"
:
"sage_attn2"
,
"cross_attn_2_type"
:
"sage_attn2"
,
...
...
@@ -13,7 +12,6 @@
"sample_guide_scale"
:
1
,
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"adaptive_resize"
:
true
,
"use_31_block"
:
false
,
"cpu_offload"
:
true
,
"offload_granularity"
:
"block"
,
...
...
configs/seko_talk/seko_talk_06_offload_fp8_H100.json
View file @
af5105c7
...
...
@@ -4,8 +4,7 @@
"video_duration"
:
120
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
720
,
"target_width"
:
1280
,
"resize_mode"
:
"adaptive"
,
"self_attn_1_type"
:
"sage_attn2"
,
"cross_attn_1_type"
:
"sage_attn2"
,
"cross_attn_2_type"
:
"sage_attn2"
,
...
...
@@ -13,7 +12,6 @@
"sample_guide_scale"
:
1
,
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"adaptive_resize"
:
true
,
"use_31_block"
:
false
,
"cpu_offload"
:
true
,
"offload_granularity"
:
"block"
,
...
...
configs/seko_talk/seko_talk_07_dist_offload.json
View file @
af5105c7
...
...
@@ -4,8 +4,7 @@
"video_duration"
:
5
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
720
,
"target_width"
:
1280
,
"resize_mode"
:
"adaptive"
,
"self_attn_1_type"
:
"flash_attn3"
,
"cross_attn_1_type"
:
"flash_attn3"
,
"cross_attn_2_type"
:
"flash_attn3"
,
...
...
@@ -14,7 +13,6 @@
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"use_31_block"
:
false
,
"adaptive_resize"
:
true
,
"parallel"
:
{
"seq_p_size"
:
4
,
"seq_p_attn_type"
:
"ulysses"
...
...
configs/seko_talk/seko_talk_08_5B_base.json
View file @
af5105c7
...
...
@@ -4,9 +4,8 @@
"video_duration"
:
12
,
"audio_sr"
:
16000
,
"target_video_length"
:
121
,
"resize_mode"
:
"adaptive"
,
"text_len"
:
512
,
"target_height"
:
704
,
"target_width"
:
1280
,
"num_channels_latents"
:
48
,
"vae_stride"
:
[
4
,
16
,
16
],
"self_attn_1_type"
:
"flash_attn3"
,
...
...
@@ -20,7 +19,6 @@
"offload_granularity"
:
"model"
,
"fps"
:
24
,
"use_image_encoder"
:
false
,
"adaptive_resize"
:
true
,
"use_31_block"
:
false
,
"lora_configs"
:
[
{
...
...
configs/seko_talk/seko_talk_09_base_fixed_min_area.json
0 → 100644
View file @
af5105c7
{
"infer_steps"
:
4
,
"target_fps"
:
16
,
"video_duration"
:
12
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"resize_mode"
:
"fixed_min_area"
,
"self_attn_1_type"
:
"flash_attn3"
,
"cross_attn_1_type"
:
"flash_attn3"
,
"cross_attn_2_type"
:
"flash_attn3"
,
"seed"
:
42
,
"sample_guide_scale"
:
1.0
,
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
}
configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json
0 → 100755
View file @
af5105c7
{
"infer_steps"
:
4
,
"target_fps"
:
16
,
"video_duration"
:
12
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"resize_mode"
:
"fixed_min_area"
,
"self_attn_1_type"
:
"sage_attn2"
,
"cross_attn_1_type"
:
"sage_attn2"
,
"cross_attn_2_type"
:
"sage_attn2"
,
"seed"
:
42
,
"sample_guide_scale"
:
1.0
,
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
,
"parallel"
:
{
"seq_p_size"
:
4
,
"seq_p_attn_type"
:
"ulysses"
},
"mm_config"
:
{
"mm_type"
:
"W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
"adapter_quantized"
:
true
,
"adapter_quant_scheme"
:
"fp8"
,
"t5_quantized"
:
true
,
"t5_quant_scheme"
:
"fp8"
}
lightx2v/models/runners/wan/wan_audio_runner.py
View file @
af5105c7
...
...
@@ -80,7 +80,9 @@ def isotropic_crop_resize(frames: torch.Tensor, size: tuple):
return
resized_frames
def
adaptive_resize
(
img
):
def
resize_image
(
img
,
resize_mode
=
"adaptive"
,
fixed_area
=
None
):
assert
resize_mode
in
[
"adaptive"
,
"keep_ratio_fixed_area"
,
"fixed_min_area"
,
"fixed_max_area"
]
bucket_config
=
{
0.667
:
(
np
.
array
([[
480
,
832
],
[
544
,
960
],
[
720
,
1280
]],
dtype
=
np
.
int64
),
np
.
array
([
0.2
,
0.5
,
0.3
])),
1.0
:
(
np
.
array
([[
480
,
480
],
[
576
,
576
],
[
704
,
704
],
[
960
,
960
]],
dtype
=
np
.
int64
),
np
.
array
([
0.1
,
0.1
,
0.5
,
0.3
])),
...
...
@@ -89,18 +91,36 @@ def adaptive_resize(img):
ori_height
=
img
.
shape
[
-
2
]
ori_weight
=
img
.
shape
[
-
1
]
ori_ratio
=
ori_height
/
ori_weight
aspect_ratios
=
np
.
array
(
np
.
array
(
list
(
bucket_config
.
keys
())))
closet_aspect_idx
=
np
.
argmin
(
np
.
abs
(
aspect_ratios
-
ori_ratio
))
closet_ratio
=
aspect_ratios
[
closet_aspect_idx
]
if
ori_ratio
<
1.0
:
target_h
,
target_w
=
480
,
832
elif
ori_ratio
==
1.0
:
target_h
,
target_w
=
480
,
480
else
:
target_h
,
target_w
=
832
,
480
for
resolution
in
bucket_config
[
closet_ratio
][
0
]:
if
ori_height
*
ori_weight
>=
resolution
[
0
]
*
resolution
[
1
]:
target_h
,
target_w
=
resolution
if
resize_mode
==
"adaptive"
:
aspect_ratios
=
np
.
array
(
np
.
array
(
list
(
bucket_config
.
keys
())))
closet_aspect_idx
=
np
.
argmin
(
np
.
abs
(
aspect_ratios
-
ori_ratio
))
closet_ratio
=
aspect_ratios
[
closet_aspect_idx
]
if
ori_ratio
<
1.0
:
target_h
,
target_w
=
480
,
832
elif
ori_ratio
==
1.0
:
target_h
,
target_w
=
480
,
480
else
:
target_h
,
target_w
=
832
,
480
for
resolution
in
bucket_config
[
closet_ratio
][
0
]:
if
ori_height
*
ori_weight
>=
resolution
[
0
]
*
resolution
[
1
]:
target_h
,
target_w
=
resolution
elif
resize_mode
==
"keep_ratio_fixed_area"
:
assert
fixed_area
in
[
"480p"
,
"720p"
],
f
"fixed_area must be in ['480p', '720p'], but got
{
fixed_area
}
, please set fixed_area in config."
fixed_area
=
480
*
832
if
fixed_area
==
"480p"
else
720
*
1280
target_h
=
round
(
np
.
sqrt
(
fixed_area
*
ori_ratio
))
target_w
=
round
(
np
.
sqrt
(
fixed_area
/
ori_ratio
))
elif
resize_mode
==
"fixed_min_area"
:
aspect_ratios
=
np
.
array
(
np
.
array
(
list
(
bucket_config
.
keys
())))
closet_aspect_idx
=
np
.
argmin
(
np
.
abs
(
aspect_ratios
-
ori_ratio
))
closet_ratio
=
aspect_ratios
[
closet_aspect_idx
]
target_h
,
target_w
=
bucket_config
[
closet_ratio
][
0
][
0
]
elif
resize_mode
==
"fixed_max_area"
:
aspect_ratios
=
np
.
array
(
np
.
array
(
list
(
bucket_config
.
keys
())))
closet_aspect_idx
=
np
.
argmin
(
np
.
abs
(
aspect_ratios
-
ori_ratio
))
closet_ratio
=
aspect_ratios
[
closet_aspect_idx
]
target_h
,
target_w
=
bucket_config
[
closet_ratio
][
0
][
-
1
]
cropped_img
=
isotropic_crop_resize
(
img
,
(
target_h
,
target_w
))
return
cropped_img
,
target_h
,
target_w
...
...
@@ -269,7 +289,8 @@ class WanAudioRunner(WanRunner): # type:ignore
ref_img
=
Image
.
open
(
img_path
).
convert
(
"RGB"
)
ref_img
=
TF
.
to_tensor
(
ref_img
).
sub_
(
0.5
).
div_
(
0.5
).
unsqueeze
(
0
).
cuda
()
ref_img
,
h
,
w
=
adaptive_resize
(
ref_img
)
ref_img
,
h
,
w
=
resize_image
(
ref_img
,
resize_mode
=
self
.
config
.
get
(
"resize_mode"
,
"adaptive"
),
fixed_area
=
self
.
config
.
get
(
"fixed_area"
,
None
))
logger
.
info
(
f
"[wan_audio] resize_image target_h:
{
h
}
, target_w:
{
w
}
"
)
patched_h
=
h
//
self
.
config
.
vae_stride
[
1
]
//
self
.
config
.
patch_size
[
1
]
patched_w
=
w
//
self
.
config
.
vae_stride
[
2
]
//
self
.
config
.
patch_size
[
2
]
...
...
scripts/seko_talk/run_seko_talk_09_base_fixed_min_area.sh
0 → 100755
View file @
af5105c7
#!/bin/bash
# Launch SekoTalk i2v inference on GPU 0 using
# configs/seko_talk/seko_talk_09_base_fixed_min_area.json.
#
# Fill in the two placeholders below before running:
#   lightx2v_path - root of the LightX2V checkout; used to locate scripts/,
#                   configs/, assets/ and save_results/.
#   model_path    - value handed to --model_path.
lightx2v_path=
model_path=

# Fail early with a clear message: an empty placeholder would otherwise expand
# to "/scripts/base/base.sh" and leave --model_path without a value.
if [[ -z "${lightx2v_path}" || -z "${model_path}" ]]; then
  echo "Error: set lightx2v_path and model_path at the top of $0 before running." >&2
  exit 1
fi

export CUDA_VISIBLE_DEVICES=0

# set environment variables
if [[ ! -f "${lightx2v_path}/scripts/base/base.sh" ]]; then
  echo "Error: ${lightx2v_path}/scripts/base/base.sh not found; check lightx2v_path." >&2
  exit 1
fi
source "${lightx2v_path}/scripts/base/base.sh"

# Exported after base.sh is sourced, so these values win over anything
# base.sh may have exported under the same names.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None

# Every expansion is quoted so paths containing spaces stay a single argument.
python -m lightx2v.infer \
  --model_cls seko_talk \
  --task i2v \
  --model_path "${model_path}" \
  --config_json "${lightx2v_path}/configs/seko_talk/seko_talk_09_base_fixed_min_area.json" \
  --prompt "The video features a old lady is saying something and knitting a sweater." \
  --negative_prompt '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走' \
  --image_path "${lightx2v_path}/assets/inputs/audio/15.png" \
  --audio_path "${lightx2v_path}/assets/inputs/audio/15.wav" \
  --save_video_path "${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4"
scripts/seko_talk/run_seko_talk_10_fp8_dist_fixed_min_area.sh
0 → 100755
View file @
af5105c7
#!/bin/bash
# Launch SekoTalk i2v inference with torchrun (4 processes on GPUs 0-3) using
# configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json.
#
# Fill in the two placeholders below before running:
#   lightx2v_path - root of the LightX2V checkout; used to locate scripts/,
#                   configs/, assets/ and save_results/.
#   model_path    - value handed to --model_path.
lightx2v_path=
model_path=

# Fail early with a clear message: an empty placeholder would otherwise expand
# to "/scripts/base/base.sh" and leave --model_path without a value.
if [[ -z "${lightx2v_path}" || -z "${model_path}" ]]; then
  echo "Error: set lightx2v_path and model_path at the top of $0 before running." >&2
  exit 1
fi

export CUDA_VISIBLE_DEVICES=0,1,2,3

# set environment variables
if [[ ! -f "${lightx2v_path}/scripts/base/base.sh" ]]; then
  echo "Error: ${lightx2v_path}/scripts/base/base.sh not found; check lightx2v_path." >&2
  exit 1
fi
source "${lightx2v_path}/scripts/base/base.sh"

# Exported after base.sh is sourced, so these values win over anything
# base.sh may have exported under the same names.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None

# One process per visible GPU. Every expansion is quoted so paths containing
# spaces stay a single argument.
torchrun --nproc-per-node 4 -m lightx2v.infer \
  --model_cls seko_talk \
  --task i2v \
  --model_path "${model_path}" \
  --config_json "${lightx2v_path}/configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json" \
  --prompt "The video features a old lady is saying something and knitting a sweater." \
  --negative_prompt '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走' \
  --image_path "${lightx2v_path}/assets/inputs/audio/15.png" \
  --audio_path "${lightx2v_path}/assets/inputs/audio/15.wav" \
  --save_video_path "${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment