Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
xuwx1
LightX2V
Commits
ad73b271
Commit
ad73b271
authored
Sep 04, 2025
by
Yang Yong(雍洋)
Committed by
GitHub
Sep 04, 2025
Browse files
Support custom bucket_shape for sekotalk model (#287)
parent
c33c4896
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
150 additions
and
12 deletions
+150
-12
configs/seko_talk/seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.json
...seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.json
+33
-0
configs/seko_talk/seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.json
...seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.json
+34
-0
lightx2v/models/runners/wan/wan_audio_runner.py
lightx2v/models/runners/wan/wan_audio_runner.py
+31
-10
scripts/seko_talk/run_seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.sh
...ko_talk/run_seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.sh
+2
-2
scripts/seko_talk/run_seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.sh
...n_seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.sh
+25
-0
scripts/seko_talk/run_seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.sh
...n_seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.sh
+25
-0
No files found.
configs/seko_talk/seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.json
0 → 100755
View file @
ad73b271
{
"infer_steps"
:
4
,
"target_fps"
:
16
,
"video_duration"
:
15
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"resize_mode"
:
"adaptive"
,
"bucket_shape"
:
{
"0.667"
:
[[
480
,
832
],
[
544
,
960
]],
"1.500"
:
[[
832
,
480
],
[
960
,
544
]],
"1.000"
:
[[
480
,
480
],
[
576
,
576
],
[
704
,
704
]]
},
"self_attn_1_type"
:
"sage_attn2"
,
"cross_attn_1_type"
:
"sage_attn2"
,
"cross_attn_2_type"
:
"sage_attn2"
,
"seed"
:
42
,
"sample_guide_scale"
:
1.0
,
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
,
"parallel"
:
{
"seq_p_size"
:
8
,
"seq_p_attn_type"
:
"ulysses"
},
"mm_config"
:
{
"mm_type"
:
"W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl"
},
"adapter_quantized"
:
true
,
"adapter_quant_scheme"
:
"fp8"
,
"t5_quantized"
:
true
,
"t5_quant_scheme"
:
"fp8"
}
configs/seko_talk/seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.json
0 → 100755
View file @
ad73b271
{
"infer_steps"
:
4
,
"target_fps"
:
16
,
"video_duration"
:
15
,
"audio_sr"
:
16000
,
"target_video_length"
:
17
,
"prev_frame_length"
:
1
,
"resize_mode"
:
"adaptive"
,
"bucket_shape"
:
{
"0.667"
:
[[
480
,
832
],
[
544
,
960
]],
"1.500"
:
[[
832
,
480
],
[
960
,
544
]],
"1.000"
:
[[
480
,
480
],
[
576
,
576
],
[
704
,
704
]]
},
"self_attn_1_type"
:
"flash_attn3"
,
"cross_attn_1_type"
:
"flash_attn3"
,
"cross_attn_2_type"
:
"flash_attn3"
,
"seed"
:
42
,
"sample_guide_scale"
:
1.0
,
"sample_shift"
:
5
,
"enable_cfg"
:
false
,
"cpu_offload"
:
false
,
"use_31_block"
:
false
,
"parallel"
:
{
"seq_p_size"
:
8
,
"seq_p_attn_type"
:
"ulysses"
},
"mm_config"
:
{
"mm_type"
:
"W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl"
},
"adapter_quantized"
:
true
,
"adapter_quant_scheme"
:
"fp8"
,
"t5_quantized"
:
true
,
"t5_quant_scheme"
:
"fp8"
}
lightx2v/models/runners/wan/wan_audio_runner.py
View file @
ad73b271
...
...
@@ -103,7 +103,7 @@ def fixed_shape_resize(img, target_height, target_width):
return
resized_img
,
h
,
w
def
resize_image
(
img
,
resize_mode
=
"adaptive"
,
fixed_area
=
None
,
fixed_shape
=
None
):
def
resize_image
(
img
,
resize_mode
=
"adaptive"
,
bucket_shape
=
None
,
fixed_area
=
None
,
fixed_shape
=
None
):
assert
resize_mode
in
[
"adaptive"
,
"keep_ratio_fixed_area"
,
"fixed_min_area"
,
"fixed_max_area"
,
"fixed_shape"
]
if
resize_mode
==
"fixed_shape"
:
...
...
@@ -111,11 +111,26 @@ def resize_image(img, resize_mode="adaptive", fixed_area=None, fixed_shape=None)
logger
.
info
(
f
"[wan_audio] fixed_shape_resize fixed_height:
{
fixed_shape
[
0
]
}
, fixed_width:
{
fixed_shape
[
1
]
}
"
)
return
fixed_shape_resize
(
img
,
fixed_shape
[
0
],
fixed_shape
[
1
])
bucket_config
=
{
0.667
:
(
np
.
array
([[
480
,
832
],
[
544
,
960
],
[
720
,
1280
]],
dtype
=
np
.
int64
),
np
.
array
([
0.2
,
0.5
,
0.3
])),
1.0
:
(
np
.
array
([[
480
,
480
],
[
576
,
576
],
[
704
,
704
],
[
960
,
960
]],
dtype
=
np
.
int64
),
np
.
array
([
0.1
,
0.1
,
0.5
,
0.3
])),
1.5
:
(
np
.
array
([[
480
,
832
],
[
544
,
960
],
[
720
,
1280
]],
dtype
=
np
.
int64
)[:,
::
-
1
],
np
.
array
([
0.2
,
0.5
,
0.3
])),
}
if
bucket_shape
is
not
None
:
"""
"adaptive_shape": {
"0.667": [[480, 832], [544, 960], [720, 1280]],
"1.500": [[832, 480], [960, 544], [1280, 720]],
"1.000": [[480, 480], [576, 576], [704, 704], [960, 960]]
}
"""
bucket_config
=
{}
for
ratio
,
resolutions
in
bucket_shape
.
items
():
bucket_config
[
float
(
ratio
)]
=
np
.
array
(
resolutions
,
dtype
=
np
.
int64
)
logger
.
info
(
f
"[wan_audio] use custom bucket_shape:
{
bucket_config
}
"
)
else
:
bucket_config
=
{
0.667
:
np
.
array
([[
480
,
832
],
[
544
,
960
],
[
720
,
1280
]],
dtype
=
np
.
int64
),
1.500
:
np
.
array
([[
832
,
480
],
[
960
,
544
],
[
1280
,
720
]],
dtype
=
np
.
int64
),
1.000
:
np
.
array
([[
480
,
480
],
[
576
,
576
],
[
704
,
704
],
[
960
,
960
]],
dtype
=
np
.
int64
),
}
logger
.
info
(
f
"[wan_audio] use default bucket_shape:
{
bucket_config
}
"
)
ori_height
=
img
.
shape
[
-
2
]
ori_weight
=
img
.
shape
[
-
1
]
ori_ratio
=
ori_height
/
ori_weight
...
...
@@ -130,7 +145,7 @@ def resize_image(img, resize_mode="adaptive", fixed_area=None, fixed_shape=None)
target_h
,
target_w
=
480
,
480
else
:
target_h
,
target_w
=
832
,
480
for
resolution
in
bucket_config
[
closet_ratio
]
[
0
]
:
for
resolution
in
bucket_config
[
closet_ratio
]:
if
ori_height
*
ori_weight
>=
resolution
[
0
]
*
resolution
[
1
]:
target_h
,
target_w
=
resolution
elif
resize_mode
==
"keep_ratio_fixed_area"
:
...
...
@@ -142,12 +157,12 @@ def resize_image(img, resize_mode="adaptive", fixed_area=None, fixed_shape=None)
aspect_ratios
=
np
.
array
(
np
.
array
(
list
(
bucket_config
.
keys
())))
closet_aspect_idx
=
np
.
argmin
(
np
.
abs
(
aspect_ratios
-
ori_ratio
))
closet_ratio
=
aspect_ratios
[
closet_aspect_idx
]
target_h
,
target_w
=
bucket_config
[
closet_ratio
][
0
]
[
0
]
target_h
,
target_w
=
bucket_config
[
closet_ratio
][
0
]
elif
resize_mode
==
"fixed_max_area"
:
aspect_ratios
=
np
.
array
(
np
.
array
(
list
(
bucket_config
.
keys
())))
closet_aspect_idx
=
np
.
argmin
(
np
.
abs
(
aspect_ratios
-
ori_ratio
))
closet_ratio
=
aspect_ratios
[
closet_aspect_idx
]
target_h
,
target_w
=
bucket_config
[
closet_ratio
][
0
][
-
1
]
target_h
,
target_w
=
bucket_config
[
closet_ratio
][
-
1
]
cropped_img
=
isotropic_crop_resize
(
img
,
(
target_h
,
target_w
))
return
cropped_img
,
target_h
,
target_w
...
...
@@ -322,7 +337,13 @@ class WanAudioRunner(WanRunner): # type:ignore
ref_img
=
Image
.
open
(
img_path
).
convert
(
"RGB"
)
ref_img
=
TF
.
to_tensor
(
ref_img
).
sub_
(
0.5
).
div_
(
0.5
).
unsqueeze
(
0
).
cuda
()
ref_img
,
h
,
w
=
resize_image
(
ref_img
,
resize_mode
=
self
.
config
.
get
(
"resize_mode"
,
"adaptive"
),
fixed_area
=
self
.
config
.
get
(
"fixed_area"
,
None
),
fixed_shape
=
self
.
config
.
get
(
"fixed_shape"
,
None
))
ref_img
,
h
,
w
=
resize_image
(
ref_img
,
resize_mode
=
self
.
config
.
get
(
"resize_mode"
,
"adaptive"
),
bucket_shape
=
self
.
config
.
get
(
"bucket_shape"
,
None
),
fixed_area
=
self
.
config
.
get
(
"fixed_area"
,
None
),
fixed_shape
=
self
.
config
.
get
(
"fixed_shape"
,
None
),
)
logger
.
info
(
f
"[wan_audio] resize_image target_h:
{
h
}
, target_w:
{
w
}
"
)
patched_h
=
h
//
self
.
config
.
vae_stride
[
1
]
//
self
.
config
.
patch_size
[
1
]
patched_w
=
w
//
self
.
config
.
vae_stride
[
2
]
//
self
.
config
.
patch_size
[
2
]
...
...
scripts/seko_talk/run_seko_talk_12_fp8_dist_fixed_shape_8gpus_1s.sh
View file @
ad73b271
#!/bin/bash
lightx2v_path
=
model_path
=
lightx2v_path
=
/path/to/Lightx2v
model_path
=
/path/to/SekoTalk-Distill-fp8
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
...
...
scripts/seko_talk/run_seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.sh
0 → 100755
View file @
ad73b271
#!/bin/bash
lightx2v_path
=
/path/to/Lightx2v
model_path
=
/path/to/SekoTalk-Distill-fp8
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
# set environment variables
source
${
lightx2v_path
}
/scripts/base/base.sh
export
PYTORCH_CUDA_ALLOC_CONF
=
expandable_segments:True
export
ENABLE_GRAPH_MODE
=
false
export
SENSITIVE_LAYER_DTYPE
=
None
torchrun
--nproc-per-node
8
-m
lightx2v.infer
\
--model_cls
seko_talk
\
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.json
\
--prompt
"The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/seko_input.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/seko_input.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
scripts/seko_talk/run_seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.sh
0 → 100755
View file @
ad73b271
#!/bin/bash
lightx2v_path
=
/path/to/Lightx2v
model_path
=
/path/to/SekoTalk-Distill-fp8
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
# set environment variables
source
${
lightx2v_path
}
/scripts/base/base.sh
export
PYTORCH_CUDA_ALLOC_CONF
=
expandable_segments:True
export
ENABLE_GRAPH_MODE
=
false
export
SENSITIVE_LAYER_DTYPE
=
None
torchrun
--nproc-per-node
8
-m
lightx2v.infer
\
--model_cls
seko_talk
\
--task
i2v
\
--model_path
$model_path
\
--config_json
${
lightx2v_path
}
/configs/seko_talk/seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.json
\
--prompt
"The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze."
\
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/seko_input.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/seko_input.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_seko_talk.mp4
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment