xuwx1 / LightX2V · Commit f67c46e4 (unverified)

[Feat] Add f2v for sekotalk (#562)

Authored by sandy on Dec 04, 2025; committed by GitHub on Dec 04, 2025.
Co-authored-by: Yang Yong (雍洋) <yongyang1030@163.com>
Parent: f4ab64f4
Showing 4 changed files with 55 additions and 3 deletions (+55 −3):

configs/seko_talk/seko_talk_28_f2v.json                 +24 −0
lightx2v/models/networks/wan/infer/audio/pre_infer.py    +4 −1
lightx2v/models/runners/wan/wan_audio_runner.py          +7 −2
scripts/seko_talk/run_seko_talk_28_f2v.sh               +20 −0
configs/seko_talk/seko_talk_28_f2v.json (new file, mode 100644)

```json
{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 12,
    "audio_sr": 16000,
    "target_video_length": 81,
    "prev_frame_length": 1,
    "resize_mode": "adaptive",
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
    "sample_guide_scale": 1.0,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
    "use_31_block": false,
    "f2v_process": true,
    "lora_configs": [
        {
            "path": "lightx2v_I2V_14B_480p_cfg_step_distill_rank32_bf16.safetensors",
            "strength": 1.0
        }
    ]
}
```
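The key that distinguishes this config from the existing seko_talk ones is `"f2v_process": true`, which switches the pipeline from reference-conditioned s2v to first-frame-to-video behavior. A minimal sketch of how the flag is consumed, assuming the config is a plain JSON dict read with `.get()` and a `False` default as the diffs below show (the print messages are paraphrase, not code from the commit):

```python
import json

# Read the new config and branch on the flag the way the diffs below do.
with open("configs/seko_talk/seko_talk_28_f2v.json") as f:
    config = json.load(f)

if config.get("f2v_process", False):
    print("f2v: condition the first chunk on the input image itself")
else:
    print("s2v: append reference-image tokens to the video token sequence")
```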
lightx2v/models/networks/wan/infer/audio/pre_infer.py

```diff
@@ -59,7 +59,10 @@ class WanAudioPreInfer(WanPreInfer):
         y = weights.patch_embedding.apply(y.unsqueeze(0))
         y = y.flatten(2).transpose(1, 2).contiguous()
-        x = torch.cat([x, y], dim=1).squeeze(0)
+        if not self.config.get("f2v_process", False):
+            x = torch.cat([x, y], dim=1).squeeze(0)
+        else:
+            x = x.squeeze(0)
         ####for r2v # zero temporl component corresponding to ref embeddings
         # self.freqs[grid_sizes_t:, : self.rope_t_dim] = 0
```
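The effect of this branch: in the default s2v path the patch-embedded reference image `y` is appended to the video token sequence `x` along the sequence dimension, while with `f2v_process` enabled the reference tokens are dropped here entirely (the reference image is consumed by the runner instead, as the next diff shows). A shape sketch with made-up dimensions; only the cat/squeeze logic mirrors the diff above:

```python
import torch

# Illustrative sizes -- the real token counts depend on resolution and length.
x = torch.randn(1, 1560, 1536)  # (batch, video tokens, channels)
y = torch.randn(1, 390, 1536)   # (batch, reference-image tokens, channels)

f2v_process = True  # what config.get("f2v_process", False) returns in f2v mode
if not f2v_process:
    # Default s2v path: reference tokens appended along the sequence dim.
    x = torch.cat([x, y], dim=1).squeeze(0)  # -> (1950, 1536)
else:
    # f2v path: reference tokens dropped; the reference image is instead
    # fed to the runner as a one-frame prev_video (next diff).
    x = x.squeeze(0)  # -> (1560, 1536)

print(x.shape)
```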
lightx2v/models/runners/wan/wan_audio_runner.py

```diff
@@ -526,6 +526,8 @@ class WanAudioRunner(WanRunner):  # type:ignore
     @ProfilingContext4DebugL2("Run Encoders")
     def _run_input_encoder_local_s2v(self):
         img, latent_shape, target_shape = self.read_image_input(self.input_info.image_path)
+        if self.config.get("f2v_process", False):
+            self.ref_img = img
         self.input_info.latent_shape = latent_shape  # Important: set latent_shape in input_info
         self.input_info.target_shape = target_shape  # Important: set target_shape in input_info
         clip_encoder_out = self.run_image_encoder(img) if self.config.get("use_image_encoder", True) else None
@@ -558,7 +560,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
         if prev_video is not None:
             # Extract and process last frames
             last_frames = prev_video[:, :, -prev_frame_length:].clone().to(AI_DEVICE)
-            if self.config["model_cls"] != "wan2.2_audio":
+            if self.config["model_cls"] != "wan2.2_audio" and not self.config.get("f2v_process", False):
                 last_frames = self.frame_preprocessor.process_prev_frames(last_frames)
             prev_frames[:, :, :prev_frame_length] = last_frames
             prev_len = (prev_frame_length - 1) // 4 + 1
@@ -620,7 +622,10 @@ class WanAudioRunner(WanRunner):  # type:ignore
     def init_run(self):
         super().init_run()
         self.scheduler.set_audio_adapter(self.audio_adapter)
-        self.prev_video = None
+        if self.config.get("f2v_process", False):
+            self.prev_video = self.ref_img.unsqueeze(2)
+        else:
+            self.prev_video = None
         if self.input_info.return_result_tensor:
             self.gen_video_final = torch.zeros((self.inputs["expected_frames"], self.input_info.target_shape[0], self.input_info.target_shape[1], 3), dtype=torch.float32, device="cpu")
             self.cut_audio_final = torch.zeros((self.inputs["expected_frames"] * self._audio_processor.audio_frame_rate), dtype=torch.float32, device="cpu")
```
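This is the runner-side half of the feature: `_run_input_encoder_local_s2v` stashes the decoded input image as `self.ref_img`, and `init_run` promotes it to a one-frame `prev_video`, so the first generated chunk is conditioned on the input image exactly as if it were the tail of a previously generated clip; the frame preprocessor is also bypassed so the reference pixels reach the conditioning path untouched. A sketch of the tensor bookkeeping, assuming `read_image_input` returns the image as `(B, C, H, W)` (the `unsqueeze(2)` call implies this, but the diff does not show it; shapes below are illustrative):

```python
import torch

# Why unsqueeze(2): the runner's video tensors are laid out (B, C, T, H, W).
# Inserting a singleton time axis turns the reference image into a one-frame
# "previous video" that the existing prev-frame conditioning consumes as-is.
ref_img = torch.randn(1, 3, 480, 832)  # hypothetical 480p reference frame
prev_video = ref_img.unsqueeze(2)      # -> (1, 3, 1, 480, 832)

prev_frame_length = 1  # matches "prev_frame_length": 1 in the new config
last_frames = prev_video[:, :, -prev_frame_length:]
print(last_frames.shape)  # torch.Size([1, 3, 1, 480, 832])

# prev_len counts the latent frames those pixel frames occupy after the 4x
# temporal compression of Wan-style VAEs, as in the middle hunk above.
prev_len = (prev_frame_length - 1) // 4 + 1
print(prev_len)  # 1
```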
scripts/seko_talk/run_seko_talk_28_f2v.sh (new file, mode 100755)

```bash
#!/bin/bash

lightx2v_path=
model_path=

export CUDA_VISIBLE_DEVICES=0

# set environment variables
source ${lightx2v_path}/scripts/base/base.sh

python -m lightx2v.infer \
    --model_cls seko_talk \
    --task s2v \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_28_f2v.json \
    --prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
    --audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
    --save_result_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
```
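Note that `lightx2v_path` and `model_path` ship blank in the script and must be set to your LightX2V checkout and model checkpoint locations before running. The `--negative_prompt` string is the Chinese negative prompt used across the Wan-family scripts; roughly translated: vivid colors, overexposed, static, blurry details, subtitles, style, artwork, painting, frame, still, overall gray, worst quality, low quality, JPEG compression artifacts, ugly, mutilated, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers, motionless frame, cluttered background, three legs, many people in the background, walking backwards.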