Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
xuwx1
LightX2V
Commits
740d8d8f
Commit
740d8d8f
authored
Jul 15, 2025
by
wangshankun
Browse files
r2v v2版本更新
parent
e687fe1a
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
31 additions
and
15 deletions
+31
-15
configs/audio_driven/wan_i2v_audio.json
configs/audio_driven/wan_i2v_audio.json
+5
-4
lightx2v/models/networks/wan/infer/audio/pre_wan_audio_infer.py
...2v/models/networks/wan/infer/audio/pre_wan_audio_infer.py
+4
-2
lightx2v/models/runners/wan/wan_audio_runner.py
lightx2v/models/runners/wan/wan_audio_runner.py
+19
-5
scripts/wan/run_wan_i2v_audio.sh
scripts/wan/run_wan_i2v_audio.sh
+3
-4
No files found.
configs/audio_driven/wan_i2v_audio.json
View file @
740d8d8f
{
"infer_steps"
:
5
,
"infer_steps"
:
4
,
"target_fps"
:
16
,
"video_duration"
:
1
2
,
"video_duration"
:
1
6
,
"audio_sr"
:
16000
,
"target_video_length"
:
81
,
"target_height"
:
480
,
...
...
@@ -11,7 +11,8 @@
"cross_attn_2_type"
:
"flash_attn3"
,
"seed"
:
42
,
"sample_guide_scale"
:
1
,
"sample_shift"
:
5
,
"sample_shift"
:
6
,
"enable_cfg"
:
false
,
"cpu_offload"
:
false
"cpu_offload"
:
false
,
"use_tiling_vae"
:
true
}
lightx2v/models/networks/wan/infer/audio/pre_wan_audio_infer.py
View file @
740d8d8f
...
...
@@ -24,13 +24,15 @@ class WanAudioPreInfer(WanPreInfer):
self
.
text_len
=
config
[
"text_len"
]
def
infer
(
self
,
weights
,
inputs
,
positive
):
ltnt_
channel
=
self
.
scheduler
.
latents
.
size
(
0
)
ltnt_
frames
=
self
.
scheduler
.
latents
.
size
(
1
)
prev_latents
=
inputs
[
"previmg_encoder_output"
][
"prev_latents"
].
unsqueeze
(
0
)
prev_mask
=
inputs
[
"previmg_encoder_output"
][
"prev_mask"
]
hidden_states
=
self
.
scheduler
.
latents
.
unsqueeze
(
0
)
hidden_states
=
torch
.
cat
([
hidden_states
[:,
:
ltnt_channel
],
prev_latents
,
prev_mask
],
dim
=
1
)
# hidden_states = torch.cat([hidden_states[:, :ltnt_channel], prev_latents, prev_mask], dim=1)
# print(f"{prev_mask.shape}, {hidden_states.shape}, {prev_latents.shape},{prev_latents[:, :, :ltnt_frames].shape}")
hidden_states
=
torch
.
cat
([
hidden_states
,
prev_mask
,
prev_latents
[:,
:,
:
ltnt_frames
]],
dim
=
1
)
hidden_states
=
hidden_states
.
squeeze
(
0
)
x
=
[
hidden_states
]
...
...
lightx2v/models/runners/wan/wan_audio_runner.py
View file @
740d8d8f
...
...
@@ -18,6 +18,8 @@ from lightx2v.models.video_encoders.hf.wan.vae import WanVAE
from
lightx2v.models.networks.wan.audio_adapter
import
AudioAdapter
,
AudioAdapterPipe
,
rank0_load_state_dict_from_path
from
lightx2v.models.schedulers.wan.step_distill.scheduler
import
WanStepDistillScheduler
from
loguru
import
logger
import
torch.distributed
as
dist
from
einops
import
rearrange
...
...
@@ -369,6 +371,18 @@ class WanAudioRunner(WanRunner):
audio_frame_rate
=
audio_sr
/
fps
return
round
(
start_frame
*
audio_frame_rate
),
round
((
end_frame
+
1
)
*
audio_frame_rate
)
def
wan_mask_rearrange
(
mask
:
torch
.
Tensor
):
# mask: 1, T, H, W, where 1 means the input mask is one-channel
if
mask
.
ndim
==
3
:
mask
=
mask
[
None
]
assert
mask
.
ndim
==
4
_
,
t
,
h
,
w
=
mask
.
shape
assert
t
==
((
t
-
1
)
//
4
*
4
+
1
)
mask_first_frame
=
torch
.
repeat_interleave
(
mask
[:,
0
:
1
],
repeats
=
4
,
dim
=
1
)
mask
=
torch
.
concat
([
mask_first_frame
,
mask
[:,
1
:]],
dim
=
1
)
mask
=
mask
.
view
(
mask
.
shape
[
1
]
//
4
,
4
,
h
,
w
)
return
mask
.
transpose
(
0
,
1
)
# 4, T // 4, H, W
self
.
inputs
[
"audio_adapter_pipe"
]
=
self
.
load_audio_models
()
# process audio
...
...
@@ -449,11 +463,11 @@ class WanAudioRunner(WanRunner):
if
prev_latents
is
not
None
:
ltnt_channel
,
nframe
,
height
,
width
=
self
.
model
.
scheduler
.
latents
.
shape
bs
=
1
prev_mask
=
torch
.
zeros
((
bs
,
1
,
nframe
,
height
,
width
),
device
=
device
,
dtype
=
dtype
)
if
prev_
len
>
0
:
prev_mask
[:,
:,
:
prev_len
]
=
1.
0
#
bs = 1
frames_n
=
(
nframe
-
1
)
*
4
+
1
prev_
mask
=
torch
.
zeros
((
1
,
frames_n
,
height
,
width
),
device
=
device
,
dtype
=
dtype
)
prev_mask
[:,
prev_len
:
]
=
0
prev_mask
=
wan_mask_rearrange
(
prev_mask
).
unsqueeze
(
0
)
previmg_encoder_output
=
{
"prev_latents"
:
prev_latents
,
"prev_mask"
:
prev_mask
,
...
...
scripts/wan/run_wan_i2v_audio.sh
View file @
740d8d8f
...
...
@@ -2,8 +2,8 @@
# set path and first
lightx2v_path
=
"/mnt/Text2Video/wangshankun/lightx2v"
model_path
=
"/mnt/Text2Video/wangshankun/HF_Cache/Wan2.1-
I
2V-Audio-14B-720P/"
lora_path
=
"/mnt/Text2Video/w
angshankun/HF_Cache
/Wan21_
T
2V_14B_lightx2v_cfg_step_distill_lora_rank
32
.safetensors"
model_path
=
"/mnt/Text2Video/wangshankun/HF_Cache/Wan2.1-
R
2V-Audio-14B-720P/"
#
lora_path="/mnt/Text2Video/w
uzhuguanyu
/Wan21_
I
2V_14B_lightx2v_cfg_step_distill_lora_rank
64
.safetensors"
# check section
if
[
-z
"
${
CUDA_VISIBLE_DEVICES
}
"
]
;
then
cuda_devices
=
0
...
...
@@ -42,5 +42,4 @@ python -m lightx2v.infer \
--negative_prompt
色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走
\
--image_path
${
lightx2v_path
}
/assets/inputs/audio/15.png
\
--audio_path
${
lightx2v_path
}
/assets/inputs/audio/15.wav
\
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_wan_i2v_audio.mp4
\
--lora_path
${
lora_path
}
--save_video_path
${
lightx2v_path
}
/save_results/output_lightx2v_wan_i2v_audio.mp4
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment