Commit 740d8d8f authored by wangshankun's avatar wangshankun
Browse files

r2v v2版本更新

parent e687fe1a
{
"infer_steps": 5,
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 16,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 480,
......@@ -11,7 +11,8 @@
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale":1,
"sample_shift": 5,
"sample_shift": 6,
"enable_cfg": false,
"cpu_offload": false
"cpu_offload": false,
"use_tiling_vae": true
}
......@@ -24,13 +24,15 @@ class WanAudioPreInfer(WanPreInfer):
self.text_len = config["text_len"]
def infer(self, weights, inputs, positive):
ltnt_channel = self.scheduler.latents.size(0)
ltnt_frames = self.scheduler.latents.size(1)
prev_latents = inputs["previmg_encoder_output"]["prev_latents"].unsqueeze(0)
prev_mask = inputs["previmg_encoder_output"]["prev_mask"]
hidden_states = self.scheduler.latents.unsqueeze(0)
hidden_states = torch.cat([hidden_states[:, :ltnt_channel], prev_latents, prev_mask], dim=1)
# hidden_states = torch.cat([hidden_states[:, :ltnt_channel], prev_latents, prev_mask], dim=1)
# print(f"{prev_mask.shape}, {hidden_states.shape}, {prev_latents.shape},{prev_latents[:, :, :ltnt_frames].shape}")
hidden_states = torch.cat([hidden_states, prev_mask, prev_latents[:, :, :ltnt_frames]], dim=1)
hidden_states = hidden_states.squeeze(0)
x = [hidden_states]
......
......@@ -18,6 +18,8 @@ from lightx2v.models.video_encoders.hf.wan.vae import WanVAE
from lightx2v.models.networks.wan.audio_adapter import AudioAdapter, AudioAdapterPipe, rank0_load_state_dict_from_path
from lightx2v.models.schedulers.wan.step_distill.scheduler import WanStepDistillScheduler
from loguru import logger
import torch.distributed as dist
from einops import rearrange
......@@ -369,6 +371,18 @@ class WanAudioRunner(WanRunner):
audio_frame_rate = audio_sr / fps
return round(start_frame * audio_frame_rate), round((end_frame + 1) * audio_frame_rate)
def wan_mask_rearrange(mask: torch.Tensor):
    """Rearrange a per-frame mask into the VAE latent temporal layout.

    The input is a one-channel mask of shape (1, T, H, W) (or (T, H, W),
    which gets a leading dim added), where T must be of the form 4k + 1 to
    match the temporal compression of the video VAE (first frame kept,
    then groups of 4). The first frame is repeated 4 times so the padded
    frame count (T + 3) is divisible by 4, and frames are regrouped so the
    output has shape (4, (T + 3) // 4, H, W), with output[i, j] being
    padded frame j * 4 + i.

    Args:
        mask: tensor of shape (1, T, H, W) or (T, H, W) with T ≡ 1 (mod 4).

    Returns:
        Tensor of shape (4, (T + 3) // 4, H, W).
    """
    # Accept a 3-D (T, H, W) mask by adding the leading channel dim.
    if mask.ndim == 3:
        mask = mask[None]
    assert mask.ndim == 4, f"expected a 3-D or 4-D mask, got {mask.ndim}-D"
    # The view() below is only valid for a single leading channel; fail
    # early with a clear message instead of a confusing reshape error.
    assert mask.shape[0] == 1, f"expected leading dim of 1, got {mask.shape[0]}"
    _, t, h, w = mask.shape
    # T must be 4k + 1 so that T + 3 is a multiple of 4.
    assert t == ((t - 1) // 4 * 4 + 1), f"frame count {t} is not of the form 4k + 1"
    # Pad by repeating the first frame 4 times, then group frames in runs of 4.
    mask_first_frame = torch.repeat_interleave(mask[:, 0:1], repeats=4, dim=1)
    mask = torch.cat([mask_first_frame, mask[:, 1:]], dim=1)
    mask = mask.view(mask.shape[1] // 4, 4, h, w)
    return mask.transpose(0, 1)  # (4, (T + 3) // 4, H, W)
self.inputs["audio_adapter_pipe"] = self.load_audio_models()
# process audio
......@@ -449,11 +463,11 @@ class WanAudioRunner(WanRunner):
if prev_latents is not None:
ltnt_channel, nframe, height, width = self.model.scheduler.latents.shape
bs = 1
prev_mask = torch.zeros((bs, 1, nframe, height, width), device=device, dtype=dtype)
if prev_len > 0:
prev_mask[:, :, :prev_len] = 1.0
# bs = 1
frames_n = (nframe - 1) * 4 + 1
prev_mask = torch.zeros((1, frames_n, height, width), device=device, dtype=dtype)
prev_mask[:, prev_len:] = 0
prev_mask = wan_mask_rearrange(prev_mask).unsqueeze(0)
previmg_encoder_output = {
"prev_latents": prev_latents,
"prev_mask": prev_mask,
......
......@@ -2,8 +2,8 @@
# set path and first
lightx2v_path="/mnt/Text2Video/wangshankun/lightx2v"
model_path="/mnt/Text2Video/wangshankun/HF_Cache/Wan2.1-I2V-Audio-14B-720P/"
lora_path="/mnt/Text2Video/wangshankun/HF_Cache/Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors"
model_path="/mnt/Text2Video/wangshankun/HF_Cache/Wan2.1-R2V-Audio-14B-720P/"
#lora_path="/mnt/Text2Video/wuzhuguanyu/Wan21_I2V_14B_lightx2v_cfg_step_distill_lora_rank64.safetensors"
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
......@@ -42,5 +42,4 @@ python -m lightx2v.infer \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4 \
--lora_path ${lora_path}
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment