Unverified commit f67c46e4 authored by sandy, committed by GitHub

[Feat] Add f2v for sekotalk (#562)


Co-authored-by: Yang Yong (雍洋) <yongyang1030@163.com>
parent f4ab64f4
configs/seko_talk/seko_talk_28_f2v.json
{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 12,
    "audio_sr": 16000,
    "target_video_length": 81,
    "prev_frame_length": 1,
    "resize_mode": "adaptive",
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
    "sample_guide_scale": 1.0,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
    "use_31_block": false,
    "f2v_process": true,
    "lora_configs": [
        {
            "path": "lightx2v_I2V_14B_480p_cfg_step_distill_rank32_bf16.safetensors",
            "strength": 1.0
        }
    ]
}
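For orientation, here is a minimal sketch of how this config is consumed; the standalone loading code is hypothetical (in lightx2v the file is passed via `--config_json` and exposed to the runner as `self.config`), but the keys match the file above and the `dict.get` pattern mirrors the `self.config.get("f2v_process", False)` checks in the diffs below.

```python
import json

# Hypothetical standalone loader for illustration only.
with open("configs/seko_talk/seko_talk_28_f2v.json") as f:
    config = json.load(f)

# The new flag defaults to False, so existing configs are unaffected.
if config.get("f2v_process", False):
    print("f2v mode: seed generation from the reference frame")
print("target_video_length:", config["target_video_length"],
      "target_fps:", config["target_fps"])
```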
@@ -59,7 +59,10 @@ class WanAudioPreInfer(WanPreInfer):
         y = weights.patch_embedding.apply(y.unsqueeze(0))
         y = y.flatten(2).transpose(1, 2).contiguous()
-        x = torch.cat([x, y], dim=1).squeeze(0)
+        if not self.config.get("f2v_process", False):
+            x = torch.cat([x, y], dim=1).squeeze(0)
+        else:
+            x = x.squeeze(0)
         #### for r2v: zero temporal component corresponding to ref embeddings
         # self.freqs[grid_sizes_t:, : self.rope_t_dim] = 0
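The change above makes appending the reference-image tokens `y` conditional: in f2v mode the reference embedding is no longer concatenated onto the video token sequence `x`. A toy shape walk-through of the flatten/transpose/cat pattern, with made-up dimensions (the real ones come from the patch embedding and grid sizes):

```python
import torch

dim, t, h, w = 8, 1, 4, 4
y = torch.randn(1, dim, t, h, w)               # patch-embedding output (toy)
y = y.flatten(2).transpose(1, 2).contiguous()  # -> [1, t*h*w, dim] = [1, 16, 8]

x = torch.randn(1, 32, dim)                    # video tokens (toy)
for f2v_process in (False, True):
    out = torch.cat([x, y], dim=1).squeeze(0) if not f2v_process else x.squeeze(0)
    print(f2v_process, out.shape)              # False -> [48, 8], True -> [32, 8]
```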
@@ -526,6 +526,8 @@ class WanAudioRunner(WanRunner):  # type:ignore
     @ProfilingContext4DebugL2("Run Encoders")
     def _run_input_encoder_local_s2v(self):
         img, latent_shape, target_shape = self.read_image_input(self.input_info.image_path)
+        if self.config.get("f2v_process", False):
+            self.ref_img = img
         self.input_info.latent_shape = latent_shape  # Important: set latent_shape in input_info
         self.input_info.target_shape = target_shape  # Important: set target_shape in input_info
         clip_encoder_out = self.run_image_encoder(img) if self.config.get("use_image_encoder", True) else None
@@ -558,7 +560,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
         if prev_video is not None:
             # Extract and process last frames
             last_frames = prev_video[:, :, -prev_frame_length:].clone().to(AI_DEVICE)
-            if self.config["model_cls"] != "wan2.2_audio":
+            if self.config["model_cls"] != "wan2.2_audio" and not self.config.get("f2v_process", False):
                 last_frames = self.frame_preprocessor.process_prev_frames(last_frames)
             prev_frames[:, :, :prev_frame_length] = last_frames
             prev_len = (prev_frame_length - 1) // 4 + 1
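One bookkeeping detail worth spelling out: `prev_len` converts pixel frames to latent frames, consistent with the 4x temporal compression used by Wan-style video VAEs. With `prev_frame_length: 1` from the config above, the single seeded frame maps to a single latent frame. A quick check:

```python
# (prev_frame_length - 1) // 4 + 1: pixel frames -> latent frames under
# 4x temporal compression (the first frame occupies its own latent slot).
for prev_frame_length in (1, 4, 5, 8, 9):
    print(prev_frame_length, "->", (prev_frame_length - 1) // 4 + 1)
# 1 -> 1, 4 -> 1, 5 -> 2, 8 -> 2, 9 -> 3
```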
@@ -620,7 +622,10 @@ class WanAudioRunner(WanRunner):  # type:ignore
     def init_run(self):
         super().init_run()
         self.scheduler.set_audio_adapter(self.audio_adapter)
-        self.prev_video = None
+        if self.config.get("f2v_process", False):
+            self.prev_video = self.ref_img.unsqueeze(2)
+        else:
+            self.prev_video = None
         if self.input_info.return_result_tensor:
             self.gen_video_final = torch.zeros((self.inputs["expected_frames"], self.input_info.target_shape[0], self.input_info.target_shape[1], 3), dtype=torch.float32, device="cpu")
             self.cut_audio_final = torch.zeros((self.inputs["expected_frames"] * self._audio_processor.audio_frame_rate), dtype=torch.float32, device="cpu")
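This is where f2v diverges from plain s2v at startup: instead of beginning with no history, the runner seeds `prev_video` with the reference image stored earlier in `_run_input_encoder_local_s2v`. `unsqueeze(2)` inserts a singleton temporal axis, turning a `[B, C, H, W]` image into a one-frame `[B, C, 1, H, W]` clip that the continuation logic above then treats as the previous segment. A toy check (dimensions are illustrative):

```python
import torch

ref_img = torch.randn(1, 3, 480, 832)  # [B, C, H, W] reference image (toy size)
prev_video = ref_img.unsqueeze(2)      # -> [B, C, T=1, H, W]
print(prev_video.shape)                # torch.Size([1, 3, 1, 480, 832])
```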
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
# The negative prompt below is the standard Chinese list used with Wan-family
# models: garish colors, overexposed, static, blurry details, subtitles, style,
# artwork, painting, still image, overall gray, worst quality, low quality,
# JPEG artifacts, ugly, mutilated, extra fingers, poorly drawn hands, poorly
# drawn face, deformed, disfigured, malformed limbs, fused fingers, motionless
# frame, cluttered background, three legs, many people in background, walking
# backwards.
python -m lightx2v.infer \
    --model_cls seko_talk \
    --task s2v \
    --model_path "$model_path" \
    --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_28_f2v.json \
    --prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
    --audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.mp3 \
    --save_result_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4