Commit af5105c7 authored by Yang Yong(雍洋), committed by GitHub

Support resize_mode for SekoTalk model (#269)

parent cf6ce7c7
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
@@ -14,6 +13,5 @@
     "sample_shift": 5,
     "enable_cfg": false,
     "cpu_offload": false,
-    "use_31_block": false,
-    "adaptive_resize": true
+    "use_31_block": false
 }
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -15,7 +14,6 @@
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "mm_config": {
         "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
     },
......
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
@@ -15,7 +14,6 @@
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "parallel": {
         "seq_p_size": 4,
         "seq_p_attn_type": "ulysses"
......
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -15,7 +14,6 @@
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "parallel": {
         "seq_p_size": 4,
         "seq_p_attn_type": "ulysses"
......
@@ -4,8 +4,7 @@
     "video_duration": 120,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -13,7 +12,6 @@
     "sample_guide_scale": 1,
     "sample_shift": 5,
     "enable_cfg": false,
-    "adaptive_resize": true,
     "use_31_block": false,
     "cpu_offload": true,
     "offload_granularity": "block",
......
@@ -4,8 +4,7 @@
     "video_duration": 120,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -13,7 +12,6 @@
     "sample_guide_scale": 1,
     "sample_shift": 5,
     "enable_cfg": false,
-    "adaptive_resize": true,
    "use_31_block": false,
     "cpu_offload": true,
     "offload_granularity": "block",
......
@@ -4,8 +4,7 @@
     "video_duration": 5,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
@@ -14,7 +13,6 @@
     "sample_shift": 5,
     "enable_cfg": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "parallel": {
         "seq_p_size": 4,
         "seq_p_attn_type": "ulysses"
......
@@ -4,9 +4,8 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 121,
+    "resize_mode": "adaptive",
     "text_len": 512,
-    "target_height": 704,
-    "target_width": 1280,
     "num_channels_latents": 48,
     "vae_stride": [4, 16, 16],
     "self_attn_1_type": "flash_attn3",
@@ -20,7 +19,6 @@
     "offload_granularity": "model",
     "fps": 24,
     "use_image_encoder": false,
-    "adaptive_resize": true,
     "use_31_block": false,
     "lora_configs": [
         {
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "fixed_min_area",
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "fixed_min_area",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses"
},
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"t5_quantized": true,
"t5_quant_scheme": "fp8"
}
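These two new configs select "fixed_min_area". As a rough sketch (hypothetical config values, not part of this commit), the runner reads the new keys with plain config.get calls; "keep_ratio_fixed_area" additionally needs a "fixed_area" key ("480p" or "720p"), which the assert in resize_image below enforces:

# Sketch only: mirrors the config.get(...) calls added to WanAudioRunner in the diff below.
config = {
    "resize_mode": "keep_ratio_fixed_area",  # hypothetical example value
    "fixed_area": "720p",                    # only needed for keep_ratio_fixed_area
}
resize_mode = config.get("resize_mode", "adaptive")
fixed_area = config.get("fixed_area", None)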
@@ -80,7 +80,9 @@ def isotropic_crop_resize(frames: torch.Tensor, size: tuple):
     return resized_frames
-def adaptive_resize(img):
+def resize_image(img, resize_mode="adaptive", fixed_area=None):
+    assert resize_mode in ["adaptive", "keep_ratio_fixed_area", "fixed_min_area", "fixed_max_area"]
     bucket_config = {
         0.667: (np.array([[480, 832], [544, 960], [720, 1280]], dtype=np.int64), np.array([0.2, 0.5, 0.3])),
         1.0: (np.array([[480, 480], [576, 576], [704, 704], [960, 960]], dtype=np.int64), np.array([0.1, 0.1, 0.5, 0.3])),
@@ -89,18 +91,36 @@ def adaptive_resize(img):
     ori_height = img.shape[-2]
     ori_weight = img.shape[-1]
     ori_ratio = ori_height / ori_weight
-    aspect_ratios = np.array(np.array(list(bucket_config.keys())))
-    closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
-    closet_ratio = aspect_ratios[closet_aspect_idx]
-    if ori_ratio < 1.0:
-        target_h, target_w = 480, 832
-    elif ori_ratio == 1.0:
-        target_h, target_w = 480, 480
-    else:
-        target_h, target_w = 832, 480
-    for resolution in bucket_config[closet_ratio][0]:
-        if ori_height * ori_weight >= resolution[0] * resolution[1]:
-            target_h, target_w = resolution
+    if resize_mode == "adaptive":
+        aspect_ratios = np.array(np.array(list(bucket_config.keys())))
+        closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
+        closet_ratio = aspect_ratios[closet_aspect_idx]
+        if ori_ratio < 1.0:
+            target_h, target_w = 480, 832
+        elif ori_ratio == 1.0:
+            target_h, target_w = 480, 480
+        else:
+            target_h, target_w = 832, 480
+        for resolution in bucket_config[closet_ratio][0]:
+            if ori_height * ori_weight >= resolution[0] * resolution[1]:
+                target_h, target_w = resolution
+    elif resize_mode == "keep_ratio_fixed_area":
+        assert fixed_area in ["480p", "720p"], f"fixed_area must be in ['480p', '720p'], but got {fixed_area}, please set fixed_area in config."
+        fixed_area = 480 * 832 if fixed_area == "480p" else 720 * 1280
+        target_h = round(np.sqrt(fixed_area * ori_ratio))
+        target_w = round(np.sqrt(fixed_area / ori_ratio))
+    elif resize_mode == "fixed_min_area":
+        aspect_ratios = np.array(np.array(list(bucket_config.keys())))
+        closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
+        closet_ratio = aspect_ratios[closet_aspect_idx]
+        target_h, target_w = bucket_config[closet_ratio][0][0]
+    elif resize_mode == "fixed_max_area":
+        aspect_ratios = np.array(np.array(list(bucket_config.keys())))
+        closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
+        closet_ratio = aspect_ratios[closet_aspect_idx]
+        target_h, target_w = bucket_config[closet_ratio][0][-1]
     cropped_img = isotropic_crop_resize(img, (target_h, target_w))
     return cropped_img, target_h, target_w
@@ -269,7 +289,8 @@ class WanAudioRunner(WanRunner): # type:ignore
         ref_img = Image.open(img_path).convert("RGB")
         ref_img = TF.to_tensor(ref_img).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
-        ref_img, h, w = adaptive_resize(ref_img)
+        ref_img, h, w = resize_image(ref_img, resize_mode=self.config.get("resize_mode", "adaptive"), fixed_area=self.config.get("fixed_area", None))
+        logger.info(f"[wan_audio] resize_image target_h: {h}, target_w: {w}")
         patched_h = h // self.config.vae_stride[1] // self.config.patch_size[1]
         patched_w = w // self.config.vae_stride[2] // self.config.patch_size[2]
......
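For illustration, a small self-contained sketch (not the library code) of the target-size selection behind the new modes, using only the 0.667 and 1.0 buckets visible in the hunk above; the "adaptive" branch and the final isotropic_crop_resize step are omitted:

import numpy as np

# Illustrative sketch of resize_image's target-size selection for the new modes,
# restricted to the two buckets shown in the diff above (the real bucket_config
# may contain more entries).
BUCKETS = {
    0.667: np.array([[480, 832], [544, 960], [720, 1280]], dtype=np.int64),
    1.0: np.array([[480, 480], [576, 576], [704, 704], [960, 960]], dtype=np.int64),
}

def target_size(ori_h, ori_w, resize_mode, fixed_area=None):
    ratio = ori_h / ori_w
    if resize_mode == "keep_ratio_fixed_area":
        # keep the input aspect ratio while hitting a fixed pixel budget
        area = 480 * 832 if fixed_area == "480p" else 720 * 1280
        return round(np.sqrt(area * ratio)), round(np.sqrt(area / ratio))
    ratios = np.array(list(BUCKETS.keys()))
    closest = ratios[np.argmin(np.abs(ratios - ratio))]
    if resize_mode == "fixed_min_area":
        h, w = BUCKETS[closest][0]      # smallest resolution in the closest bucket
    elif resize_mode == "fixed_max_area":
        h, w = BUCKETS[closest][-1]     # largest resolution in the closest bucket
    else:
        raise ValueError(f"unsupported mode in this sketch: {resize_mode}")
    return int(h), int(w)

print(target_size(1080, 1920, "keep_ratio_fixed_area", "720p"))  # (720, 1280): 16:9 kept at the 720p pixel budget
print(target_size(960, 1280, "fixed_min_area"))                  # (480, 832): smallest entry of the 0.667 bucket

The committed scripts below exercise the new fixed_min_area configs end to end.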
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_09_base_fixed_min_area.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0,1,2,3
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4