Commit ad73b271 authored by Yang Yong(雍洋), committed by GitHub

Support custom bucket_shape for sekotalk model (#287)

parent c33c4896
configs/seko_talk/seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.json

{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 81,
    "resize_mode": "adaptive",
    "bucket_shape": {
        "0.667": [[480, 832], [544, 960]],
        "1.500": [[832, 480], [960, 544]],
        "1.000": [[480, 480], [576, 576], [704, 704]]
    },
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
    "seed": 42,
    "sample_guide_scale": 1.0,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
    "use_31_block": false,
    "parallel": {
        "seq_p_size": 8,
        "seq_p_attn_type": "ulysses"
    },
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl"
    },
    "adapter_quantized": true,
    "adapter_quant_scheme": "fp8",
    "t5_quantized": true,
    "t5_quant_scheme": "fp8"
}
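The keys of `bucket_shape` are height/width aspect ratios serialized as strings, and each value lists `[height, width]` buckets in increasing area. In `adaptive` mode (see the `resize_image` diff below), the closest ratio to the input image is chosen, then the largest bucket whose area does not exceed the input's. A standalone sketch of that selection, assuming only the behavior visible in this diff:

```python
import numpy as np

# Sketch of adaptive bucket selection; mirrors the resize_image diff below,
# except it falls back to the smallest bucket instead of the repo's
# hard-coded 480x480 / 832x480 defaults.
bucket_shape = {
    "0.667": [[480, 832], [544, 960]],
    "1.500": [[832, 480], [960, 544]],
    "1.000": [[480, 480], [576, 576], [704, 704]],
}

def pick_bucket(ori_height, ori_width):
    buckets = {float(r): np.array(v, dtype=np.int64) for r, v in bucket_shape.items()}
    ratios = np.array(list(buckets.keys()))
    closest = ratios[np.argmin(np.abs(ratios - ori_height / ori_width))]
    target_h, target_w = buckets[closest][0]
    for h, w in buckets[closest]:
        if ori_height * ori_width >= h * w:  # largest bucket not exceeding the input area
            target_h, target_w = h, w
    return int(target_h), int(target_w)

print(pick_bucket(720, 1280))  # ratio 0.5625 -> closest key 0.667 -> (544, 960)
```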
configs/seko_talk/seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.json

{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 15,
    "audio_sr": 16000,
    "target_video_length": 17,
    "prev_frame_length": 1,
    "resize_mode": "adaptive",
    "bucket_shape": {
        "0.667": [[480, 832], [544, 960]],
        "1.500": [[832, 480], [960, 544]],
        "1.000": [[480, 480], [576, 576], [704, 704]]
    },
    "self_attn_1_type": "flash_attn3",
    "cross_attn_1_type": "flash_attn3",
    "cross_attn_2_type": "flash_attn3",
    "seed": 42,
    "sample_guide_scale": 1.0,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
    "use_31_block": false,
    "parallel": {
        "seq_p_size": 8,
        "seq_p_attn_type": "ulysses"
    },
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl"
    },
    "adapter_quantized": true,
    "adapter_quant_scheme": "fp8",
    "t5_quantized": true,
    "t5_quant_scheme": "fp8"
}
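The two configs trade chunk length for latency: at `target_fps` 16, 81 frames come to roughly five seconds per generated clip, while 17 frames with a one-frame overlap (`prev_frame_length`) yield 16 new frames, i.e. one second per chunk. A quick sanity check, assuming `prev_frame_length` frames are carried over from the previous chunk:

```python
target_fps = 16

# 5 s config: 81 frames per clip
print(81 / target_fps)        # 5.0625 -> ~5 s of video per generation

# 1 s realtime config: 17 frames per chunk, 1 frame reused from the prior chunk
print((17 - 1) / target_fps)  # 1.0 -> 1 s of new video per chunk
```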
@@ -103,7 +103,7 @@ def fixed_shape_resize(img, target_height, target_width):
     return resized_img, h, w
 
 
-def resize_image(img, resize_mode="adaptive", fixed_area=None, fixed_shape=None):
+def resize_image(img, resize_mode="adaptive", bucket_shape=None, fixed_area=None, fixed_shape=None):
     assert resize_mode in ["adaptive", "keep_ratio_fixed_area", "fixed_min_area", "fixed_max_area", "fixed_shape"]
     if resize_mode == "fixed_shape":
@@ -111,11 +111,26 @@ def resize_image(img, resize_mode="adaptive", fixed_area=None, fixed_shape=None)
         logger.info(f"[wan_audio] fixed_shape_resize fixed_height: {fixed_shape[0]}, fixed_width: {fixed_shape[1]}")
         return fixed_shape_resize(img, fixed_shape[0], fixed_shape[1])
 
-    bucket_config = {
-        0.667: (np.array([[480, 832], [544, 960], [720, 1280]], dtype=np.int64), np.array([0.2, 0.5, 0.3])),
-        1.0: (np.array([[480, 480], [576, 576], [704, 704], [960, 960]], dtype=np.int64), np.array([0.1, 0.1, 0.5, 0.3])),
-        1.5: (np.array([[480, 832], [544, 960], [720, 1280]], dtype=np.int64)[:, ::-1], np.array([0.2, 0.5, 0.3])),
-    }
+    if bucket_shape is not None:
+        """
+        "bucket_shape": {
+            "0.667": [[480, 832], [544, 960], [720, 1280]],
+            "1.500": [[832, 480], [960, 544], [1280, 720]],
+            "1.000": [[480, 480], [576, 576], [704, 704], [960, 960]]
+        }
+        """
+        bucket_config = {}
+        for ratio, resolutions in bucket_shape.items():
+            bucket_config[float(ratio)] = np.array(resolutions, dtype=np.int64)
+        logger.info(f"[wan_audio] use custom bucket_shape: {bucket_config}")
+    else:
+        bucket_config = {
+            0.667: np.array([[480, 832], [544, 960], [720, 1280]], dtype=np.int64),
+            1.500: np.array([[832, 480], [960, 544], [1280, 720]], dtype=np.int64),
+            1.000: np.array([[480, 480], [576, 576], [704, 704], [960, 960]], dtype=np.int64),
+        }
+        logger.info(f"[wan_audio] use default bucket_shape: {bucket_config}")
     ori_height = img.shape[-2]
     ori_weight = img.shape[-1]
     ori_ratio = ori_height / ori_weight
@@ -130,7 +145,7 @@ def resize_image(img, resize_mode="adaptive", fixed_area=None, fixed_shape=None)
             target_h, target_w = 480, 480
         else:
             target_h, target_w = 832, 480
-        for resolution in bucket_config[closet_ratio][0]:
+        for resolution in bucket_config[closet_ratio]:
             if ori_height * ori_weight >= resolution[0] * resolution[1]:
                 target_h, target_w = resolution
     elif resize_mode == "keep_ratio_fixed_area":
@@ -142,12 +157,12 @@ def resize_image(img, resize_mode="adaptive", fixed_area=None, fixed_shape=None)
         aspect_ratios = np.array(np.array(list(bucket_config.keys())))
         closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
         closet_ratio = aspect_ratios[closet_aspect_idx]
-        target_h, target_w = bucket_config[closet_ratio][0][0]
+        target_h, target_w = bucket_config[closet_ratio][0]
     elif resize_mode == "fixed_max_area":
         aspect_ratios = np.array(np.array(list(bucket_config.keys())))
         closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
         closet_ratio = aspect_ratios[closet_aspect_idx]
-        target_h, target_w = bucket_config[closet_ratio][0][-1]
+        target_h, target_w = bucket_config[closet_ratio][-1]
 
     cropped_img = isotropic_crop_resize(img, (target_h, target_w))
     return cropped_img, target_h, target_w
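Every non-`fixed_shape` branch funnels into `isotropic_crop_resize(img, (target_h, target_w))`, whose body lies outside this diff. A minimal sketch of what such a helper plausibly does, assuming a center crop to the target aspect ratio followed by a bilinear resize (the repo's actual implementation may differ):

```python
import torch
import torch.nn.functional as F

def isotropic_crop_resize_sketch(img: torch.Tensor, size: tuple) -> torch.Tensor:
    # img is (N, C, H, W); size is (target_h, target_w). Sketch only.
    target_h, target_w = size
    h, w = img.shape[-2], img.shape[-1]
    scale = min(h / target_h, w / target_w)            # isotropic scale factor
    crop_h, crop_w = int(target_h * scale), int(target_w * scale)
    top, left = (h - crop_h) // 2, (w - crop_w) // 2   # center-crop window
    cropped = img[..., top : top + crop_h, left : left + crop_w]
    return F.interpolate(cropped, size=(target_h, target_w), mode="bilinear", align_corners=False)
```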
@@ -322,7 +337,13 @@ class WanAudioRunner(WanRunner):  # type:ignore
         ref_img = Image.open(img_path).convert("RGB")
         ref_img = TF.to_tensor(ref_img).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
-        ref_img, h, w = resize_image(ref_img, resize_mode=self.config.get("resize_mode", "adaptive"), fixed_area=self.config.get("fixed_area", None), fixed_shape=self.config.get("fixed_shape", None))
+        ref_img, h, w = resize_image(
+            ref_img,
+            resize_mode=self.config.get("resize_mode", "adaptive"),
+            bucket_shape=self.config.get("bucket_shape", None),
+            fixed_area=self.config.get("fixed_area", None),
+            fixed_shape=self.config.get("fixed_shape", None),
+        )
         logger.info(f"[wan_audio] resize_image target_h: {h}, target_w: {w}")
         patched_h = h // self.config.vae_stride[1] // self.config.patch_size[1]
         patched_w = w // self.config.vae_stride[2] // self.config.patch_size[2]
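The runner then derives the latent patch grid by dividing the pixel size by the VAE's spatial stride and the DiT patch size. A worked example with Wan-style values, `vae_stride = (4, 8, 8)` and `patch_size = (1, 2, 2)` (assumptions for illustration; the real values come from `self.config`):

```python
vae_stride = (4, 8, 8)   # assumed (temporal, height, width) stride
patch_size = (1, 2, 2)   # assumed (temporal, height, width) patch size

h, w = 544, 960          # e.g. the [544, 960] bucket from the configs above
patched_h = h // vae_stride[1] // patch_size[1]  # 544 // 8 // 2 = 34
patched_w = w // vae_stride[2] // patch_size[2]  # 960 // 8 // 2 = 60
print(patched_h, patched_w)  # 34 60
```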
 #!/bin/bash
-lightx2v_path=
-model_path=
+lightx2v_path=/path/to/Lightx2v
+model_path=/path/to/SekoTalk-Distill-fp8
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#!/bin/bash
lightx2v_path=/path/to/Lightx2v
model_path=/path/to/SekoTalk-Distill-fp8
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 8 -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.json \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
#!/bin/bash
lightx2v_path=/path/to/Lightx2v
model_path=/path/to/SekoTalk-Distill-fp8
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 8 -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.json \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
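The two launch scripts are identical except for `--config_json`: the first runs the 5 s-per-clip config (`seko_talk_13_fp8_dist_bucket_shape_8gpus_5s_realtime.json`, sage_attn2) and the second the 1 s-per-chunk realtime config (`seko_talk_14_fp8_dist_bucket_shape_8gpus_1s_realtime.json`, flash_attn3); both launch 8-way ulysses sequence parallelism via torchrun.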