Commit af5105c7 authored by Yang Yong(雍洋), committed by GitHub

Support resize_mode for SekoTalk model (#269)

parent cf6ce7c7
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
@@ -14,6 +13,5 @@
     "sample_shift": 5,
     "enable_cfg": false,
     "cpu_offload": false,
-    "use_31_block": false,
-    "adaptive_resize": true
+    "use_31_block": false
 }
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -15,7 +14,6 @@
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "mm_config": {
         "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
     },
......
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
@@ -15,7 +14,6 @@
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "parallel": {
         "seq_p_size": 4,
         "seq_p_attn_type": "ulysses"
......
@@ -4,8 +4,7 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -15,7 +14,6 @@
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "parallel": {
         "seq_p_size": 4,
         "seq_p_attn_type": "ulysses"
......
@@ -4,8 +4,7 @@
     "video_duration": 120,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -13,7 +12,6 @@
     "sample_guide_scale": 1,
     "sample_shift": 5,
     "enable_cfg": false,
-    "adaptive_resize": true,
     "use_31_block": false,
     "cpu_offload": true,
     "offload_granularity": "block",
......
@@ -4,8 +4,7 @@
     "video_duration": 120,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "sage_attn2",
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
@@ -13,7 +12,6 @@
     "sample_guide_scale": 1,
     "sample_shift": 5,
     "enable_cfg": false,
-    "adaptive_resize": true,
    "use_31_block": false,
     "cpu_offload": true,
     "offload_granularity": "block",
......
@@ -4,8 +4,7 @@
     "video_duration": 5,
     "audio_sr": 16000,
     "target_video_length": 81,
-    "target_height": 720,
-    "target_width": 1280,
+    "resize_mode": "adaptive",
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
@@ -14,7 +13,6 @@
     "sample_shift": 5,
     "enable_cfg": false,
     "use_31_block": false,
-    "adaptive_resize": true,
     "parallel": {
         "seq_p_size": 4,
         "seq_p_attn_type": "ulysses"
......
@@ -4,9 +4,8 @@
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 121,
+    "resize_mode": "adaptive",
     "text_len": 512,
-    "target_height": 704,
-    "target_width": 1280,
     "num_channels_latents": 48,
     "vae_stride": [4, 16, 16],
     "self_attn_1_type": "flash_attn3",
@@ -20,7 +19,6 @@
     "offload_granularity": "model",
     "fps": 24,
     "use_image_encoder": false,
-    "adaptive_resize": true,
     "use_31_block": false,
     "lora_configs": [
         {
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "fixed_min_area",
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "fixed_min_area",
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 1.0,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"use_31_block": false,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses"
},
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
},
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"t5_quantized": true,
"t5_quant_scheme": "fp8"
}
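These two new configs select "fixed_min_area". As a rough sketch (hypothetical config values, not part of this commit), the runner reads the new keys with plain config.get calls; "keep_ratio_fixed_area" additionally needs a "fixed_area" key ("480p" or "720p"), which the assert in resize_image below enforces:

# Sketch only: mirrors the config.get(...) calls added to WanAudioRunner in the diff below.
config = {
    "resize_mode": "keep_ratio_fixed_area",  # hypothetical example value
    "fixed_area": "720p",                    # only needed for keep_ratio_fixed_area
}
resize_mode = config.get("resize_mode", "adaptive")
fixed_area = config.get("fixed_area", None)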
@@ -80,7 +80,9 @@ def isotropic_crop_resize(frames: torch.Tensor, size: tuple):
     return resized_frames
-def adaptive_resize(img):
+def resize_image(img, resize_mode="adaptive", fixed_area=None):
+    assert resize_mode in ["adaptive", "keep_ratio_fixed_area", "fixed_min_area", "fixed_max_area"]
     bucket_config = {
         0.667: (np.array([[480, 832], [544, 960], [720, 1280]], dtype=np.int64), np.array([0.2, 0.5, 0.3])),
         1.0: (np.array([[480, 480], [576, 576], [704, 704], [960, 960]], dtype=np.int64), np.array([0.1, 0.1, 0.5, 0.3])),
@@ -89,18 +91,36 @@ def adaptive_resize(img):
     ori_height = img.shape[-2]
     ori_weight = img.shape[-1]
     ori_ratio = ori_height / ori_weight
-    aspect_ratios = np.array(np.array(list(bucket_config.keys())))
-    closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
-    closet_ratio = aspect_ratios[closet_aspect_idx]
-    if ori_ratio < 1.0:
-        target_h, target_w = 480, 832
-    elif ori_ratio == 1.0:
-        target_h, target_w = 480, 480
-    else:
-        target_h, target_w = 832, 480
-    for resolution in bucket_config[closet_ratio][0]:
-        if ori_height * ori_weight >= resolution[0] * resolution[1]:
-            target_h, target_w = resolution
+    if resize_mode == "adaptive":
+        aspect_ratios = np.array(np.array(list(bucket_config.keys())))
+        closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
+        closet_ratio = aspect_ratios[closet_aspect_idx]
+        if ori_ratio < 1.0:
+            target_h, target_w = 480, 832
+        elif ori_ratio == 1.0:
+            target_h, target_w = 480, 480
+        else:
+            target_h, target_w = 832, 480
+        for resolution in bucket_config[closet_ratio][0]:
+            if ori_height * ori_weight >= resolution[0] * resolution[1]:
+                target_h, target_w = resolution
+    elif resize_mode == "keep_ratio_fixed_area":
+        assert fixed_area in ["480p", "720p"], f"fixed_area must be in ['480p', '720p'], but got {fixed_area}, please set fixed_area in config."
+        fixed_area = 480 * 832 if fixed_area == "480p" else 720 * 1280
+        target_h = round(np.sqrt(fixed_area * ori_ratio))
+        target_w = round(np.sqrt(fixed_area / ori_ratio))
+    elif resize_mode == "fixed_min_area":
+        aspect_ratios = np.array(np.array(list(bucket_config.keys())))
+        closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
+        closet_ratio = aspect_ratios[closet_aspect_idx]
+        target_h, target_w = bucket_config[closet_ratio][0][0]
+    elif resize_mode == "fixed_max_area":
+        aspect_ratios = np.array(np.array(list(bucket_config.keys())))
+        closet_aspect_idx = np.argmin(np.abs(aspect_ratios - ori_ratio))
+        closet_ratio = aspect_ratios[closet_aspect_idx]
+        target_h, target_w = bucket_config[closet_ratio][0][-1]
     cropped_img = isotropic_crop_resize(img, (target_h, target_w))
     return cropped_img, target_h, target_w
@@ -269,7 +289,8 @@ class WanAudioRunner(WanRunner): # type:ignore
         ref_img = Image.open(img_path).convert("RGB")
         ref_img = TF.to_tensor(ref_img).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
-        ref_img, h, w = adaptive_resize(ref_img)
+        ref_img, h, w = resize_image(ref_img, resize_mode=self.config.get("resize_mode", "adaptive"), fixed_area=self.config.get("fixed_area", None))
+        logger.info(f"[wan_audio] resize_image target_h: {h}, target_w: {w}")
         patched_h = h // self.config.vae_stride[1] // self.config.patch_size[1]
         patched_w = w // self.config.vae_stride[2] // self.config.patch_size[2]
......
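For illustration, a small self-contained sketch (not the library code) of the target-size selection behind the new modes, using only the 0.667 and 1.0 buckets visible in the hunk above; the "adaptive" branch and the final isotropic_crop_resize step are omitted:

import numpy as np

# Illustrative sketch of resize_image's target-size selection for the new modes,
# restricted to the two buckets shown in the diff above (the real bucket_config
# may contain more entries).
BUCKETS = {
    0.667: np.array([[480, 832], [544, 960], [720, 1280]], dtype=np.int64),
    1.0: np.array([[480, 480], [576, 576], [704, 704], [960, 960]], dtype=np.int64),
}

def target_size(ori_h, ori_w, resize_mode, fixed_area=None):
    ratio = ori_h / ori_w
    if resize_mode == "keep_ratio_fixed_area":
        # keep the input aspect ratio while hitting a fixed pixel budget
        area = 480 * 832 if fixed_area == "480p" else 720 * 1280
        return round(np.sqrt(area * ratio)), round(np.sqrt(area / ratio))
    ratios = np.array(list(BUCKETS.keys()))
    closest = ratios[np.argmin(np.abs(ratios - ratio))]
    if resize_mode == "fixed_min_area":
        h, w = BUCKETS[closest][0]      # smallest resolution in the closest bucket
    elif resize_mode == "fixed_max_area":
        h, w = BUCKETS[closest][-1]     # largest resolution in the closest bucket
    else:
        raise ValueError(f"unsupported mode in this sketch: {resize_mode}")
    return int(h), int(w)

print(target_size(1080, 1920, "keep_ratio_fixed_area", "720p"))  # (720, 1280): 16:9 kept at the 720p pixel budget
print(target_size(960, 1280, "fixed_min_area"))                  # (480, 832): smallest entry of the 0.667 bucket

The committed scripts below exercise the new fixed_min_area configs end to end.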
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_09_base_fixed_min_area.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0,1,2,3
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_10_fp8_dist_fixed_min_area.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4