[Feat] support wan2.2 distill flf2v (#305)

60c421f4 · gushiqiao · GitHub · c37065b1 · 60c421f4 · 60c421f4
Commit 60c421f4 authored Sep 15, 2025 by gushiqiao Committed by GitHub Sep 15, 2025
4 changed files
--- a/configs/wan22/wan_distill_moe_flf2v.json
+++ b/configs/wan22/wan_distill_moe_flf2v.json
+{
+    "infer_steps": 4,
+    "target_video_length": 81,
+    "text_len": 512,
+    "target_height": 720,
+    "target_width": 1280,
+    "self_attn_1_type": "sage_attn2",
+    "cross_attn_1_type": "sage_attn2",
+    "cross_attn_2_type": "sage_attn2",
+    "seed": 42,
+    "sample_guide_scale": [3.5, 3.5],
+    "sample_shift": 16,
+    "enable_cfg": false,
+    "cpu_offload": true,
+    "offload_granularity": "model",
+    "use_image_encoder": false,
+    "boundary_step_index": 2,
+    "denoising_step_list": [1000, 750, 500, 250],
+    "lora_configs": [
+        {
+            "name": "low_noise_model",
+            "path": "/path/to/low_noise_lora",
+            "strength": 1.0
+        },
+        {
+            "name": "high_noise_model",
+            "path": "/path/to/high_noise_lora",
+            "strength": 1.0
+        }
+    ]
+}
--- a/configs/wan22/wan_distill_moe_flf2v_fp8.json
+++ b/configs/wan22/wan_distill_moe_flf2v_fp8.json
+{
+    "infer_steps": 4,
+    "target_video_length": 81,
+    "text_len": 512,
+    "target_height": 720,
+    "target_width": 1280,
+    "self_attn_1_type": "sage_attn2",
+    "cross_attn_1_type": "sage_attn2",
+    "cross_attn_2_type": "sage_attn2",
+    "seed": 42,
+    "sample_guide_scale": [3.5, 3.5],
+    "sample_shift": 16,
+    "enable_cfg": false,
+    "cpu_offload": true,
+    "offload_granularity": "model",
+    "use_image_encoder": false,
+    "boundary_step_index": 2,
+    "denoising_step_list": [1000, 750, 500, 250],
+    "mm_config": {
+        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl"
+    },
+    "t5_quantized": true,
+    "t5_quant_scheme": "fp8"
+}
--- a/lightx2v/common/ops/attn/sage_attn.py
+++ b/lightx2v/common/ops/attn/sage_attn.py
@@ -52,7 +52,7 @@ class SageAttn2Weight(AttnWeightTemplate):
            )
            x = torch.cat((x1, x2), dim=1)
            x = x.view(max_seqlen_q, -1)
-        elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df", "seko_talk", "wan2.2", "wan2.1_vace", "wan2.2_moe"]:
+        elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df", "seko_talk", "wan2.2", "wan2.1_vace", "wan2.2_moe", "wan2.2_moe_distill"]:
            x = sageattn(
                q.unsqueeze(0),
                k.unsqueeze(0),

--- a/scripts/wan22/run_wan22_distill_moe_flf2v.sh
+++ b/scripts/wan22/run_wan22_distill_moe_flf2v.sh
+#!/bin/bash
+
+# set lightx2v_path and model_path first
+lightx2v_path=
+model_path=
+export CUDA_VISIBLE_DEVICES=0
+
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+export SENSITIVE_LAYER_DTYPE=None
+export ENABLE_GRAPH_MODE=false
+
+python -m lightx2v.infer \
+--model_cls wan2.2_moe_distill \
+--task flf2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/wan22/wan_distill_moe_flf2v_fp8.json \
+--prompt "A bearded man with red facial hair wearing a yellow straw hat and dark coat in Van Gogh's self-portrait style, slowly and continuously transforms into a space astronaut. The transformation flows like liquid paint - his beard fades away strand by strand, the yellow hat melts and reforms smoothly into a silver space helmet, dark coat gradually lightens and restructures into a white spacesuit. The background swirling brushstrokes slowly organize and clarify into realistic stars and space, with Earth appearing gradually in the distance. Every change happens in seamless waves, maintaining visual continuity throughout the metamorphosis.\n\nConsistent soft lighting throughout, medium close-up maintaining same framing, central composition stays fixed, gentle color temperature shift from warm to cool, gradual contrast increase, smooth style transition from painterly to photorealistic. Static camera with subtle slow zoom, emphasizing the flowing transformation process without abrupt changes." \
+--negative_prompt "镜头晃动，色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" \
+--image_path /mtc/gushiqiao/llmc_workspace/wan22_14B_flf2v_start_image.png \
+--last_frame_path /mtc/gushiqiao/llmc_workspace/wan22_14B_flf2v_end_image.png \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_flf2v.mp4