add wan2.1 cfg & step distillation model (#67)

* add step & cfg distillation wan model * bug fixed

add wan2.1 cfg & step distillation model (#67)
* add step & cfg distillation wan model * bug fixed
793ec1db · Zhuguanyu Wu · GitHub · c8374fec · 793ec1db · 793ec1db
Commit 793ec1db authored Jun 12, 2025 by Zhuguanyu Wu Committed by GitHub Jun 12, 2025
16 changed files
--- a/configs/wan_t2v_distill.json
+++ b/configs/wan_t2v_distill.json
+{
+    "infer_steps": 4,
+    "target_video_length": 81,
+    "text_len": 512,
+    "target_height": 480,
+    "target_width": 832,
+    "attention_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 6,
+    "sample_shift": 8,
+    "enable_cfg": false,
+    "cpu_offload": false,
+    "denoising_step_list": [999, 750, 500, 250]
+}
--- a/configs/wan_t2v_enhancer.json
+++ b/configs/wan_t2v_enhancer.json
+{
+    "infer_steps": 50,
+    "target_video_length": 81,
+    "text_len": 512,
+    "target_height": 480,
+    "target_width": 832,
+    "attention_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 6,
+    "sample_shift": 8,
+    "enable_cfg": true,
+    "cpu_offload": false,
+    "sub_servers": {
+        "prompt_enhancer": ["http://localhost:9001"]
+    }
+}
--- a/lightx2v/api_server.py
+++ b/lightx2v/api_server.py
@@ -129,7 +129,7 @@ async def stop_running_task():
 if __name__ == "__main__":
    ProcessManager.register_signal_handler()
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
+    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
    parser.add_argument("--task", type=str, choices=["t2v", "i2v"], default="t2v")
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_json", type=str, required=True)

--- a/lightx2v/attentions/common/sage_attn2.py
+++ b/lightx2v/attentions/common/sage_attn2.py
@@ -29,7 +29,7 @@ def sage_attn2(q, k, v, cu_seqlens_q=None, cu_seqlens_kv=None, max_seqlen_q=None
        )
        x = torch.cat((x1, x2), dim=1)
        x = x.view(max_seqlen_q, -1)
-    elif model_cls in ["wan2.1", "wan2.1_causvid", "wan2.1_df"]:
+    elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df"]:
        x = sageattn(
            q.unsqueeze(0),
            k.unsqueeze(0),

--- a/lightx2v/common/apis/dit.py
+++ b/lightx2v/common/apis/dit.py
@@ -12,6 +12,7 @@ from lightx2v.common.ops import *
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.models.runners.hunyuan.hunyuan_runner import HunyuanRunner
 from lightx2v.models.runners.wan.wan_runner import WanRunner
+from lightx2v.models.runners.wan.wan_distill_runner import WanDistillRunner
 from lightx2v.models.runners.wan.wan_causvid_runner import WanCausVidRunner
 from lightx2v.models.runners.wan.wan_skyreels_v2_df_runner import WanSkyreelsV2DFRunner

@@ -109,7 +110,7 @@ async def get_task_status(message: TaskStatusMessage):
 if __name__ == "__main__":
    ProcessManager.register_signal_handler()
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
+    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
    parser.add_argument("--task", type=str, choices=["t2v", "i2v"], default="t2v")
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_json", type=str, required=True)

--- a/lightx2v/common/apis/image_encoder.py
+++ b/lightx2v/common/apis/image_encoder.py
@@ -11,6 +11,7 @@ import torchvision.transforms.functional as TF
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.models.runners.hunyuan.hunyuan_runner import HunyuanRunner
 from lightx2v.models.runners.wan.wan_runner import WanRunner
+from lightx2v.models.runners.wan.wan_distill_runner import WanDistillRunner
 from lightx2v.models.runners.wan.wan_causvid_runner import WanCausVidRunner
 from lightx2v.models.runners.wan.wan_skyreels_v2_df_runner import WanSkyreelsV2DFRunner

@@ -102,7 +103,7 @@ async def get_task_status(message: TaskStatusMessage):
 if __name__ == "__main__":
    ProcessManager.register_signal_handler()
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
+    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
    parser.add_argument("--task", type=str, choices=["t2v", "i2v"], default="t2v")
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_json", type=str, required=True)

--- a/lightx2v/common/apis/text_encoder.py
+++ b/lightx2v/common/apis/text_encoder.py
@@ -11,6 +11,7 @@ import torch
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.models.runners.hunyuan.hunyuan_runner import HunyuanRunner
 from lightx2v.models.runners.wan.wan_runner import WanRunner
+from lightx2v.models.runners.wan.wan_distill_runner import WanDistillRunner
 from lightx2v.models.runners.wan.wan_causvid_runner import WanCausVidRunner
 from lightx2v.models.runners.wan.wan_skyreels_v2_df_runner import WanSkyreelsV2DFRunner

@@ -107,7 +108,7 @@ async def get_task_status(message: TaskStatusMessage):
 if __name__ == "__main__":
    ProcessManager.register_signal_handler()
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
+    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
    parser.add_argument("--task", type=str, choices=["t2v", "i2v"], default="t2v")
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_json", type=str, required=True)

--- a/lightx2v/common/apis/vae.py
+++ b/lightx2v/common/apis/vae.py
@@ -15,6 +15,7 @@ from lightx2v.common.ops import *
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.models.runners.hunyuan.hunyuan_runner import HunyuanRunner
 from lightx2v.models.runners.wan.wan_runner import WanRunner
+from lightx2v.models.runners.wan.wan_distill_runner import WanDistillRunner
 from lightx2v.models.runners.wan.wan_causvid_runner import WanCausVidRunner
 from lightx2v.models.runners.wan.wan_skyreels_v2_df_runner import WanSkyreelsV2DFRunner

@@ -156,7 +157,7 @@ async def get_task_status(message: TaskStatusMessage):
 if __name__ == "__main__":
    ProcessManager.register_signal_handler()
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
+    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
    parser.add_argument("--task", type=str, choices=["t2v", "i2v"], default="t2v")
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_json", type=str, required=True)

--- a/lightx2v/common/ops/attn/attn_weight.py
+++ b/lightx2v/common/ops/attn/attn_weight.py
@@ -122,7 +122,7 @@ class SageAttn2Weight(AttnWeightTemplate):
            )
            x = torch.cat((x1, x2), dim=1)
            x = x.view(max_seqlen_q, -1)
-        elif model_cls in ["wan2.1", "wan2.1_causvid", "wan2.1_df"]:
+        elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df"]:
            x = sageattn(
                q.unsqueeze(0),
                k.unsqueeze(0),

--- a/lightx2v/infer.py
+++ b/lightx2v/infer.py
@@ -12,6 +12,7 @@ from lightx2v.utils.registry_factory import RUNNER_REGISTER

 from lightx2v.models.runners.hunyuan.hunyuan_runner import HunyuanRunner
 from lightx2v.models.runners.wan.wan_runner import WanRunner
+from lightx2v.models.runners.wan.wan_distill_runner import WanDistillRunner
 from lightx2v.models.runners.wan.wan_causvid_runner import WanCausVidRunner
 from lightx2v.models.runners.wan.wan_skyreels_v2_df_runner import WanSkyreelsV2DFRunner
 from lightx2v.models.runners.graph_runner import GraphRunner
@@ -40,11 +41,11 @@ def init_runner(config):

 async def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
+    parser.add_argument("--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox"], default="hunyuan")
    parser.add_argument("--task", type=str, choices=["t2v", "i2v"], default="t2v")
    parser.add_argument("--model_path", type=str, required=True)
    parser.add_argument("--config_json", type=str, required=True)
-    parser.add_argument("--prompt_enhancer", type=str, default=None)
+    parser.add_argument("--use_prompt_enhancer", action="store_true")

    parser.add_argument("--prompt", type=str, required=True)
    parser.add_argument("--negative_prompt", type=str, default="")

--- a/lightx2v/models/networks/wan/distill_model.py
+++ b/lightx2v/models/networks/wan/distill_model.py
+import os
+import sys
+import torch
+import glob
+import json
+from lightx2v.models.networks.wan.model import WanModel
+from lightx2v.models.networks.wan.weights.pre_weights import WanPreWeights
+from lightx2v.models.networks.wan.weights.post_weights import WanPostWeights
+from lightx2v.models.networks.wan.weights.transformer_weights import (
+    WanTransformerWeights,
+)
+from lightx2v.models.networks.wan.infer.pre_infer import WanPreInfer
+from lightx2v.models.networks.wan.infer.post_infer import WanPostInfer
+from lightx2v.models.networks.wan.infer.transformer_infer import (
+    WanTransformerInfer,
+)
+from lightx2v.models.networks.wan.infer.feature_caching.transformer_infer import (
+    WanTransformerInferTeaCaching,
+)
+from safetensors import safe_open
+import lightx2v.attentions.distributed.ulysses.wrap as ulysses_dist_wrap
+import lightx2v.attentions.distributed.ring.wrap as ring_dist_wrap
+from lightx2v.utils.envs import *
+from loguru import logger
+
+
+class WanDistillModel(WanModel):
+    pre_weight_class = WanPreWeights
+    post_weight_class = WanPostWeights
+    transformer_weight_class = WanTransformerWeights
+
+    def __init__(self, model_path, config, device):
+        super().__init__(model_path, config, device)
+
+    def _load_ckpt(self):
+        use_bfloat16 = self.config.get("use_bfloat16", True)
+        ckpt_path = os.path.join(self.model_path, "distill_model.pt")
+        if not os.path.exists(ckpt_path):
+            # 文件不存在，调用父类的 _load_ckpt 方法
+            return super()._load_ckpt()
+
+        weight_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+
+        dtype = torch.bfloat16 if use_bfloat16 else None
+        for key, value in weight_dict.items():
+            weight_dict[key] = value.to(device=self.device, dtype=dtype)
+
+        return weight_dict
--- a/lightx2v/models/runners/wan/wan_causvid_runner.py
+++ b/lightx2v/models/runners/wan/wan_causvid_runner.py
@@ -8,7 +8,7 @@ from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.models.runners.wan.wan_runner import WanRunner
 from lightx2v.models.runners.default_runner import DefaultRunner
 from lightx2v.models.schedulers.wan.scheduler import WanScheduler
-from lightx2v.models.schedulers.wan.causvid.scheduler import WanCausVidScheduler
+from lightx2v.models.schedulers.wan.step_distill.scheduler import WanStepDistillScheduler
 from lightx2v.utils.profiler import ProfilingContext4Debug, ProfilingContext
 from lightx2v.models.input_encoders.hf.t5.model import T5EncoderModel
 from lightx2v.models.input_encoders.hf.xlm_roberta.model import CLIPModel
@@ -38,7 +38,7 @@ class WanCausVidRunner(WanRunner):
        self.num_fragments = self.config["num_fragments"]

    def init_scheduler(self):
-        scheduler = WanCausVidScheduler(self.config)
+        scheduler = WanStepDistillScheduler(self.config)
        self.model.set_scheduler(scheduler)

    def set_target_shape(self):

--- a/lightx2v/models/runners/wan/wan_distill_runner.py
+++ b/lightx2v/models/runners/wan/wan_distill_runner.py
+import os
+import numpy as np
+import torch
+import torchvision.transforms.functional as TF
+from PIL import Image
+from lightx2v.utils.registry_factory import RUNNER_REGISTER
+from lightx2v.models.runners.wan.wan_runner import WanRunner
+from lightx2v.models.schedulers.wan.step_distill.scheduler import WanStepDistillScheduler
+from lightx2v.utils.profiler import ProfilingContext
+from lightx2v.models.input_encoders.hf.t5.model import T5EncoderModel
+from lightx2v.models.input_encoders.hf.xlm_roberta.model import CLIPModel
+from lightx2v.models.networks.wan.model import WanModel
+from lightx2v.models.networks.wan.distill_model import WanDistillModel
+from lightx2v.models.networks.wan.lora_adapter import WanLoraWrapper
+from lightx2v.models.video_encoders.hf.wan.vae import WanVAE
+from lightx2v.models.video_encoders.hf.wan.vae_tiny import WanVAE_tiny
+from lightx2v.utils.utils import cache_video
+from loguru import logger
+
+
+@RUNNER_REGISTER("wan2.1_distill")
+class WanDistillRunner(WanRunner):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def load_transformer(self, init_device):
+        model = WanDistillModel(self.config.model_path, self.config, init_device)
+        if self.config.lora_path:
+            lora_wrapper = WanLoraWrapper(model)
+            lora_name = lora_wrapper.load_lora(self.config.lora_path)
+            lora_wrapper.apply_lora(lora_name, self.config.strength_model)
+            logger.info(f"Loaded LoRA: {lora_name}")
+        return model
+
+    def init_scheduler(self):
+        if self.config.feature_caching == "NoCaching":
+            scheduler = WanStepDistillScheduler(self.config)
+        else:
+            raise NotImplementedError(f"Unsupported feature_caching type: {self.config.feature_caching}")
+        self.model.set_scheduler(scheduler)
--- a/lightx2v/models/schedulers/wan/causvid/scheduler.py
+++ b/lightx2v/models/schedulers/wan/causvid/scheduler.py
@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Union
 from lightx2v.models.schedulers.wan.scheduler import WanScheduler


-class WanCausVidScheduler(WanScheduler):
+class WanStepDistillScheduler(WanScheduler):
    def __init__(self, config):
        super().__init__(config)
        self.denoising_step_list = config.denoising_step_list

--- a/scripts/start_server_enhancer.sh
+++ b/scripts/start_server_enhancer.sh
 #!/bin/bash

 # set path and first
-lightx2v_path=
-model_path=
-prompt_enhancer_path=
+lightx2v_path="/data/lightx2v-dev/"
+model_path="/data/lightx2v-dev/Wan2.1-T2V-14B/"

 # check section
 if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
-    cuda_devices=0,1
+    cuda_devices=0
    echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
 fi
@@ -29,10 +28,12 @@ export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
 export ENABLE_PROFILING_DEBUG=true
 export ENABLE_GRAPH_MODE=false

-python -m lightx2v.api_server \
--model_cls wan2.1_causvid \
+python -m lightx2v.infer \
+--model_cls wan2.1_distill \
 --task t2v \
 --model_path $model_path \
--config_json ${lightx2v_path}/configs/wan_t2v_causvid.json \
--prompt_enhancer ${prompt_enhancer_path} \
--port 8000
+--config_json ${lightx2v_path}/configs/wan_t2v_distill.json \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
+--use_prompt_enhancer \
+--negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v.mp4
--- a/scripts/run_wan_t2v_enhancer.sh
+++ b/scripts/run_wan_t2v_enhancer.sh
@@ -33,8 +33,8 @@ python -m lightx2v.infer \
 --model_cls wan2.1 \
 --task t2v \
 --model_path $model_path \
--config_json ${lightx2v_path}/configs/wan_t2v.json \
+--config_json ${lightx2v_path}/configs/wan_t2v_enhancer.json \
 --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
--prompt_enhancer ${prompt_enhancer_path} \
+--use_prompt_enhancer \
 --negative_prompt 色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走 \
 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v.mp4