Commit 8dbe1be6 authored by gushiqiao

Support wan2.2 ti2v-5B and fix some bugs.

parent fa7aedbe
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"use_image_encoder": false
}
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24
}
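The two JSON blocks above are the new wan2.2 ti2v-5B configs: the first, with "use_image_encoder": false, is the i2v variant and the second is t2v. They are selected via --config_json in the launch scripts at the bottom of this commit. A minimal sketch, assuming a hypothetical load_config helper rather than the project's actual loader, of how such a file can be merged over parsed CLI arguments:

import argparse
import json

def load_config(args: argparse.Namespace) -> dict:
    # start from the parsed CLI arguments, then let the JSON keys override them
    config = vars(args).copy()
    with open(args.config_json) as f:
        config.update(json.load(f))  # e.g. infer_steps, target_height, fps
    return config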
......@@ -60,8 +60,8 @@ class VAERunner:
def _run_vae_encoder(self, img):
img = image_transporter.load_image(img)
vae_encode_out, kwargs = self.runner.run_vae_encoder(img)
return vae_encode_out, kwargs
vae_encoder_out, kwargs = self.runner.run_vae_encoder(img)
return vae_encoder_out, kwargs
def _run_vae_decoder(self, latents):
latents = tensor_transporter.load_tensor(latents)
......@@ -72,9 +72,9 @@ class VAERunner:
def run_vae_encoder(message: Message):
try:
global runner
vae_encode_out, kwargs = runner._run_vae_encoder(message.img)
vae_encoder_out, kwargs = runner._run_vae_encoder(message.img)
VAEServiceStatus.complete_task(message)
return vae_encode_out, kwargs
return vae_encoder_out, kwargs
except Exception as e:
logger.error(f"task_id {message.task_id} failed: {str(e)}")
VAEServiceStatus.record_failed_task(message, error=str(e))
......@@ -95,9 +95,9 @@ def run_vae_decoder(message: Message):
def v1_local_vae_model_encoder_generate(message: Message):
try:
task_id = VAEServiceStatus.start_task(message)
vae_encode_out, kwargs = run_vae_encoder(message)
output = tensor_transporter.prepare_tensor(vae_encode_out)
del vae_encode_out
vae_encoder_out, kwargs = run_vae_encoder(message)
output = tensor_transporter.prepare_tensor(vae_encoder_out)
del vae_encoder_out
return {"task_id": task_id, "task_status": "completed", "output": output, "kwargs": kwargs}
except RuntimeError as e:
return {"error": str(e)}
......
......@@ -42,7 +42,11 @@ def init_runner(config):
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_cls", type=str, required=True, choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox", "wan2.1_audio", "wan2.2_moe"], default="wan2.1"
"--model_cls",
type=str,
required=True,
choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox", "wan2.1_audio", "wan2.2_moe", "wan2.2"],
default="wan2.1",
)
parser.add_argument("--task", type=str, choices=["t2v", "i2v"], default="t2v")
......
......@@ -51,7 +51,7 @@ class WanAudioPreInfer(WanPreInfer):
seq_len = self.scheduler.seq_len
clip_fea = inputs["image_encoder_output"]["clip_encoder_out"]
ref_image_encoder = inputs["image_encoder_output"]["vae_encode_out"]
ref_image_encoder = inputs["image_encoder_output"]["vae_encoder_out"]
batch_size = len(x)
num_channels, num_frames, height, width = x[0].shape
_, ref_num_channels, ref_num_frames, _, _ = ref_image_encoder.shape
......
......@@ -25,7 +25,7 @@ class WanPostInfer:
if GET_DTYPE() != "BF16":
x = x.float()
x.mul_(1 + e[1].squeeze(0)).add_(e[0].squeeze(0))
x.mul_(1 + e[1].squeeze()).add_(e[0].squeeze())
if GET_DTYPE() != "BF16":
x = x.to(torch.bfloat16)
......
......@@ -35,7 +35,10 @@ class WanPreInfer:
t = self.scheduler.df_timesteps[self.scheduler.step_index].unsqueeze(0)
assert t.dim() == 2  # DF inference models use a 2-D timestep
else:
t = torch.stack([self.scheduler.timesteps[self.scheduler.step_index]])
timestep = self.scheduler.timesteps[self.scheduler.step_index]
t = torch.stack([timestep])
if hasattr(self.scheduler, "mask"):
t = (self.scheduler.mask[0][:, ::2, ::2] * t).flatten()
if positive:
context = inputs["text_encoder_output"]["context"]
......@@ -47,17 +50,18 @@ class WanPreInfer:
clip_fea = inputs["image_encoder_output"]["clip_encoder_out"]
if self.config.get("changing_resolution", False):
image_encoder = inputs["image_encoder_output"]["vae_encode_out"][self.scheduler.changing_resolution_index]
image_encoder = inputs["image_encoder_output"]["vae_encoder_out"][self.scheduler.changing_resolution_index]
else:
image_encoder = inputs["image_encoder_output"]["vae_encode_out"]
image_encoder = inputs["image_encoder_output"]["vae_encoder_out"]
frame_seq_length = (image_encoder.size(2) // 2) * (image_encoder.size(3) // 2)
if kv_end - kv_start >= frame_seq_length:  # for CausalVid, take the matching slice of image_encoder
idx_s = kv_start // frame_seq_length
idx_e = kv_end // frame_seq_length
image_encoder = image_encoder[:, idx_s:idx_e, :, :]
y = image_encoder
x = torch.cat([x, y], dim=0)
if image_encoder is not None:
frame_seq_length = (image_encoder.size(2) // 2) * (image_encoder.size(3) // 2)
if kv_end - kv_start >= frame_seq_length:  # for CausalVid, take the matching slice of image_encoder
idx_s = kv_start // frame_seq_length
idx_e = kv_end // frame_seq_length
image_encoder = image_encoder[:, idx_s:idx_e, :, :]
y = image_encoder
x = torch.cat([x, y], dim=0)
# embeddings
x = weights.patch_embedding.apply(x.unsqueeze(0))
......
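The new hasattr(self.scheduler, "mask") branch above expands the scalar timestep into per-patch timesteps for wan2.2: patches of the conditioned first latent frame (mask == 0) get timestep 0, all other patches keep the scheduler timestep. A minimal sketch with assumed latent and patch sizes:

import torch

F, H, W = 31, 44, 80                              # hypothetical latent frames / height / width
mask = torch.ones(F, H, W)                        # one channel of masks_like(latents, zero=True)
mask[0] = 0                                       # i2v: first latent frame holds the encoded image
t = torch.tensor([1000.0])                        # scalar timestep from the scheduler

t_per_patch = (mask[:, ::2, ::2] * t).flatten()   # ::2 matches the assumed 2x2 spatial patching
print(t_per_patch.shape)                          # F * (H // 2) * (W // 2) entries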
......@@ -309,12 +309,13 @@ class WanTransformerInfer(BaseTransformerInfer):
return x
def infer_modulation(self, weights, embed0):
if embed0.dim() == 3:
if embed0.dim() == 3 and embed0.shape[2] == 1:
modulation = weights.modulation.tensor.unsqueeze(2)
embed0 = (modulation + embed0).chunk(6, dim=1)
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = [ei.squeeze(1) for ei in embed0]
elif embed0.dim() == 2:
else:
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (weights.modulation.tensor + embed0).chunk(6, dim=1)
if self.clean_cuda_cache:
del embed0
torch.cuda.empty_cache()
......@@ -330,11 +331,11 @@ class WanTransformerInfer(BaseTransformerInfer):
def infer_self_attn(self, weights, grid_sizes, x, seq_lens, freqs, shift_msa, scale_msa):
if hasattr(weights, "smooth_norm1_weight"):
norm1_weight = (1 + scale_msa.squeeze(0)) * weights.smooth_norm1_weight.tensor
norm1_bias = shift_msa.squeeze(0) * weights.smooth_norm1_bias.tensor
norm1_weight = (1 + scale_msa.squeeze()) * weights.smooth_norm1_weight.tensor
norm1_bias = shift_msa.squeeze() * weights.smooth_norm1_bias.tensor
else:
norm1_weight = 1 + scale_msa.squeeze(0)
norm1_bias = shift_msa.squeeze(0)
norm1_weight = 1 + scale_msa.squeeze()
norm1_bias = shift_msa.squeeze()
norm1_out = weights.norm1.apply(x)
......@@ -398,9 +399,9 @@ class WanTransformerInfer(BaseTransformerInfer):
def infer_cross_attn(self, weights, x, context, y_out, gate_msa):
if GET_DTYPE() != "BF16":
x = x.float() + y_out.float() * gate_msa.squeeze(0)
x = x.float() + y_out.float() * gate_msa.squeeze()
else:
x.add_(y_out * gate_msa.squeeze(0))
x.add_(y_out * gate_msa.squeeze())
norm3_out = weights.norm3.apply(x)
if self.task == "i2v" and self.config.get("use_image_encoder", True):
......@@ -473,11 +474,11 @@ class WanTransformerInfer(BaseTransformerInfer):
torch.cuda.empty_cache()
if hasattr(weights, "smooth_norm2_weight"):
norm2_weight = (1 + c_scale_msa.squeeze(0)) * weights.smooth_norm2_weight.tensor
norm2_bias = c_shift_msa.squeeze(0) * weights.smooth_norm2_bias.tensor
norm2_weight = (1 + c_scale_msa.squeeze()) * weights.smooth_norm2_weight.tensor
norm2_bias = c_shift_msa.squeeze() * weights.smooth_norm2_bias.tensor
else:
norm2_weight = 1 + c_scale_msa.squeeze(0)
norm2_bias = c_shift_msa.squeeze(0)
norm2_weight = 1 + c_scale_msa.squeeze()
norm2_bias = c_shift_msa.squeeze()
norm2_out = weights.norm2.apply(x)
if GET_DTYPE() != "BF16":
......@@ -499,9 +500,9 @@ class WanTransformerInfer(BaseTransformerInfer):
def post_process(self, x, y, c_gate_msa):
if GET_DTYPE() != "BF16":
x = x.float() + y.float() * c_gate_msa.squeeze(0)
x = x.float() + y.float() * c_gate_msa.squeeze()
else:
x.add_(y * c_gate_msa.squeeze(0))
x.add_(y * c_gate_msa.squeeze())
if self.clean_cuda_cache:
del y, c_gate_msa
......
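The repeated squeeze(0) -> squeeze() changes in this file matter once the modulation chunks carry extra singleton axes (per-token timesteps in wan2.2): squeeze(0) only drops a leading batch dimension, while squeeze() drops every size-1 dimension. A minimal sketch with hypothetical shapes:

import torch

dim = 3072                               # hypothetical hidden size
chunk_21 = torch.randn(1, dim)           # wan2.1-style modulation chunk
chunk_22 = torch.randn(1, 75, 1, dim)    # hypothetical per-token chunk with a kept singleton axis

print(chunk_21.squeeze(0).shape)         # (dim,)          -> fine either way
print(chunk_22.squeeze(0).shape)         # (75, 1, dim)    -> stray singleton axis left behind
print(chunk_22.squeeze().shape)          # (75, dim)       -> broadcasts cleanly against (seq_len, dim)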
......@@ -112,12 +112,12 @@ class BaseRunner(ABC):
pass
@abstractmethod
def get_encoder_output_i2v(self, clip_encoder_out: Any, vae_encode_out: Any, text_encoder_output: Any, img: Any) -> Dict[str, Any]:
def get_encoder_output_i2v(self, clip_encoder_out: Any, vae_encoder_out: Any, text_encoder_output: Any, img: Any) -> Dict[str, Any]:
"""Combine encoder outputs for i2v task
Args:
clip_encoder_out: CLIP encoder output
vae_encode_out: VAE encoder output
vae_encoder_out: VAE encoder output
text_encoder_output: Text encoder output
img: Original image
......
......@@ -49,7 +49,7 @@ class CogvideoxRunner(DefaultRunner):
# TODO: implement vae encoder for Cogvideox
raise NotImplementedError("I2V inference is not implemented for Cogvideox.")
def get_encoder_output_i2v(self, clip_encoder_out, vae_encode_out, text_encoder_output, img):
def get_encoder_output_i2v(self, clip_encoder_out, vae_encoder_out, text_encoder_output, img):
# TODO: Implement image encoder for Cogvideox-I2V
raise ValueError(f"Unsupported model class: {self.config['model_cls']}")
......
......@@ -10,7 +10,7 @@ import torch.distributed as dist
from lightx2v.utils.envs import *
from lightx2v.utils.generate_task_id import generate_task_id
from lightx2v.utils.profiler import ProfilingContext, ProfilingContext4Debug
from lightx2v.utils.utils import save_to_video, vae_to_comfyui_image
from lightx2v.utils.utils import save_to_video, vae_to_comfyui_image, cache_video
from .base_runner import BaseRunner
......@@ -176,6 +176,8 @@ class DefaultRunner(BaseRunner):
self.model = self.load_transformer()
self.init_scheduler()
self.model.scheduler.prepare(self.inputs["image_encoder_output"])
if self.config.get("model_cls") == "wan2.2":
self.inputs["image_encoder_output"]["vae_encoder_out"] = None
latents, generator = self.run()
self.end_run()
return latents, generator
......@@ -212,13 +214,12 @@ class DefaultRunner(BaseRunner):
self.config["prompt_enhanced"] = self.post_prompt_enhancer()
self.inputs = self.run_input_encoder()
self.set_target_shape()
latents, generator = self.run_dit()
images = self.run_vae_decoder(latents, generator)
images = vae_to_comfyui_image(images)
if self.config["model_cls"] != "wan2.2":
images = vae_to_comfyui_image(images)
if "video_frame_interpolation" in self.config:
assert self.vfi_model is not None and self.config["video_frame_interpolation"].get("target_fps", None) is not None
......@@ -238,7 +239,11 @@ class DefaultRunner(BaseRunner):
if not self.config.get("parallel_attn_type", None) or dist.get_rank() == 0:
logger.info(f"Saving video to {self.config.save_video_path}")
save_to_video(images, self.config.save_video_path, fps=fps, method="ffmpeg") # type: ignore
if self.config["model_cls"] != "wan2.2":
save_to_video(images, self.config.save_video_path, fps=fps, method="ffmpeg") # type: ignore
else:
cache_video(tensor=images, save_file=self.config.save_video_path, fps=fps, nrow=1, normalize=True, value_range=(-1, 1))
del latents, generator
torch.cuda.empty_cache()
......
......@@ -137,8 +137,8 @@ class HunyuanRunner(DefaultRunner):
return img_latents, kwargs
def get_encoder_output_i2v(self, clip_encoder_out, vae_encode_out, text_encoder_output, img):
image_encoder_output = {"img": img, "img_latents": vae_encode_out}
def get_encoder_output_i2v(self, clip_encoder_out, vae_encoder_out, text_encoder_output, img):
image_encoder_output = {"img": img, "img_latents": vae_encoder_out}
return {"text_encoder_output": text_encoder_output, "image_encoder_output": image_encoder_output}
def set_target_shape(self):
......
......@@ -435,10 +435,10 @@ class WanAudioRunner(WanRunner): # type:ignore
if os.path.isfile(self.config.image_path):
with ProfilingContext("Run Img Encoder"):
vae_encode_out, clip_encoder_out = self.run_image_encoder(self.config, self.vae_encoder)
vae_encoder_out, clip_encoder_out = self.run_image_encoder(self.config, self.vae_encoder)
image_encoder_output = {
"clip_encoder_out": clip_encoder_out,
"vae_encode_out": vae_encode_out,
"vae_encoder_out": vae_encoder_out,
}
with ProfilingContext("Run Text Encoder"):
......@@ -659,11 +659,11 @@ class WanAudioRunner(WanRunner): # type:ignore
# vae encode
cond_frms = rearrange(cond_frms, "1 C H W -> 1 C 1 H W")
vae_encode_out = vae_model.encode(cond_frms.to(torch.float), config)
if isinstance(vae_encode_out, list):
vae_encode_out = torch.stack(vae_encode_out, dim=0).to(torch.bfloat16)
vae_encoder_out = vae_model.encode(cond_frms.to(torch.float), config)
if isinstance(vae_encoder_out, list):
vae_encoder_out = torch.stack(vae_encoder_out, dim=0).to(torch.bfloat16)
return vae_encode_out, clip_encoder_out
return vae_encoder_out, clip_encoder_out
def set_target_shape(self):
"""Set target shape for generation"""
......
......@@ -3,6 +3,8 @@ import gc
import numpy as np
import torch
import torchvision.transforms.functional as TF
import torch.distributed as dist
from loguru import logger
from PIL import Image
from lightx2v.utils.registry_factory import RUNNER_REGISTER
from lightx2v.models.runners.default_runner import DefaultRunner
......@@ -14,16 +16,16 @@ from lightx2v.models.schedulers.wan.feature_caching.scheduler import (
WanSchedulerCaching,
WanSchedulerTaylorCaching,
)
from lightx2v.utils.profiler import ProfilingContext
from lightx2v.utils.utils import *
from lightx2v.models.input_encoders.hf.t5.model import T5EncoderModel
from lightx2v.models.input_encoders.hf.xlm_roberta.model import CLIPModel
from lightx2v.models.networks.wan.model import WanModel, Wan22MoeModel
from lightx2v.models.networks.wan.lora_adapter import WanLoraWrapper
from lightx2v.models.video_encoders.hf.wan.vae import WanVAE
from lightx2v.models.video_encoders.hf.wan.vae_2_2 import Wan2_2_VAE
from lightx2v.models.video_encoders.hf.wan.vae_tiny import WanVAE_tiny
from lightx2v.utils.utils import cache_video
from loguru import logger
from lightx2v.utils.utils import cache_video, best_output_size
from lightx2v.utils.profiler import ProfilingContext
@RUNNER_REGISTER("wan2.1")
......@@ -218,8 +220,8 @@ class WanRunner(DefaultRunner):
return vae_encode_out_list
else:
self.config.lat_h, self.config.lat_w = lat_h, lat_w
vae_encode_out = self.get_vae_encoder_output(img, lat_h, lat_w)
return vae_encode_out
vae_encoder_out = self.get_vae_encoder_output(img, lat_h, lat_w)
return vae_encoder_out
def get_vae_encoder_output(self, img, lat_h, lat_w):
h = lat_h * self.config.vae_stride[1]
......@@ -238,7 +240,7 @@ class WanRunner(DefaultRunner):
msk = msk.transpose(1, 2)[0]
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
self.vae_encoder = self.load_vae_encoder()
vae_encode_out = self.vae_encoder.encode(
vae_encoder_out = self.vae_encoder.encode(
[
torch.concat(
[
......@@ -254,13 +256,13 @@ class WanRunner(DefaultRunner):
del self.vae_encoder
torch.cuda.empty_cache()
gc.collect()
vae_encode_out = torch.concat([msk, vae_encode_out]).to(torch.bfloat16)
return vae_encode_out
vae_encoder_out = torch.concat([msk, vae_encoder_out]).to(torch.bfloat16)
return vae_encoder_out
def get_encoder_output_i2v(self, clip_encoder_out, vae_encode_out, text_encoder_output, img):
def get_encoder_output_i2v(self, clip_encoder_out, vae_encoder_out, text_encoder_output, img):
image_encoder_output = {
"clip_encoder_out": clip_encoder_out,
"vae_encode_out": vae_encode_out,
"vae_encoder_out": vae_encoder_out,
}
return {
"text_encoder_output": text_encoder_output,
......@@ -359,3 +361,58 @@ class Wan22MoeRunner(WanRunner):
self.init_device,
)
return MultiModelStruct([high_noise_model, low_noise_model], self.config, self.config.boundary)
@RUNNER_REGISTER("wan2.2")
class Wan22DenseRunner(WanRunner):
def __init__(self, config):
super().__init__(config)
def load_vae_decoder(self):
vae_config = {
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.2_VAE.pth"),
"device": self.init_device,
}
vae_decoder = Wan2_2_VAE(**vae_config)
return vae_decoder
def load_vae_encoder(self):
vae_config = {
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.2_VAE.pth"),
"device": self.init_device,
}
if self.config.task != "i2v":
return None
else:
return Wan2_2_VAE(**vae_config)
def load_vae(self):
vae_encoder = self.load_vae_encoder()
vae_decoder = self.load_vae_decoder()
return vae_encoder, vae_decoder
def run_vae_encoder(self, img):
max_area = self.config.target_height * self.config.target_width
ih, iw = img.height, img.width
dh, dw = self.config.patch_size[1] * self.config.vae_stride[1], self.config.patch_size[2] * self.config.vae_stride[2]
ow, oh = best_output_size(iw, ih, dw, dh, max_area)
scale = max(ow / iw, oh / ih)
img = img.resize((round(iw * scale), round(ih * scale)), Image.LANCZOS)
# center-crop
x1 = (img.width - ow) // 2
y1 = (img.height - oh) // 2
img = img.crop((x1, y1, x1 + ow, y1 + oh))
assert img.width == ow and img.height == oh
# to tensor
img = TF.to_tensor(img).sub_(0.5).div_(0.5).cuda().unsqueeze(1)
vae_encoder_out = self.get_vae_encoder_output(img)
self.config.lat_w, self.config.lat_h = ow // self.config.vae_stride[2], oh // self.config.vae_stride[1]
return vae_encoder_out
def get_vae_encoder_output(self, img):
z = self.vae_encoder.encode(img)
return z
......@@ -36,9 +36,9 @@ class WanSkyreelsV2DFRunner(WanRunner): # Diffusion forcing for SkyReelsV2 DF I
config.lat_h = lat_h
config.lat_w = lat_w
vae_encode_out = vae_model.encode([torch.nn.functional.interpolate(img[None].cpu(), size=(h, w), mode="bicubic").transpose(0, 1).cuda()], config)[0]
vae_encode_out = vae_encode_out.to(torch.bfloat16)
return vae_encode_out
vae_encoder_out = vae_model.encode([torch.nn.functional.interpolate(img[None].cpu(), size=(h, w), mode="bicubic").transpose(0, 1).cuda()], config)[0]
vae_encoder_out = vae_encoder_out.to(torch.bfloat16)
return vae_encoder_out
def set_target_shape(self):
if os.path.isfile(self.config.image_path):
......
......@@ -2,8 +2,9 @@ import math
import numpy as np
import torch
import gc
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Union
from lightx2v.models.schedulers.scheduler import BaseScheduler
from lightx2v.utils.utils import masks_like
class WanScheduler(BaseScheduler):
......@@ -19,10 +20,11 @@ class WanScheduler(BaseScheduler):
self.solver_order = 2
self.noise_pred = None
self.sample_guide_scale = self.config.sample_guide_scale
self.caching_records_2 = [True] * self.config.infer_steps
def prepare(self, image_encoder_output=None):
if self.config["model_cls"] == "wan2.2" and self.config["task"] == "i2v":
self.vae_encoder_out = image_encoder_output["vae_encoder_out"]
self.generator = torch.Generator(device=self.device)
self.generator.manual_seed(self.config.seed)
......@@ -57,6 +59,12 @@ class WanScheduler(BaseScheduler):
device=self.device,
generator=self.generator,
)
if self.config["model_cls"] == "wan2.2":
if self.config["task"] == "t2v":
self.mask = masks_like(self.latents, zero=False)
elif self.config["task"] == "i2v":
self.mask = masks_like(self.latents, zero=True)
self.latents = (1.0 - self.mask) * self.vae_encoder_out + self.mask * self.latents
def set_timesteps(
self,
......@@ -354,3 +362,5 @@ class WanScheduler(BaseScheduler):
self.lower_order_nums += 1
self.latents = prev_sample
if self.config["model_cls"] == "wan2.2" and self.config["task"] == "i2v":
self.latents = (1.0 - self.mask) * self.vae_encoder_out + self.mask * self.latents
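The new blend in prepare() and after each scheduler step pins the conditioned frame for wan2.2 i2v: wherever mask == 0 the sample is overwritten with the VAE-encoded image latents, so only the mask == 1 positions are actually denoised. A minimal sketch with assumed shapes:

import torch

C, F, H, W = 48, 31, 44, 80                  # hypothetical latent dims
latents = torch.randn(C, F, H, W)            # current sample
vae_encoder_out = torch.randn(C, 1, H, W)    # encoded conditioning frame (assumed shape)

mask = torch.ones_like(latents)              # masks_like(latents, zero=True)
mask[:, 0] = 0                               # frame 0 is pinned

latents = (1.0 - mask) * vae_encoder_out + mask * latents
assert torch.equal(latents[:, 0], vae_encoder_out[:, 0])   # frame 0 equals the conditioning latents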
......@@ -294,3 +294,43 @@ def find_hf_model_path(config, ckpt_config_key=None, subdir=["original", "fp8",
logger.info(f"Found Hugging Face model files in: {path}")
return path
raise FileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
def masks_like(tensor, zero=False, generator=None, p=0.2):
assert isinstance(tensor, torch.Tensor)
out = torch.ones_like(tensor)
if zero:
if generator is not None:
# draw a random number to decide whether to zero the first frame
random_num = torch.rand(1, generator=generator, device=generator.device).item()
if random_num < p:
out[:, 0] = torch.zeros_like(out[:, 0])
else:
out[:, 0] = torch.zeros_like(out[:, 0])
return out
def best_output_size(w, h, dw, dh, expected_area):
# float output size
ratio = w / h
ow = (expected_area * ratio) ** 0.5
oh = expected_area / ow
# process width first
ow1 = int(ow // dw * dw)
oh1 = int(expected_area / ow1 // dh * dh)
assert ow1 % dw == 0 and oh1 % dh == 0 and ow1 * oh1 <= expected_area
ratio1 = ow1 / oh1
# process height first
oh2 = int(oh // dh * dh)
ow2 = int(expected_area / oh2 // dw * dw)
assert oh2 % dh == 0 and ow2 % dw == 0 and ow2 * oh2 <= expected_area
ratio2 = ow2 / oh2
# compare ratios
if max(ratio / ratio1, ratio1 / ratio) < max(ratio / ratio2, ratio2 / ratio):
return ow1, oh1
else:
return ow2, oh2
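A worked usage sketch for best_output_size (hypothetical 1024x768 input; assuming patch_size (1, 2, 2) and vae_stride 16, so dw = dh = 32; expected_area is the 1280x704 target from the configs above). The width-first candidate wins here because its aspect ratio is closer to the input's:

from lightx2v.utils.utils import best_output_size

ow, oh = best_output_size(w=1024, h=768, dw=32, dh=32, expected_area=1280 * 704)
print(ow, oh)   # 1088 800 -- both divisible by 32, and 1088 * 800 <= 901120

With vae_stride 16, Wan22DenseRunner would then set lat_w = 1088 // 16 = 68 and lat_h = 800 // 16 = 50.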
#!/bin/bash
# set paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=5
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.2 \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/wan22/wan_ti2v_i2v.json \
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_0.jpg \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_i2v.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=1
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.2 \
--task t2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/wan22/wan_ti2v_t2v.json \
--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_dense_t2v.mp4