Merge branch 'main' of https://github.com/ModelTC/LightX2V into main

826e9b03 · wangshankun · 6de0996c · 1ff745e4 · 826e9b03 · 826e9b03
Commit 826e9b03 authored Aug 08, 2025 by wangshankun
15 changed files
--- a/configs/dist_infer/wan22_ti2v_i2v_cfg.json
+++ b/configs/dist_infer/wan22_ti2v_i2v_cfg.json
+{
+    "infer_steps": 50,
+    "target_video_length": 121,
+    "text_len": 512,
+    "target_height": 704,
+    "target_width": 1280,
+    "num_channels_latents": 48,
+    "vae_stride": [4, 16, 16],
+    "self_attn_1_type": "flash_attn3",
+    "cross_attn_1_type": "flash_attn3",
+    "cross_attn_2_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 5.0,
+    "sample_shift": 5.0,
+    "enable_cfg": true,
+    "cpu_offload": false,
+    "offload_granularity": "model",
+    "fps": 24,
+    "use_image_encoder": false,
+    "parallel": {
+        "cfg_p_size": 2
+    }
+}
--- a/configs/dist_infer/wan22_ti2v_i2v_cfg_ulysses.json
+++ b/configs/dist_infer/wan22_ti2v_i2v_cfg_ulysses.json
+{
+    "infer_steps": 50,
+    "target_video_length": 121,
+    "text_len": 512,
+    "target_height": 704,
+    "target_width": 1280,
+    "num_channels_latents": 48,
+    "vae_stride": [4, 16, 16],
+    "self_attn_1_type": "flash_attn3",
+    "cross_attn_1_type": "flash_attn3",
+    "cross_attn_2_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 5.0,
+    "sample_shift": 5.0,
+    "enable_cfg": true,
+    "cpu_offload": false,
+    "offload_granularity": "model",
+    "fps": 24,
+    "use_image_encoder": false,
+    "parallel": {
+        "seq_p_size": 4,
+        "seq_p_attn_type": "ulysses",
+        "cfg_p_size": 2
+    }
+}
--- a/configs/dist_infer/wan22_ti2v_i2v_ulysses.json
+++ b/configs/dist_infer/wan22_ti2v_i2v_ulysses.json
+{
+    "infer_steps": 50,
+    "target_video_length": 121,
+    "text_len": 512,
+    "target_height": 704,
+    "target_width": 1280,
+    "num_channels_latents": 48,
+    "vae_stride": [4, 16, 16],
+    "self_attn_1_type": "flash_attn3",
+    "cross_attn_1_type": "flash_attn3",
+    "cross_attn_2_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 5.0,
+    "sample_shift": 5.0,
+    "enable_cfg": true,
+    "cpu_offload": false,
+    "offload_granularity": "model",
+    "fps": 24,
+    "use_image_encoder": false,
+    "parallel": {
+        "seq_p_size": 4,
+        "seq_p_attn_type": "ulysses"
+    }
+}
--- a/configs/dist_infer/wan22_ti2v_t2v_cfg.json
+++ b/configs/dist_infer/wan22_ti2v_t2v_cfg.json
+{
+    "infer_steps": 50,
+    "target_video_length": 121,
+    "text_len": 512,
+    "target_height": 704,
+    "target_width": 1280,
+    "num_channels_latents": 48,
+    "vae_stride": [4, 16, 16],
+    "self_attn_1_type": "flash_attn3",
+    "cross_attn_1_type": "flash_attn3",
+    "cross_attn_2_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 5.0,
+    "sample_shift": 5.0,
+    "enable_cfg": true,
+    "cpu_offload": false,
+    "offload_granularity": "model",
+    "fps": 24,
+    "parallel": {
+        "cfg_p_size": 2
+    }
+}
--- a/configs/dist_infer/wan22_ti2v_t2v_cfg_ulysses.json
+++ b/configs/dist_infer/wan22_ti2v_t2v_cfg_ulysses.json
+{
+    "infer_steps": 50,
+    "target_video_length": 121,
+    "text_len": 512,
+    "target_height": 704,
+    "target_width": 1280,
+    "num_channels_latents": 48,
+    "vae_stride": [4, 16, 16],
+    "self_attn_1_type": "flash_attn3",
+    "cross_attn_1_type": "flash_attn3",
+    "cross_attn_2_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 5.0,
+    "sample_shift": 5.0,
+    "enable_cfg": true,
+    "cpu_offload": false,
+    "offload_granularity": "model",
+    "fps": 24,
+    "parallel": {
+        "seq_p_size": 4,
+        "seq_p_attn_type": "ulysses",
+        "cfg_p_size": 2
+    }
+}
--- a/configs/dist_infer/wan22_ti2v_t2v_ulysses.json
+++ b/configs/dist_infer/wan22_ti2v_t2v_ulysses.json
+{
+    "infer_steps": 50,
+    "target_video_length": 121,
+    "text_len": 512,
+    "target_height": 704,
+    "target_width": 1280,
+    "num_channels_latents": 48,
+    "vae_stride": [4, 16, 16],
+    "self_attn_1_type": "flash_attn3",
+    "cross_attn_1_type": "flash_attn3",
+    "cross_attn_2_type": "flash_attn3",
+    "seed": 42,
+    "sample_guide_scale": 5.0,
+    "sample_shift": 5.0,
+    "enable_cfg": true,
+    "cpu_offload": false,
+    "offload_granularity": "model",
+    "fps": 24,
+    "parallel": {
+        "seq_p_size": 4,
+        "seq_p_attn_type": "ulysses"
+    }
+}
--- a/lightx2v/models/networks/wan/infer/dist_infer/transformer_infer.py
+++ b/lightx2v/models/networks/wan/infer/dist_infer/transformer_infer.py
@@ -12,7 +12,7 @@ class WanTransformerDistInfer(WanTransformerInfer):
        self.seq_p_group = self.config["device_mesh"].get_group(mesh_dim="seq_p")
    def infer(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks=None):
-        x = self.dist_pre_process(x)
+        x, embed0 = self.dist_pre_process(x, embed0)
        x = super().infer(weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks)
        x = self.dist_post_process(x)
        return x
@@ -24,7 +24,7 @@ class WanTransformerDistInfer(WanTransformerInfer):
            freqs_i = self.compute_freqs_dist(q.size(0), q.size(2) // 2, grid_sizes, freqs)
        return freqs_i
-    def dist_pre_process(self, x):
+    def dist_pre_process(self, x, embed0):
        world_size = dist.get_world_size(self.seq_p_group)
        cur_rank = dist.get_rank(self.seq_p_group)
@@ -35,7 +35,9 @@ class WanTransformerDistInfer(WanTransformerInfer):
            x = F.pad(x, (0, 0, 0, padding_size))  # (后维度填充, 前维度填充)
        x = torch.chunk(x, world_size, dim=0)[cur_rank]
-        return x
+        if self.config["model_cls"].startswith("wan2.2"):
+            embed0 = torch.chunk(embed0, world_size, dim=0)[cur_rank]
+        return x, embed0
    def dist_post_process(self, x):
        world_size = dist.get_world_size(self.seq_p_group)

--- a/lightx2v/models/networks/wan/infer/pre_infer.py
+++ b/lightx2v/models/networks/wan/infer/pre_infer.py
@@ -73,6 +73,10 @@ class WanPreInfer:
        x = x.flatten(2).transpose(1, 2).contiguous()
        seq_lens = torch.tensor(x.size(1), dtype=torch.long).cuda().unsqueeze(0)
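+        # wan2.2_moe expands t; we found that doing so or not makes little difference, and the expansion adds latency. For now we stay faithful to the original code; this may be removed later.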
+        if self.config["model_cls"] == "wan2.2_moe":
+            t = t.expand(seq_lens[0])
        embed = sinusoidal_embedding_1d(self.freq_dim, t.flatten())
        if self.enable_dynamic_cfg:
            s = torch.tensor([self.cfg_scale], dtype=torch.float32).to(x.device)

--- a/lightx2v/utils/set_config.py
+++ b/lightx2v/utils/set_config.py
@@ -25,6 +25,7 @@ def get_default_config():
        "seq_parallel": False,
        "cfg_parallel": False,
        "enable_cfg": False,
+        "use_image_encoder": True,
    }
    return default_config

--- a/scripts/dist_infer/run_wan22_ti2v_i2v_cfg.sh
+++ b/scripts/dist_infer/run_wan22_ti2v_i2v_cfg.sh
+#!/bin/bash
+# set path and first
+lightx2v_path=
+model_path=
+export CUDA_VISIBLE_DEVICES=0,1
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+torchrun --nproc_per_node=2 -m lightx2v.infer \
+--model_cls wan2.2 \
+--task t2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/dist_infer/wan22_ti2v_i2v_cfg.json \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_i2v_parallel_cfg.mp4
--- a/scripts/dist_infer/run_wan22_ti2v_i2v_cfg_ulysses.sh
+++ b/scripts/dist_infer/run_wan22_ti2v_i2v_cfg_ulysses.sh
+#!/bin/bash
+# set path and first
+lightx2v_path=
+model_path=
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+torchrun --nproc_per_node=8 -m lightx2v.infer \
+--model_cls wan2.2 \
+--task t2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/dist_infer/wan22_ti2v_i2v_cfg_ulysses.json \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_i2v_parallel_cfg_ulysses.mp4
--- a/scripts/dist_infer/run_wan22_ti2v_i2v_ulysses.sh
+++ b/scripts/dist_infer/run_wan22_ti2v_i2v_ulysses.sh
+#!/bin/bash
+# set path and first
+lightx2v_path=
+model_path=
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+torchrun --nproc_per_node=4 -m lightx2v.infer \
+--model_cls wan2.2 \
+--task t2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/dist_infer/wan22_ti2v_i2v_ulysses.json \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_i2v_ulysses.mp4
--- a/scripts/dist_infer/run_wan22_ti2v_t2v_cfg.sh
+++ b/scripts/dist_infer/run_wan22_ti2v_t2v_cfg.sh
+#!/bin/bash
+# set the paths first
+lightx2v_path=
+model_path=
+export CUDA_VISIBLE_DEVICES=0,1
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+torchrun --nproc_per_node=2 -m lightx2v.infer \
+--model_cls wan2.2 \
+--task t2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/dist_infer/wan22_ti2v_t2v_cfg.json \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_t2v_parallel_cfg.mp4
--- a/scripts/dist_infer/run_wan22_ti2v_t2v_cfg_ulysses.sh
+++ b/scripts/dist_infer/run_wan22_ti2v_t2v_cfg_ulysses.sh
+#!/bin/bash
+# set the paths first
+lightx2v_path=
+model_path=
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+torchrun --nproc_per_node=8 -m lightx2v.infer \
+--model_cls wan2.2 \
+--task t2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/dist_infer/wan22_ti2v_t2v_cfg_ulysses.json \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_t2v_parallel_cfg_ulysses.mp4
--- a/scripts/dist_infer/run_wan22_ti2v_t2v_ulysses.sh
+++ b/scripts/dist_infer/run_wan22_ti2v_t2v_ulysses.sh
+#!/bin/bash
+# set the paths first
+lightx2v_path=
+model_path=
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+# set environment variables
+source ${lightx2v_path}/scripts/base/base.sh
+torchrun --nproc_per_node=4 -m lightx2v.infer \
+--model_cls wan2.2 \
+--task t2v \
+--model_path $model_path \
+--config_json ${lightx2v_path}/configs/dist_infer/wan22_ti2v_t2v_ulysses.json \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走" \
+--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_t2v_parallel_ulysses.mp4