Commit b4496e64 authored by gushiqiao, committed by GitHub

Merge pull request #93 from ModelTC/dev_fix

Dev fix
parents 53eae786 a4b666ca
import torch
from .utils import compute_freqs, compute_freqs_dist, compute_freqs_audio, compute_freqs_audio_dist, apply_rotary_emb, apply_rotary_emb_chunk
from lightx2v.common.offload.manager import (
    WeightAsyncStreamManager,
    LazyWeightAsyncStreamManager,

@@ -26,6 +26,8 @@ class WanTransformerInfer(BaseTransformerInfer):
        else:
            self.apply_rotary_emb_func = apply_rotary_emb
        self.clean_cuda_cache = self.config.get("clean_cuda_cache", False)
        self.mask_map = None
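        # Note: `mask_map` is only passed through to the self-attention call below
        # (mask_map=self.mask_map); it stays None unless set elsewhere in code not shown here.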
        if self.config["cpu_offload"]:
            if "offload_ratio" in self.config:
                offload_ratio = self.config["offload_ratio"]

@@ -73,10 +75,10 @@ class WanTransformerInfer(BaseTransformerInfer):

        return cu_seqlens_q, cu_seqlens_k

    @torch.compile(disable=not CHECK_ENABLE_GRAPH_MODE())
    def infer(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks=None):
        return self.infer_func(weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks)

    def _infer_with_offload(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks=None):
        for block_idx in range(self.blocks_num):
            if block_idx == 0:
                self.weights_stream_mgr.active_weights[0] = weights.blocks[0]

@@ -138,7 +140,7 @@ class WanTransformerInfer(BaseTransformerInfer):

        return x

    def _infer_with_phases_offload(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks=None):
        for block_idx in range(weights.blocks_num):
            for phase_idx in range(self.phases_num):
                if block_idx == 0 and phase_idx == 0:

@@ -186,7 +188,7 @@ class WanTransformerInfer(BaseTransformerInfer):

        return x

    def _infer_with_phases_lazy_offload(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks=None):
        self.weights_stream_mgr.prefetch_weights_from_disk(weights)
        for block_idx in range(weights.blocks_num):

@@ -247,7 +249,22 @@ class WanTransformerInfer(BaseTransformerInfer):
        return x

    def zero_temporal_component_in_3DRoPE(self, valid_token_length, rotary_emb=None):
        if rotary_emb is None:
            return None
        self.use_real = False
        rope_t_dim = 44
        if self.use_real:
            freqs_cos, freqs_sin = rotary_emb
            freqs_cos[valid_token_length:, :, :rope_t_dim] = 0
            freqs_sin[valid_token_length:, :, :rope_t_dim] = 0
            return freqs_cos, freqs_sin
        else:
            freqs_cis = rotary_emb
            freqs_cis[valid_token_length:, :, : rope_t_dim // 2] = 0
            return freqs_cis
    def _infer_without_offload(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks=None):
        for block_idx in range(self.blocks_num):
            x = self.infer_block(
                weights.blocks[block_idx],

@@ -259,6 +276,12 @@ class WanTransformerInfer(BaseTransformerInfer):

                freqs,
                context,
            )
            if audio_dit_blocks is not None and len(audio_dit_blocks) > 0:
                for ipa_out in audio_dit_blocks:
                    if block_idx in ipa_out:
                        cur_modify = ipa_out[block_idx]
                        x = cur_modify["modify_func"](x, grid_sizes, **cur_modify["kwargs"])
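            # Illustrative sketch (not part of the original diff): each element of
            # `audio_dit_blocks` is expected to map a block index to a hook, e.g.
            #     {3: {"modify_func": audio_adapter_fn, "kwargs": {"audio_feat": feat}}}
            # so the hidden states of selected blocks are modified with audio features;
            # `audio_adapter_fn` and `audio_feat` are hypothetical names.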
        return x

    def infer_block(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context):

@@ -318,14 +341,23 @@ class WanTransformerInfer(BaseTransformerInfer):
        v = weights.self_attn_v.apply(norm1_out).view(s, n, d)

        if not self.parallel_attention:
            if self.config.get("audio_sr", False):
                freqs_i = compute_freqs_audio(q.size(2) // 2, grid_sizes, freqs)
            else:
                freqs_i = compute_freqs(q.size(2) // 2, grid_sizes, freqs)
        else:
            if self.config.get("audio_sr", False):
                freqs_i = compute_freqs_audio_dist(q.size(0), q.size(2) // 2, grid_sizes, freqs)
            else:
                freqs_i = compute_freqs_dist(q.size(0), q.size(2) // 2, grid_sizes, freqs)
        freqs_i = self.zero_temporal_component_in_3DRoPE(seq_lens, freqs_i)

        q = self.apply_rotary_emb_func(q, freqs_i)
        k = self.apply_rotary_emb_func(k, freqs_i)

        k_lens = torch.empty_like(seq_lens).fill_(freqs_i.size(0))
        cu_seqlens_q, cu_seqlens_k = self._calculate_q_k_len(q, k_lens=k_lens)
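        # Note: k_lens is now the full RoPE sequence length (freqs_i.size(0)) rather than
        # seq_lens, presumably so the key length covers the audio/reference-extended token
        # sequence; this reading is inferred from the diff, not stated by the authors.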
        if self.clean_cuda_cache:
            del freqs_i, norm1_out, norm1_weight, norm1_bias

@@ -341,6 +373,7 @@ class WanTransformerInfer(BaseTransformerInfer):

                max_seqlen_q=q.size(0),
                max_seqlen_kv=k.size(0),
                model_cls=self.config["model_cls"],
                mask_map=self.mask_map,
            )
        else:
            attn_out = self.parallel_attention(

@@ -406,7 +439,6 @@ class WanTransformerInfer(BaseTransformerInfer):

            q,
            k_lens=torch.tensor([k_img.size(0)], dtype=torch.int32, device=k.device),
        )
        img_attn_out = weights.cross_attn_2.apply(
            q=q,
            k=k_img,
...
import torch
import torch.distributed as dist
from loguru import logger

from lightx2v.utils.envs import *

@@ -19,6 +20,45 @@ def compute_freqs(c, grid_sizes, freqs):

    return freqs_i

def compute_freqs_audio(c, grid_sizes, freqs):
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0].tolist()
    f = f + 1  # for r2v, add 1 channel
    seq_len = f * h * w
    freqs_i = torch.cat(
        [
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
        ],
        dim=-1,
    ).reshape(seq_len, 1, -1)
    return freqs_i
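
# Note (sketch, not part of the original commit): compute_freqs_audio mirrors
# compute_freqs but extends the temporal axis by one frame, so for
# grid_sizes = [[f, h, w]] it returns a table of shape ((f + 1) * h * w, 1, c)
# instead of (f * h * w, 1, c), covering the extra reference/audio tokens.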

def compute_freqs_audio_dist(s, c, grid_sizes, freqs):
    world_size = dist.get_world_size()
    cur_rank = dist.get_rank()
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0].tolist()
    f = f + 1
    seq_len = f * h * w
    freqs_i = torch.cat(
        [
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
        ],
        dim=-1,
    ).reshape(seq_len, 1, -1)
    freqs_i = pad_freqs(freqs_i, s * world_size)
    s_per_rank = s
    freqs_i_rank = freqs_i[(cur_rank * s_per_rank) : ((cur_rank + 1) * s_per_rank), :, :]
    return freqs_i_rank
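
# Note (sketch, not part of the original commit): the distributed variant pads the
# audio-extended RoPE table to s * world_size rows and hands each rank its contiguous
# slice of s rows, matching how tokens are sharded across ranks for sequence-parallel
# attention; s is the per-rank token count (q.size(0) at the call site).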

def compute_freqs_causvid(c, grid_sizes, freqs, start_frame=0):
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0].tolist()
...