Commit ecb2107c authored by gushiqiao, committed by GitHub

Merge branch 'main' into dev_flf2v

parents d8d70a28 3d8cb02e
import json
import os

import torch

try:
    from diffusers import AutoencoderKLQwenImage
    from diffusers.image_processor import VaeImageProcessor
except ImportError:
    AutoencoderKLQwenImage = None
    VaeImageProcessor = None


class AutoencoderKLQwenImageVAE:
    def __init__(self, config):
        if AutoencoderKLQwenImage is None:
            raise ImportError("AutoencoderKLQwenImageVAE requires a diffusers version that provides AutoencoderKLQwenImage.")
        self.config = config
        self.model = (
            AutoencoderKLQwenImage.from_pretrained(os.path.join(config.model_path, "vae"))
            .to(torch.device("cuda"))
            .to(torch.bfloat16)
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=config.vae_scale_factor * 2)
        with open(os.path.join(config.model_path, "vae", "config.json"), "r") as f:
            vae_config = json.load(f)
        # "temperal_downsample" is the key as spelled in the released VAE config;
        # e.g. a config listing three downsample stages yields 2**3 = 8.
        self.vae_scale_factor = 2 ** len(vae_config["temperal_downsample"]) if "temperal_downsample" in vae_config else 8
        self.dtype = torch.bfloat16
    @staticmethod
    def _unpack_latents(latents, height, width, vae_scale_factor):
        batch_size, num_patches, channels = latents.shape

        # The VAE applies 8x compression on images, but we must also account
        # for packing, which requires latent height and width to be divisible by 2.
        height = 2 * (int(height) // (vae_scale_factor * 2))
        width = 2 * (int(width) // (vae_scale_factor * 2))

        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
        latents = latents.permute(0, 3, 1, 4, 2, 5)
        latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width)

        return latents
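
    # Worked shape example (illustrative numbers, not project defaults): with
    # height=1024, width=1024 and vae_scale_factor=8, packed latents of shape
    # (B, 4096, 64) unpack to (B, 16, 1, 128, 128) -- the 4096 patches form a
    # 64x64 grid of 2x2 latent blocks, and each block's 64 packed channels
    # fold back into 16 VAE channels.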
    @torch.no_grad()
    def decode(self, latents):
        width, height = self.config.aspect_ratios[self.config.aspect_ratio]
        latents = self._unpack_latents(latents, height, width, self.config.vae_scale_factor)
        latents = latents.to(self.dtype)

        # Undo the latent normalization. Note that latents_std holds the
        # reciprocal of the configured std, so the division below actually
        # multiplies by the std before adding the mean back.
        latents_mean = torch.tensor(self.config.vae_latents_mean).view(1, self.config.vae_z_dim, 1, 1, 1).to(latents.device, latents.dtype)
        latents_std = 1.0 / torch.tensor(self.config.vae_latents_std).view(1, self.config.vae_z_dim, 1, 1, 1).to(latents.device, latents.dtype)
        latents = latents / latents_std + latents_mean

        # Drop the singleton temporal axis before post-processing to PIL images.
        images = self.model.decode(latents, return_dict=False)[0][:, :, 0]
        images = self.image_processor.postprocess(images, output_type="pil")
        return images
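
# Minimal usage sketch for the wrapper above. The config object and its fields
# (model_path, aspect_ratio, aspect_ratios, vae_scale_factor, vae_latents_mean,
# vae_latents_std, vae_z_dim) are inferred from the attribute accesses in the
# class; the values are illustrative, not the project's defaults. Running this
# requires CUDA and the actual Qwen-Image VAE weights.
from types import SimpleNamespace

demo_config = SimpleNamespace(
    model_path="/path/to/Qwen-Image",        # hypothetical checkpoint directory
    vae_scale_factor=8,
    aspect_ratio="1:1",
    aspect_ratios={"1:1": (1024, 1024)},     # maps ratio name -> (width, height)
    vae_latents_mean=[0.0] * 16,             # placeholder normalization stats
    vae_latents_std=[1.0] * 16,
    vae_z_dim=16,
)

vae = AutoencoderKLQwenImageVAE(demo_config)
packed = torch.randn(1, 4096, 64, device="cuda", dtype=torch.bfloat16)  # (B, patches, channels)
images = vae.decode(packed)                  # returns a list of PIL images
images[0].save("decoded.png")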
@@ -11,10 +11,10 @@ source ${lightx2v_path}/scripts/base/base.sh
 python -m lightx2v.infer \
     --model_cls wan2.1 \
-    --task t2v \
+    --task i2v \
     --model_path $model_path \
     --config_json ${lightx2v_path}/configs/caching/teacache/wan_i2v_tea_480p.json \
-    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
+    --prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --image_path ${lightx2v_path}/assets/inputs/imgs/img_0.jpg \
     --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_tea.mp4
#!/bin/bash
export CUDA_VISIBLE_DEVICES=
# set these paths first
export lightx2v_path=
export model_path=
# sanity checks
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
    echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}. Change it in this script or set the env variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi

if [ -z "${lightx2v_path}" ]; then
    echo "Error: lightx2v_path is not set. Please set this variable first."
    exit 1
fi

if [ -z "${model_path}" ]; then
    echo "Error: model_path is not set. Please set this variable first."
    exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
    --model_cls qwen_image \
    --task t2i \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i.json \
    --prompt 'A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition.' \
    --save_video_path ${lightx2v_path}/save_results/qwen_image_t2i.png