Unverified commit 52ecf060, authored by Watebear and committed by GitHub

[feat]: support qwen-image-edit-2509 (#401)

[feat]: support qwen-image-edit-2509
parent 9fcb2cf8
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 40,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"cpu_offload": true,
"offload_granularity": "block",
"mm_config": {},
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true
}
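The two fields at the bottom are the 2509-specific additions: CONDITION_IMAGE_SIZE is the target pixel area for the images fed to the Qwen2.5-VL processor, and USE_IMAGE_ID_IN_PROMPT switches on the numbered "Picture N:" prompt prefix built in the text encoder below. A minimal, hedged sketch of reading them (the config path is illustrative; the get() defaults mirror the text-encoder code further down):

import json

# Hedged sketch: read the 2509-specific fields with the same defaults the
# text encoder below falls back to. The config path is illustrative.
with open("configs/qwen_image/qwen_image_i2i_2509.json") as f:
    cfg = json.load(f)

condition_area = cfg.get("CONDITION_IMAGE_SIZE", 384 * 384)   # 147456 == 384 * 384 here
use_image_id = cfg.get("USE_IMAGE_ID_IN_PROMPT", True)        # true for the 2509 configs
print(condition_area, use_image_id)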
@@ -44,7 +44,7 @@
 "vae_z_dim": 16,
 "feature_caching": "NoCaching",
 "transformer_in_channels": 64,
-"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
+"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
 "prompt_template_encode_start_idx": 64,
 "_auto_resize": true,
 "num_layers": 60,
@@ -61,5 +61,7 @@
 "true_cfg_scale": 4.0,
 "cpu_offload": true,
 "offload_granularity": "block",
-"mm_config": {}
+"mm_config": {},
+"CONDITION_IMAGE_SIZE": 1048576,
+"USE_IMAGE_ID_IN_PROMPT": false
 }
@@ -44,7 +44,7 @@
 "vae_z_dim": 16,
 "feature_caching": "NoCaching",
 "transformer_in_channels": 64,
-"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
+"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
 "prompt_template_encode_start_idx": 64,
 "_auto_resize": true,
 "num_layers": 60,
@@ -59,5 +59,7 @@
 "attn_type": "flash_attn3",
 "do_true_cfg": true,
 "true_cfg_scale": 4.0,
-"mm_config": {}
+"mm_config": {},
+"CONDITION_IMAGE_SIZE": 1048576,
+"USE_IMAGE_ID_IN_PROMPT": false
 }
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 40,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"mm_config": {},
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true
}
@@ -40,7 +40,7 @@ def calculate_dimensions(target_area, ratio):
     width = round(width / 32) * 32
     height = round(height / 32) * 32
-    return width, height, None
+    return width, height
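The helper now returns just (width, height). For reference, a hedged reconstruction of the full function, consistent with the rounding shown above (everything before the rounding is an assumption): it picks the width/height with the requested aspect ratio whose area is approximately target_area, then snaps both to multiples of 32.

import math

# Hedged reconstruction of calculate_dimensions, matching the rounding and
# the new two-value return seen in the hunk above.
def calculate_dimensions(target_area, ratio):
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    width = round(width / 32) * 32
    height = round(height / 32) * 32
    return width, height

# Condition images for 2509 target 384 * 384 = 147456 pixels:
print(calculate_dimensions(147456, 16 / 9))    # (512, 288)
# VAE images still target 1024 * 1024 pixels:
print(calculate_dimensions(1024 * 1024, 1.0))  # (1024, 1024)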
 class Qwen25_VLForConditionalGeneration_TextEncoder:
@@ -49,6 +49,13 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
         self.tokenizer_max_length = 1024
         self.prompt_template_encode = config["prompt_template_encode"]
         self.prompt_template_encode_start_idx = config["prompt_template_encode_start_idx"]
+        """
+        for Qwen-Image-Edit model, CONDITION_IMAGE_SIZE = 1024 * 1024
+        for Qwen-Image-Edit-2509 model, CONDITION_IMAGE_SIZE = 384 * 384
+        """
+        self.CONDITION_IMAGE_SIZE = config.get("CONDITION_IMAGE_SIZE", 384 * 384)
+        self.USE_IMAGE_ID_IN_PROMPT = config.get("USE_IMAGE_ID_IN_PROMPT", True)
+        self.VAE_IMAGE_SIZE = 1024 * 1024
         self.cpu_offload = config.get("cpu_offload", False)
         if self.cpu_offload:
@@ -77,40 +84,50 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
         return split_result

     def preprocess_image(self, image):
-        image_size = image.size
-        width, height = image_size
-        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height)
-        height = height or calculated_height
-        width = width or calculated_width
-        multiple_of = self.config["vae_scale_factor"] * 2
-        width = width // multiple_of * multiple_of
-        height = height // multiple_of * multiple_of
-        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
-            image = self.image_processor.resize(image, calculated_height, calculated_width)
-            prompt_image = image
-            image = self.image_processor.preprocess(image, calculated_height, calculated_width)
-            image = image.unsqueeze(2)
-        return prompt_image, image, (calculated_height, calculated_width)
+        image_width, image_height = image.size
+        condition_width, condition_height = calculate_dimensions(self.CONDITION_IMAGE_SIZE, image_width / image_height)
+        vae_width, vae_height = calculate_dimensions(self.VAE_IMAGE_SIZE, image_width / image_height)
+        condition_image = self.image_processor.resize(image, condition_height, condition_width)
+        vae_image = self.image_processor.preprocess(image, vae_height, vae_width).unsqueeze(2)
+        return condition_image, vae_image, (condition_height, condition_width), (vae_height, vae_width)

     @torch.no_grad()
-    def infer(self, text, image=None):
+    def infer(self, text, image_list=None):
         if self.cpu_offload:
             self.text_encoder.to(torch.device("cuda"))
-        template = self.prompt_template_encode
-        drop_idx = self.prompt_template_encode_start_idx
-        txt = [template.format(e) for e in text]
-        if image is not None:
-            prompt_image, image, image_info = self.preprocess_image(image)
+        if image_list is not None:
+            condition_image_list = []
+            vae_image_list = []
+            condition_image_info_list = []
+            vae_image_info_list = []
+            if self.USE_IMAGE_ID_IN_PROMPT:
+                base_img_prompt = ""
+                img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
+                for i, image in enumerate(image_list):
+                    base_img_prompt += img_prompt_template.format(i + 1)
+                    condition_image, vae_image, condition_image_info, vae_image_info = self.preprocess_image(image)
+                    condition_image_list.append(condition_image)
+                    vae_image_list.append(vae_image)
+                    condition_image_info_list.append(condition_image_info)
+                    vae_image_info_list.append(vae_image_info)
+            else:
+                base_img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+                for i, image in enumerate(image_list):
+                    condition_image, vae_image, condition_image_info, vae_image_info = self.preprocess_image(image)
+                    condition_image_list.append(condition_image)
+                    vae_image_list.append(vae_image)
+                    condition_image_info_list.append(condition_image_info)
+                    vae_image_info_list.append(vae_image_info)
+            template = self.prompt_template_encode
+            drop_idx = self.prompt_template_encode_start_idx
+            txt = [template.format(base_img_prompt + e) for e in text]
             model_inputs = self.processor(
                 text=txt,
-                images=prompt_image,
+                images=condition_image_list,
                 padding=True,
                 return_tensors="pt",
             ).to(torch.device("cuda"))
@@ -122,8 +139,20 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
                 image_grid_thw=model_inputs.image_grid_thw,
                 output_hidden_states=True,
             )
+            image_info = {
+                "condition_image_list": condition_image_list,
+                "vae_image_list": vae_image_list,
+                "condition_image_info_list": condition_image_info_list,
+                "vae_image_info_list": vae_image_info_list,
+            }
         else:
-            prompt_image, image, image_info = None, None, None
+            template = self.prompt_template_encode
+            drop_idx = self.prompt_template_encode_start_idx
+            txt = [template.format(e) for e in text]
+            image_info = {}
             model_inputs = self.tokenizer(txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt").to(torch.device("cuda"))
             encoder_hidden_states = self.text_encoder(
                 input_ids=model_inputs.input_ids,
@@ -154,4 +183,4 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
         torch.cuda.empty_cache()
         gc.collect()
-        return prompt_embeds, prompt_embeds_mask, image, image_info
+        return prompt_embeds, prompt_embeds_mask, image_info
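The main change in infer() is how the vision placeholder is built before being formatted into the {} slot of prompt_template_encode together with the user text, which is also why the hard-coded <|vision_start|><|image_pad|><|vision_end|> was dropped from the config templates above. A standalone, hedged sketch of just that branch (the helper name is hypothetical; the token strings are the ones used above):

def build_image_prompt_prefix(num_images: int, use_image_id: bool) -> str:
    """Return the vision-token prefix prepended to the user prompt."""
    if use_image_id:
        # Qwen-Image-Edit-2509 (USE_IMAGE_ID_IN_PROMPT = true): one numbered
        # "Picture N:" placeholder per conditioning image.
        return "".join(
            "Picture {}: <|vision_start|><|image_pad|><|vision_end|>".format(i + 1)
            for i in range(num_images)
        )
    # Original Qwen-Image-Edit (USE_IMAGE_ID_IN_PROMPT = false): a single,
    # unnumbered placeholder, matching the old hard-coded template.
    return "<|vision_start|><|image_pad|><|vision_end|>"

# Two input images with the 2509 behaviour:
# "Picture 1: <|vision_start|><|image_pad|><|vision_end|>Picture 2: <|vision_start|><|image_pad|><|vision_end|>"
print(build_image_prompt_prefix(2, True))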
@@ -194,7 +194,7 @@ class QwenImageTransformerModel:
         t = self.scheduler.timesteps[self.scheduler.step_index]
         latents = self.scheduler.latents
         if self.config["task"] == "i2i":
-            image_latents = inputs["image_encoder_output"]["image_latents"]
+            image_latents = torch.cat([item["image_latents"] for item in inputs["image_encoder_output"]], dim=1)
             latents_input = torch.cat([latents, image_latents], dim=1)
         else:
             latents_input = latents
...
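Because image_encoder_output is now a list with one entry per conditioning image (see the runner change below), the packed latents of all images are concatenated along the token axis before being appended to the noise latents. A toy sketch with illustrative shapes (channel count taken from transformer_in_channels above):

import torch

# Toy shapes only: packed latents are (batch, tokens, channels) with
# channels = transformer_in_channels = 64; a 1024x1024 image gives a
# 128x128 latent grid, packed 2x2 into 64 * 64 = 4096 tokens.
batch, channels, tokens = 1, 64, 64 * 64

latents = torch.randn(batch, tokens, channels)          # noise latents for the output image
image_encoder_output = [                                # one dict per conditioning image
    {"image_latents": torch.randn(batch, tokens, channels)},
    {"image_latents": torch.randn(batch, tokens, channels)},
]

image_latents = torch.cat([item["image_latents"] for item in image_encoder_output], dim=1)
latents_input = torch.cat([latents, image_latents], dim=1)
print(latents_input.shape)  # torch.Size([1, 12288, 64])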
@@ -2,6 +2,8 @@ import gc
 import math
 import torch
+import torchvision.transforms.functional as TF
+from PIL import Image
 from loguru import logger
 from lightx2v.models.input_encoders.hf.qwen25.qwen25_vlforconditionalgeneration import Qwen25_VLForConditionalGeneration_TextEncoder
@@ -90,41 +92,61 @@ class QwenImageRunner(DefaultRunner):
             "image_encoder_output": None,
         }

+    def read_image_input(self, img_path):
+        if isinstance(img_path, Image.Image):
+            img_ori = img_path
+        else:
+            img_ori = Image.open(img_path).convert("RGB")
+        if GET_RECORDER_MODE():
+            width, height = img_ori.size
+            monitor_cli.lightx2v_input_image_len.observe(width * height)
+        img = TF.to_tensor(img_ori).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
+        self.input_info.original_size.append(img_ori.size)
+        return img, img_ori
+
     @ProfilingContext4DebugL2("Run Encoders")
     def _run_input_encoder_local_i2i(self):
-        _, image = self.read_image_input(self.input_info.image_path)
+        image_paths_list = self.input_info.image_path.split(",")
+        images_list = []
+        for image_path in image_paths_list:
+            _, image = self.read_image_input(image_path)
+            images_list.append(image)
         prompt = self.input_info.prompt
-        text_encoder_output = self.run_text_encoder(prompt, image, neg_prompt=self.input_info.negative_prompt)
-        image_encoder_output = self.run_vae_encoder(image=text_encoder_output["preprocessed_image"])
-        image_encoder_output["image_info"] = text_encoder_output["image_info"]
+        text_encoder_output = self.run_text_encoder(prompt, images_list, neg_prompt=self.input_info.negative_prompt)
+        image_encoder_output_list = []
+        for vae_image in text_encoder_output["image_info"]["vae_image_list"]:
+            image_encoder_output = self.run_vae_encoder(image=vae_image)
+            image_encoder_output_list.append(image_encoder_output)
        torch.cuda.empty_cache()
        gc.collect()
        return {
            "text_encoder_output": text_encoder_output,
-            "image_encoder_output": image_encoder_output,
+            "image_encoder_output": image_encoder_output_list,
        }
     @ProfilingContext4DebugL1("Run Text Encoder", recorder_mode=GET_RECORDER_MODE(), metrics_func=monitor_cli.lightx2v_run_text_encode_duration, metrics_labels=["QwenImageRunner"])
-    def run_text_encoder(self, text, image=None, neg_prompt=None):
+    def run_text_encoder(self, text, image_list=None, neg_prompt=None):
         if GET_RECORDER_MODE():
             monitor_cli.lightx2v_input_prompt_len.observe(len(text))
         text_encoder_output = {}
         if self.config["task"] == "t2i":
-            prompt_embeds, prompt_embeds_mask, _, _ = self.text_encoders[0].infer([text])
+            prompt_embeds, prompt_embeds_mask, _ = self.text_encoders[0].infer([text])
             text_encoder_output["prompt_embeds"] = prompt_embeds
             text_encoder_output["prompt_embeds_mask"] = prompt_embeds_mask
             if self.config["do_true_cfg"] and neg_prompt is not None:
-                neg_prompt_embeds, neg_prompt_embeds_mask, _, _ = self.text_encoders[0].infer([neg_prompt])
+                neg_prompt_embeds, neg_prompt_embeds_mask, _ = self.text_encoders[0].infer([neg_prompt])
                 text_encoder_output["negative_prompt_embeds"] = neg_prompt_embeds
                 text_encoder_output["negative_prompt_embeds_mask"] = neg_prompt_embeds_mask
         elif self.config["task"] == "i2i":
-            prompt_embeds, prompt_embeds_mask, preprocessed_image, image_info = self.text_encoders[0].infer([text], image)
+            prompt_embeds, prompt_embeds_mask, image_info = self.text_encoders[0].infer([text], image_list)
             text_encoder_output["prompt_embeds"] = prompt_embeds
             text_encoder_output["prompt_embeds_mask"] = prompt_embeds_mask
-            text_encoder_output["preprocessed_image"] = preprocessed_image
             text_encoder_output["image_info"] = image_info
             if self.config["do_true_cfg"] and neg_prompt is not None:
-                neg_prompt_embeds, neg_prompt_embeds_mask, _, _ = self.text_encoders[0].infer([neg_prompt], image)
+                neg_prompt_embeds, neg_prompt_embeds_mask, _ = self.text_encoders[0].infer([neg_prompt], image_list)
                 text_encoder_output["negative_prompt_embeds"] = neg_prompt_embeds
                 text_encoder_output["negative_prompt_embeds_mask"] = neg_prompt_embeds_mask
         return text_encoder_output
@@ -158,7 +180,7 @@ class QwenImageRunner(DefaultRunner):
         if not self.config["_auto_resize"]:
             width, height = self.config["aspect_ratios"][self.config["aspect_ratio"]]
         else:
-            width, height = self.input_info.original_size
+            width, height = self.input_info.original_size[-1]
         calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height)
         multiple_of = self.vae.vae_scale_factor * 2
         width = calculated_width // multiple_of * multiple_of
@@ -178,13 +200,10 @@ class QwenImageRunner(DefaultRunner):
             width, height = self.config["aspect_ratios"][self.config["aspect_ratio"]]
             img_shapes = [(1, height // self.config["vae_scale_factor"] // 2, width // self.config["vae_scale_factor"] // 2)] * self.config["batchsize"]
         elif self.config["task"] == "i2i":
-            image_height, image_width = self.inputs["image_encoder_output"]["image_info"]
-            img_shapes = [
-                [
-                    (1, self.input_info.auto_hight // self.config["vae_scale_factor"] // 2, self.input_info.auto_width // self.config["vae_scale_factor"] // 2),
-                    (1, image_height // self.config["vae_scale_factor"] // 2, image_width // self.config["vae_scale_factor"] // 2),
-                ]
-            ]
+            img_shapes = [[(1, self.input_info.auto_hight // self.config["vae_scale_factor"] // 2, self.input_info.auto_width // self.config["vae_scale_factor"] // 2)]]
+            for image_height, image_width in self.inputs["text_encoder_output"]["image_info"]["vae_image_info_list"]:
+                img_shapes[0].append((1, image_height // self.config["vae_scale_factor"] // 2, image_width // self.config["vae_scale_factor"] // 2))
         self.inputs["img_shapes"] = img_shapes

     def init_scheduler(self):
...
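Each entry of img_shapes is a (frames, latent_height, latent_width) tuple, where the latent grid is the pixel size divided by vae_scale_factor and then by the 2x2 packing factor; the first entry is the output image, followed by one entry per VAE-encoded input image. A hedged numeric sketch (sizes are illustrative, names mirror the config fields above):

# Illustrative numbers only.
vae_scale_factor = 8
output_hw = (1024, 1024)                      # auto-resized target image
vae_image_hw = [(1024, 1024), (672, 1568)]    # per-image sizes from vae_image_info_list

img_shapes = [[(1, output_hw[0] // vae_scale_factor // 2, output_hw[1] // vae_scale_factor // 2)]]
for h, w in vae_image_hw:
    img_shapes[0].append((1, h // vae_scale_factor // 2, w // vae_scale_factor // 2))

print(img_shapes)  # [[(1, 64, 64), (1, 64, 64), (1, 42, 98)]]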
@@ -111,6 +111,7 @@ class AutoencoderKLQwenImageVAE:
         if self.cpu_offload:
             self.model.to(torch.device("cuda"))
         num_channels_latents = self.config["transformer_in_channels"] // 4
         image = image.to(self.model.device).to(self.dtype)
@@ -129,6 +130,7 @@ class AutoencoderKLQwenImageVAE:
         image_latent_height, image_latent_width = image_latents.shape[3:]
         image_latents = self._pack_latents(image_latents, self.config["batchsize"], num_channels_latents, image_latent_height, image_latent_width)
         if self.cpu_offload:
             self.model.to(torch.device("cpu"))
             torch.cuda.empty_cache()
...
@@ -123,6 +123,7 @@ class I2IInputInfo:
     # shape related
     target_shape: int = field(default_factory=int)
     processed_image_size: int = field(default_factory=list)
+    original_size: list = field(default_factory=list)

 def set_input_info(args):
...
@@ -36,6 +36,6 @@ python -m lightx2v.infer \
 --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_i2i.json \
 --prompt "turn the style of the photo to vintage comic book" \
 --negative_prompt " " \
---image_path /data/nvme2/wushuo/qwen-image/pie.png \
+--image_path pie.png \
 --save_result_path ${lightx2v_path}/save_results/qwen_image_i2i.png \
 --seed 0
#!/bin/bash
export CUDA_VISIBLE_DEVICES=
# set paths first
export lightx2v_path=
export model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export PROFILING_DEBUG_LEVEL=2
python -m lightx2v.infer \
--model_cls qwen_image \
--task i2i \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/qwen_image/qwen_image_i2i_2509.json \
--prompt "Have the two characters swap clothes and stand in front of the castle." \
--negative_prompt " " \
--image_path 1.jpeg,2.jpeg \
--save_result_path ${lightx2v_path}/save_results/qwen_image_i2i_2509.png \
--seed 0
#!/bin/bash
export CUDA_VISIBLE_DEVICES=
# set paths first
export lightx2v_path=
export model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export PROFILING_DEBUG_LEVEL=2
python -m lightx2v.infer \
--model_cls qwen_image \
--task i2i \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/offload/block/qwen_image_i2i_2509_block.json \
--prompt "Have the two characters swap clothes and stand in front of the castle." \
--negative_prompt " " \
--image_path 1.jpeg,2.jpeg \
--save_result_path ${lightx2v_path}/save_results/qwen_image_i2i_2509.png \
--seed 0
@@ -34,7 +34,7 @@ python -m lightx2v.infer \
 --model_cls qwen_image \
 --task i2i \
 --model_path $model_path \
---config_json ${lightx2v_path}/configs/qwen_image/qwen_image_i2i.json \
+--config_json ${lightx2v_path}/configs/offload/block/qwen_image_i2i_block.json \
 --prompt "turn the style of the photo to vintage comic book" \
 --negative_prompt " " \
 --image_path pie.png \
...
@@ -33,7 +33,7 @@ python -m lightx2v.infer \
 --model_cls qwen_image \
 --task t2i \
 --model_path $model_path \
---config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i.json \
+--config_json ${lightx2v_path}/configs/offload/block/qwen_image_t2i_block.json \
 --prompt 'A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition, Ultra HD, 4K, cinematic composition.' \
 --negative_prompt " " \
 --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i.png \
...