Unverified commit 52ecf060, authored by Watebear and committed by GitHub

[feat]: support qwen-image-edit-2509 (#401)

[feat]: support qwen-image-edit-2509
parent 9fcb2cf8
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 40,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"cpu_offload": true,
"offload_granularity": "block",
"mm_config": {},
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true
}
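The two fields at the bottom are the 2509-specific additions: CONDITION_IMAGE_SIZE is the target pixel area for the images fed to the Qwen2.5-VL processor, and USE_IMAGE_ID_IN_PROMPT switches on the numbered "Picture N:" prompt prefix built in the text encoder below. A minimal, hedged sketch of reading them (the config path is illustrative; the get() defaults mirror the text-encoder code further down):

import json

# Hedged sketch: read the 2509-specific fields with the same defaults the
# text encoder below falls back to. The config path is illustrative.
with open("configs/qwen_image/qwen_image_i2i_2509.json") as f:
    cfg = json.load(f)

condition_area = cfg.get("CONDITION_IMAGE_SIZE", 384 * 384)   # 147456 == 384 * 384 here
use_image_id = cfg.get("USE_IMAGE_ID_IN_PROMPT", True)        # true for the 2509 configs
print(condition_area, use_image_id)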
@@ -44,7 +44,7 @@
 "vae_z_dim": 16,
 "feature_caching": "NoCaching",
 "transformer_in_channels": 64,
-"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
+"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
 "prompt_template_encode_start_idx": 64,
 "_auto_resize": true,
 "num_layers": 60,
@@ -61,5 +61,7 @@
 "true_cfg_scale": 4.0,
 "cpu_offload": true,
 "offload_granularity": "block",
-"mm_config": {}
+"mm_config": {},
+"CONDITION_IMAGE_SIZE": 1048576,
+"USE_IMAGE_ID_IN_PROMPT": false
 }
@@ -44,7 +44,7 @@
 "vae_z_dim": 16,
 "feature_caching": "NoCaching",
 "transformer_in_channels": 64,
-"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
+"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
 "prompt_template_encode_start_idx": 64,
 "_auto_resize": true,
 "num_layers": 60,
@@ -59,5 +59,7 @@
 "attn_type": "flash_attn3",
 "do_true_cfg": true,
 "true_cfg_scale": 4.0,
-"mm_config": {}
+"mm_config": {},
+"CONDITION_IMAGE_SIZE": 1048576,
+"USE_IMAGE_ID_IN_PROMPT": false
 }
{
"batchsize": 1,
"num_channels_latents": 16,
"vae_scale_factor": 8,
"infer_steps": 40,
"guidance_embeds": false,
"num_images_per_prompt": 1,
"vae_latents_mean": [
-0.7571,
-0.7089,
-0.9113,
0.1075,
-0.1745,
0.9653,
-0.1517,
1.5508,
0.4134,
-0.0715,
0.5517,
-0.3632,
-0.1922,
-0.9497,
0.2503,
-0.2921
],
"vae_latents_std": [
2.8184,
1.4541,
2.3275,
2.6558,
1.2196,
1.7708,
2.6052,
2.0743,
3.2687,
2.1526,
2.8652,
1.5579,
1.6382,
1.1253,
2.8251,
1.916
],
"vae_z_dim": 16,
"feature_caching": "NoCaching",
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"_auto_resize": true,
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"axes_dims_rope": [
16,
56,
56
],
"_comment_attn": "in [torch_sdpa, flash_attn3, sage_attn2]",
"attn_type": "flash_attn3",
"do_true_cfg": true,
"true_cfg_scale": 4.0,
"mm_config": {},
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true
}
@@ -40,7 +40,7 @@ def calculate_dimensions(target_area, ratio):
     width = round(width / 32) * 32
     height = round(height / 32) * 32
-    return width, height, None
+    return width, height
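The helper now returns just (width, height). For reference, a hedged reconstruction of the full function, consistent with the rounding shown above (everything before the rounding is an assumption): it picks the width/height with the requested aspect ratio whose area is approximately target_area, then snaps both to multiples of 32.

import math

# Hedged reconstruction of calculate_dimensions, matching the rounding and
# the new two-value return seen in the hunk above.
def calculate_dimensions(target_area, ratio):
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    width = round(width / 32) * 32
    height = round(height / 32) * 32
    return width, height

# Condition images for 2509 target 384 * 384 = 147456 pixels:
print(calculate_dimensions(147456, 16 / 9))    # (512, 288)
# VAE images still target 1024 * 1024 pixels:
print(calculate_dimensions(1024 * 1024, 1.0))  # (1024, 1024)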
 class Qwen25_VLForConditionalGeneration_TextEncoder:
@@ -49,6 +49,13 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
         self.tokenizer_max_length = 1024
         self.prompt_template_encode = config["prompt_template_encode"]
         self.prompt_template_encode_start_idx = config["prompt_template_encode_start_idx"]
+        """
+        for Qwen-Image-Edit model, CONDITION_IMAGE_SIZE = 1024 * 1024
+        for Qwen-Image-Edit-2509 model, CONDITION_IMAGE_SIZE = 384 * 384
+        """
+        self.CONDITION_IMAGE_SIZE = config.get("CONDITION_IMAGE_SIZE", 384 * 384)
+        self.USE_IMAGE_ID_IN_PROMPT = config.get("USE_IMAGE_ID_IN_PROMPT", True)
+        self.VAE_IMAGE_SIZE = 1024 * 1024
         self.cpu_offload = config.get("cpu_offload", False)
         if self.cpu_offload:
@@ -77,40 +84,50 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
         return split_result

     def preprocess_image(self, image):
-        image_size = image.size
-        width, height = image_size
-        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height)
-        height = height or calculated_height
-        width = width or calculated_width
-        multiple_of = self.config["vae_scale_factor"] * 2
-        width = width // multiple_of * multiple_of
-        height = height // multiple_of * multiple_of
-        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
-            image = self.image_processor.resize(image, calculated_height, calculated_width)
-            prompt_image = image
-            image = self.image_processor.preprocess(image, calculated_height, calculated_width)
-            image = image.unsqueeze(2)
-        return prompt_image, image, (calculated_height, calculated_width)
+        image_width, image_height = image.size
+        condition_width, condition_height = calculate_dimensions(self.CONDITION_IMAGE_SIZE, image_width / image_height)
+        vae_width, vae_height = calculate_dimensions(self.VAE_IMAGE_SIZE, image_width / image_height)
+        condition_image = self.image_processor.resize(image, condition_height, condition_width)
+        vae_image = self.image_processor.preprocess(image, vae_height, vae_width).unsqueeze(2)
+        return condition_image, vae_image, (condition_height, condition_width), (vae_height, vae_width)

     @torch.no_grad()
-    def infer(self, text, image=None):
+    def infer(self, text, image_list=None):
         if self.cpu_offload:
             self.text_encoder.to(torch.device("cuda"))
-        template = self.prompt_template_encode
-        drop_idx = self.prompt_template_encode_start_idx
-        txt = [template.format(e) for e in text]
-        if image is not None:
-            prompt_image, image, image_info = self.preprocess_image(image)
+        if image_list is not None:
+            condition_image_list = []
+            vae_image_list = []
+            condition_image_info_list = []
+            vae_image_info_list = []
+            if self.USE_IMAGE_ID_IN_PROMPT:
+                base_img_prompt = ""
+                img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
+                for i, image in enumerate(image_list):
+                    base_img_prompt += img_prompt_template.format(i + 1)
+                    condition_image, vae_image, condition_image_info, vae_image_info = self.preprocess_image(image)
+                    condition_image_list.append(condition_image)
+                    vae_image_list.append(vae_image)
+                    condition_image_info_list.append(condition_image_info)
+                    vae_image_info_list.append(vae_image_info)
+            else:
+                base_img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+                for i, image in enumerate(image_list):
+                    condition_image, vae_image, condition_image_info, vae_image_info = self.preprocess_image(image)
+                    condition_image_list.append(condition_image)
+                    vae_image_list.append(vae_image)
+                    condition_image_info_list.append(condition_image_info)
+                    vae_image_info_list.append(vae_image_info)
+            template = self.prompt_template_encode
+            drop_idx = self.prompt_template_encode_start_idx
+            txt = [template.format(base_img_prompt + e) for e in text]
             model_inputs = self.processor(
                 text=txt,
-                images=prompt_image,
+                images=condition_image_list,
                 padding=True,
                 return_tensors="pt",
             ).to(torch.device("cuda"))
@@ -122,8 +139,20 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
                 image_grid_thw=model_inputs.image_grid_thw,
                 output_hidden_states=True,
             )
+            image_info = {
+                "condition_image_list": condition_image_list,
+                "vae_image_list": vae_image_list,
+                "condition_image_info_list": condition_image_info_list,
+                "vae_image_info_list": vae_image_info_list,
+            }
         else:
-            prompt_image, image, image_info = None, None, None
+            template = self.prompt_template_encode
+            drop_idx = self.prompt_template_encode_start_idx
+            txt = [template.format(e) for e in text]
+            image_info = {}
             model_inputs = self.tokenizer(txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt").to(torch.device("cuda"))
             encoder_hidden_states = self.text_encoder(
                 input_ids=model_inputs.input_ids,
@@ -154,4 +183,4 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
         torch.cuda.empty_cache()
         gc.collect()
-        return prompt_embeds, prompt_embeds_mask, image, image_info
+        return prompt_embeds, prompt_embeds_mask, image_info
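The main change in infer() is how the vision placeholder is built before being formatted into the {} slot of prompt_template_encode together with the user text, which is also why the hard-coded <|vision_start|><|image_pad|><|vision_end|> was dropped from the config templates above. A standalone, hedged sketch of just that branch (the helper name is hypothetical; the token strings are the ones used above):

def build_image_prompt_prefix(num_images: int, use_image_id: bool) -> str:
    """Return the vision-token prefix prepended to the user prompt."""
    if use_image_id:
        # Qwen-Image-Edit-2509 (USE_IMAGE_ID_IN_PROMPT = true): one numbered
        # "Picture N:" placeholder per conditioning image.
        return "".join(
            "Picture {}: <|vision_start|><|image_pad|><|vision_end|>".format(i + 1)
            for i in range(num_images)
        )
    # Original Qwen-Image-Edit (USE_IMAGE_ID_IN_PROMPT = false): a single,
    # unnumbered placeholder, matching the old hard-coded template.
    return "<|vision_start|><|image_pad|><|vision_end|>"

# Two input images with the 2509 behaviour:
# "Picture 1: <|vision_start|><|image_pad|><|vision_end|>Picture 2: <|vision_start|><|image_pad|><|vision_end|>"
print(build_image_prompt_prefix(2, True))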
@@ -194,7 +194,7 @@ class QwenImageTransformerModel:
         t = self.scheduler.timesteps[self.scheduler.step_index]
         latents = self.scheduler.latents
         if self.config["task"] == "i2i":
-            image_latents = inputs["image_encoder_output"]["image_latents"]
+            image_latents = torch.cat([item["image_latents"] for item in inputs["image_encoder_output"]], dim=1)
             latents_input = torch.cat([latents, image_latents], dim=1)
         else:
             latents_input = latents
...
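Because image_encoder_output is now a list with one entry per conditioning image (see the runner change below), the packed latents of all images are concatenated along the token axis before being appended to the noise latents. A toy sketch with illustrative shapes (channel count taken from transformer_in_channels above):

import torch

# Toy shapes only: packed latents are (batch, tokens, channels) with
# channels = transformer_in_channels = 64; a 1024x1024 image gives a
# 128x128 latent grid, packed 2x2 into 64 * 64 = 4096 tokens.
batch, channels, tokens = 1, 64, 64 * 64

latents = torch.randn(batch, tokens, channels)          # noise latents for the output image
image_encoder_output = [                                # one dict per conditioning image
    {"image_latents": torch.randn(batch, tokens, channels)},
    {"image_latents": torch.randn(batch, tokens, channels)},
]

image_latents = torch.cat([item["image_latents"] for item in image_encoder_output], dim=1)
latents_input = torch.cat([latents, image_latents], dim=1)
print(latents_input.shape)  # torch.Size([1, 12288, 64])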
@@ -2,6 +2,8 @@ import gc
 import math
 import torch
+import torchvision.transforms.functional as TF
+from PIL import Image
 from loguru import logger
 from lightx2v.models.input_encoders.hf.qwen25.qwen25_vlforconditionalgeneration import Qwen25_VLForConditionalGeneration_TextEncoder
@@ -90,41 +92,61 @@ class QwenImageRunner(DefaultRunner):
             "image_encoder_output": None,
         }

+    def read_image_input(self, img_path):
+        if isinstance(img_path, Image.Image):
+            img_ori = img_path
+        else:
+            img_ori = Image.open(img_path).convert("RGB")
+        if GET_RECORDER_MODE():
+            width, height = img_ori.size
+            monitor_cli.lightx2v_input_image_len.observe(width * height)
+        img = TF.to_tensor(img_ori).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
+        self.input_info.original_size.append(img_ori.size)
+        return img, img_ori
+
     @ProfilingContext4DebugL2("Run Encoders")
     def _run_input_encoder_local_i2i(self):
-        _, image = self.read_image_input(self.input_info.image_path)
+        image_paths_list = self.input_info.image_path.split(",")
+        images_list = []
+        for image_path in image_paths_list:
+            _, image = self.read_image_input(image_path)
+            images_list.append(image)
         prompt = self.input_info.prompt
-        text_encoder_output = self.run_text_encoder(prompt, image, neg_prompt=self.input_info.negative_prompt)
-        image_encoder_output = self.run_vae_encoder(image=text_encoder_output["preprocessed_image"])
-        image_encoder_output["image_info"] = text_encoder_output["image_info"]
+        text_encoder_output = self.run_text_encoder(prompt, images_list, neg_prompt=self.input_info.negative_prompt)
+        image_encoder_output_list = []
+        for vae_image in text_encoder_output["image_info"]["vae_image_list"]:
+            image_encoder_output = self.run_vae_encoder(image=vae_image)
+            image_encoder_output_list.append(image_encoder_output)
        torch.cuda.empty_cache()
        gc.collect()
        return {
            "text_encoder_output": text_encoder_output,
-            "image_encoder_output": image_encoder_output,
+            "image_encoder_output": image_encoder_output_list,
        }
     @ProfilingContext4DebugL1("Run Text Encoder", recorder_mode=GET_RECORDER_MODE(), metrics_func=monitor_cli.lightx2v_run_text_encode_duration, metrics_labels=["QwenImageRunner"])
-    def run_text_encoder(self, text, image=None, neg_prompt=None):
+    def run_text_encoder(self, text, image_list=None, neg_prompt=None):
         if GET_RECORDER_MODE():
             monitor_cli.lightx2v_input_prompt_len.observe(len(text))
         text_encoder_output = {}
         if self.config["task"] == "t2i":
-            prompt_embeds, prompt_embeds_mask, _, _ = self.text_encoders[0].infer([text])
+            prompt_embeds, prompt_embeds_mask, _ = self.text_encoders[0].infer([text])
             text_encoder_output["prompt_embeds"] = prompt_embeds
             text_encoder_output["prompt_embeds_mask"] = prompt_embeds_mask
             if self.config["do_true_cfg"] and neg_prompt is not None:
-                neg_prompt_embeds, neg_prompt_embeds_mask, _, _ = self.text_encoders[0].infer([neg_prompt])
+                neg_prompt_embeds, neg_prompt_embeds_mask, _ = self.text_encoders[0].infer([neg_prompt])
                 text_encoder_output["negative_prompt_embeds"] = neg_prompt_embeds
                 text_encoder_output["negative_prompt_embeds_mask"] = neg_prompt_embeds_mask
         elif self.config["task"] == "i2i":
-            prompt_embeds, prompt_embeds_mask, preprocessed_image, image_info = self.text_encoders[0].infer([text], image)
+            prompt_embeds, prompt_embeds_mask, image_info = self.text_encoders[0].infer([text], image_list)
             text_encoder_output["prompt_embeds"] = prompt_embeds
             text_encoder_output["prompt_embeds_mask"] = prompt_embeds_mask
-            text_encoder_output["preprocessed_image"] = preprocessed_image
             text_encoder_output["image_info"] = image_info
             if self.config["do_true_cfg"] and neg_prompt is not None:
-                neg_prompt_embeds, neg_prompt_embeds_mask, _, _ = self.text_encoders[0].infer([neg_prompt], image)
+                neg_prompt_embeds, neg_prompt_embeds_mask, _ = self.text_encoders[0].infer([neg_prompt], image_list)
                 text_encoder_output["negative_prompt_embeds"] = neg_prompt_embeds
                 text_encoder_output["negative_prompt_embeds_mask"] = neg_prompt_embeds_mask
         return text_encoder_output
@@ -158,7 +180,7 @@ class QwenImageRunner(DefaultRunner):
         if not self.config["_auto_resize"]:
             width, height = self.config["aspect_ratios"][self.config["aspect_ratio"]]
         else:
-            width, height = self.input_info.original_size
+            width, height = self.input_info.original_size[-1]
         calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height)
         multiple_of = self.vae.vae_scale_factor * 2
         width = calculated_width // multiple_of * multiple_of
@@ -178,13 +200,10 @@ class QwenImageRunner(DefaultRunner):
             width, height = self.config["aspect_ratios"][self.config["aspect_ratio"]]
             img_shapes = [(1, height // self.config["vae_scale_factor"] // 2, width // self.config["vae_scale_factor"] // 2)] * self.config["batchsize"]
         elif self.config["task"] == "i2i":
-            image_height, image_width = self.inputs["image_encoder_output"]["image_info"]
-            img_shapes = [
-                [
-                    (1, self.input_info.auto_hight // self.config["vae_scale_factor"] // 2, self.input_info.auto_width // self.config["vae_scale_factor"] // 2),
-                    (1, image_height // self.config["vae_scale_factor"] // 2, image_width // self.config["vae_scale_factor"] // 2),
-                ]
-            ]
+            img_shapes = [[(1, self.input_info.auto_hight // self.config["vae_scale_factor"] // 2, self.input_info.auto_width // self.config["vae_scale_factor"] // 2)]]
+            for image_height, image_width in self.inputs["text_encoder_output"]["image_info"]["vae_image_info_list"]:
+                img_shapes[0].append((1, image_height // self.config["vae_scale_factor"] // 2, image_width // self.config["vae_scale_factor"] // 2))
         self.inputs["img_shapes"] = img_shapes

     def init_scheduler(self):
...
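Each entry of img_shapes is a (frames, latent_height, latent_width) tuple, where the latent grid is the pixel size divided by vae_scale_factor and then by the 2x2 packing factor; the first entry is the output image, followed by one entry per VAE-encoded input image. A hedged numeric sketch (sizes are illustrative, names mirror the config fields above):

# Illustrative numbers only.
vae_scale_factor = 8
output_hw = (1024, 1024)                      # auto-resized target image
vae_image_hw = [(1024, 1024), (672, 1568)]    # per-image sizes from vae_image_info_list

img_shapes = [[(1, output_hw[0] // vae_scale_factor // 2, output_hw[1] // vae_scale_factor // 2)]]
for h, w in vae_image_hw:
    img_shapes[0].append((1, h // vae_scale_factor // 2, w // vae_scale_factor // 2))

print(img_shapes)  # [[(1, 64, 64), (1, 64, 64), (1, 42, 98)]]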
@@ -111,6 +111,7 @@ class AutoencoderKLQwenImageVAE:
         if self.cpu_offload:
             self.model.to(torch.device("cuda"))
         num_channels_latents = self.config["transformer_in_channels"] // 4
         image = image.to(self.model.device).to(self.dtype)
@@ -129,6 +130,7 @@ class AutoencoderKLQwenImageVAE:
         image_latent_height, image_latent_width = image_latents.shape[3:]
         image_latents = self._pack_latents(image_latents, self.config["batchsize"], num_channels_latents, image_latent_height, image_latent_width)
         if self.cpu_offload:
             self.model.to(torch.device("cpu"))
             torch.cuda.empty_cache()
...
@@ -123,6 +123,7 @@ class I2IInputInfo:
     # shape related
     target_shape: int = field(default_factory=int)
     processed_image_size: int = field(default_factory=list)
+    original_size: list = field(default_factory=list)

 def set_input_info(args):
...
@@ -36,6 +36,6 @@ python -m lightx2v.infer \
 --config_json ${lightx2v_path}/configs/qwen_image/qwen_image_i2i.json \
 --prompt "turn the style of the photo to vintage comic book" \
 --negative_prompt " " \
---image_path /data/nvme2/wushuo/qwen-image/pie.png \
+--image_path pie.png \
 --save_result_path ${lightx2v_path}/save_results/qwen_image_i2i.png \
 --seed 0
#!/bin/bash
export CUDA_VISIBLE_DEVICES=
# set paths first
export lightx2v_path=
export model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export PROFILING_DEBUG_LEVEL=2
python -m lightx2v.infer \
--model_cls qwen_image \
--task i2i \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/qwen_image/qwen_image_i2i_2509.json \
--prompt "Have the two characters swap clothes and stand in front of the castle." \
--negative_prompt " " \
--image_path 1.jpeg,2.jpeg \
--save_result_path ${lightx2v_path}/save_results/qwen_image_i2i_2509.png \
--seed 0
#!/bin/bash
export CUDA_VISIBLE_DEVICES=
# set paths first
export lightx2v_path=
export model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export PROFILING_DEBUG_LEVEL=2
python -m lightx2v.infer \
--model_cls qwen_image \
--task i2i \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/offload/block/qwen_image_i2i_2509_block.json \
--prompt "Have the two characters swap clothes and stand in front of the castle." \
--negative_prompt " " \
--image_path 1.jpeg,2.jpeg \
--save_result_path ${lightx2v_path}/save_results/qwen_image_i2i_2509.png \
--seed 0
@@ -34,7 +34,7 @@ python -m lightx2v.infer \
 --model_cls qwen_image \
 --task i2i \
 --model_path $model_path \
---config_json ${lightx2v_path}/configs/qwen_image/qwen_image_i2i.json \
+--config_json ${lightx2v_path}/configs/offload/block/qwen_image_i2i_block.json \
 --prompt "turn the style of the photo to vintage comic book" \
 --negative_prompt " " \
 --image_path pie.png \
...
@@ -33,7 +33,7 @@ python -m lightx2v.infer \
 --model_cls qwen_image \
 --task t2i \
 --model_path $model_path \
---config_json ${lightx2v_path}/configs/qwen_image/qwen_image_t2i.json \
+--config_json ${lightx2v_path}/configs/offload/block/qwen_image_t2i_block.json \
 --prompt 'A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition, Ultra HD, 4K, cinematic composition.' \
 --negative_prompt " " \
 --save_result_path ${lightx2v_path}/save_results/qwen_image_t2i.png \
...