"""OpenAI-compatible API test client for LightLLM + LightX2V.

Sends a single chat-completions request in one of four modes (``t2i``,
``it2i``, ``interleave``, ``vqa``), prints the assistant text and saves any
returned images (and, for the image modes, the raw JSON response) under the
output directory.
"""

from __future__ import annotations

import argparse
import base64
import json
import re
from collections.abc import Iterable, Iterator, Sequence
from datetime import datetime
from pathlib import Path
from typing import Any

import requests

DEFAULT_BASE_URL = "http://0.0.0.0:8000/v1"
DEFAULT_API_KEY = "dummy"
DEFAULT_MODEL = "sensenova-u1"

# Timeout (seconds) applied to every HTTP request issued by this script.
REQUEST_TIMEOUT_SECONDS = 600

# NOTE(review): the two system prompts below read as if angle-bracket tags
# were stripped at some point ("start with a block", "tags like , .").
# The intended tag names are not recoverable from this file, so the text is
# kept verbatim -- confirm against the model's reference prompt.
INTERLEAVE_SYSTEM_PROMPT = (
    "You are a multimodal assistant capable of reasoning with both text and images. "
    "You support two modes:\n\n"
    "Think Mode: When reasoning is needed, you MUST start with a block "
    "and place all reasoning inside it. You MUST interleave text with generated images "
    "using tags like , . Images can ONLY be generated between and "
    ", and may be referenced in the final answer.\n\n"
    "Non-Think Mode: When no reasoning is needed, directly provide the answer without reasoning. "
    "Do not use tags like , ; present any images naturally alongside the text.\n\n"
    "After the think block, always provide a concise, user-facing final answer. "
    "The answer may include text, images, or both. Match the user's language in both reasoning "
    "and the final answer."
)

GENERATION_SYSTEM_PROMPT = (
    "You are an image generation and editing assistant that accurately understands and executes "
    "user intent.\n\nYou support two modes:\n\n1. Think Mode:\nIf the task requires reasoning, you "
    "MUST start with a block. Put all reasoning inside the block using plain text. "
    "DO NOT include any image tags. Keep it reasonable and directly useful for producing the final "
    "image.\n\n2. Non-Think Mode:\nIf no reasoning is needed, directly produce the final image.\n\n"
    "Task Types:\n\nA. Text-to-Image Generation:\n"
    "- Generate a high-quality image based on the user's description.\n"
    "- Ensure visual clarity, semantic consistency, and completeness.\n"
    "- DO NOT introduce elements that contradict or override the user's intent.\n\n"
    "B. Image Editing:\n"
    "- Use the provided image(s) as input or reference for modification or transformation.\n"
    "- The result can be an edited image or a new image based on the reference(s).\n"
    "- Preserve all unspecified attributes unless explicitly changed.\n\n"
    "General Rules:\n"
    "- For any visible text in the image, follow the language specified for the rendered text in "
    "the user's description, not the language of the prompt. If no language is specified, use the "
    "user's input language."
)

# Reference only (not used by this script): aspect-ratio / size presets and
# the (width, height) each one maps to. Presumably mirrors the backend's
# table -- verify against the server before relying on it.
#
#   "1:1":  {"1K": (1024, 1024), "1.5K": (1536, 1536), "2K": (2048, 2048)}
#   "16:9": {"1.5K": (2048, 1152), "2K": (2720, 1536)}
#   "9:16": {"1.5K": (1152, 2048), "2K": (1536, 2720)}
#   "3:2":  {"1.5K": (1888, 1248), "2K": (2496, 1664)}
#   "2:3":  {"1.5K": (1248, 1888), "2K": (1664, 2496)}
#   "4:3":  {"1.5K": (1760, 1312), "2K": (2368, 1760)}
#   "3:4":  {"1.5K": (1312, 1760), "2K": (1760, 2368)}
#   "1:2":  {"1.5K": (1088, 2144), "2K": (1440, 2880)}
#   "2:1":  {"1.5K": (2144, 1088), "2K": (2880, 1440)}
#   "1:3":  {"1.5K": (864, 2592), "2K": (1152, 3456)}
#   "3:1":  {"1.5K": (2592, 864), "2K": (3456, 1152)}

IMAGE_CONFIG_DEFAULT = {
    "aspect_ratio": "16:9",
    "image_size": "2K",
    "image_type": "jpeg",
    "seed": 42,
    # If True, the generated image has the same resolution as the input image.
    # If False, the resolution is determined by image_size and aspect_ratio.
    "dynamic_resolution": True,
    # To choose the output resolution yourself, set both height and width.
    # The default value is -1.
    "height": -1,
    "width": -1,
}

# Matches an image data URL; "b64" captures the base64 payload.
_DATA_URL_RE = re.compile(
    r"data:image/(?P<subtype>[\w+.-]+);base64,(?P<b64>.+)",
    re.DOTALL,
)


def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The parsed namespace. ``--mode`` and ``--prompt`` are mandatory.
    """
    parser = argparse.ArgumentParser(
        description="OpenAI-compatible API test client for LightLLM + LightX2V.",
    )
    parser.add_argument(
        "--mode",
        required=True,
        choices=["t2i", "it2i", "interleave", "vqa"],
        help="Test mode.",
    )
    parser.add_argument("--prompt", required=True, help="User prompt.")
    parser.add_argument(
        "--image_path",
        default=None,
        help="Input image path (required for it2i; optional for interleave / vqa).",
    )
    parser.add_argument("--url", default=DEFAULT_BASE_URL)
    parser.add_argument("--api-key", default=DEFAULT_API_KEY)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument(
        "--out-dir",
        default="./api_test_outputs",
        help="Directory to save generated images and raw responses.",
    )
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top-p", type=float, default=0.95)
    parser.add_argument("--max-tokens", type=int, default=4096)
    parser.add_argument(
        "--enable-thinking",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Pass chat_template_kwargs.enable_thinking to backend.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=IMAGE_CONFIG_DEFAULT["seed"],
        help="Sampling seed for image config / streaming request.",
    )
    parser.add_argument(
        "--aspect-ratio",
        default=IMAGE_CONFIG_DEFAULT["aspect_ratio"],
        help="Aspect ratio for generated image (e.g. 16:9, 1:1).",
    )
    parser.add_argument(
        "--image-size",
        default=IMAGE_CONFIG_DEFAULT["image_size"],
        help="Image size preset for generation (e.g. 1.5K, 2K).",
    )
    parser.add_argument(
        "--height",
        type=int,
        default=IMAGE_CONFIG_DEFAULT["height"],
        help="Manual image height. Use with --width; keep -1 for auto resolution.",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=IMAGE_CONFIG_DEFAULT["width"],
        help="Manual image width. Use with --height; keep -1 for auto resolution.",
    )
    return parser.parse_args(argv)


def build_image_config(args: argparse.Namespace) -> dict[str, Any]:
    """Build the ``image_config`` request field from the parsed options.

    Starts from ``IMAGE_CONFIG_DEFAULT`` and overrides it with the CLI
    values. ``dynamic_resolution`` is switched off only when both
    ``height`` and ``width`` are positive.
    """
    image_config = {
        **IMAGE_CONFIG_DEFAULT,
        "aspect_ratio": args.aspect_ratio,
        "image_size": args.image_size,
        "seed": args.seed,
        "height": args.height,
        "width": args.width,
    }
    if args.height > 0 and args.width > 0:
        image_config["dynamic_resolution"] = False
    return image_config


def local_image_to_data_url(path: str) -> str:
    """Read a local image file and return it as a base64 ``data:`` URL.

    The MIME type is chosen from the file suffix; unrecognised suffixes
    fall back to ``image/jpeg``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    image_path = Path(path)
    if not image_path.exists():
        raise FileNotFoundError(f"image not found: {image_path}")
    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".webp": "image/webp",
    }
    mime = mime_by_suffix.get(image_path.suffix.lower(), "image/jpeg")
    data = base64.b64encode(image_path.read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"


def save_data_url_to_file(data_url: str, path: Path) -> None:
    """Decode a base64 image data URL and write the bytes to ``path``.

    Raises:
        ValueError: If ``data_url`` is not of the form
            ``data:image/<subtype>;base64,<payload>``.
    """
    matched = _DATA_URL_RE.match(data_url)
    if not matched:
        raise ValueError(f"unsupported data url prefix: {data_url[:80]}...")
    raw = base64.b64decode(matched.group("b64"))
    path.write_bytes(raw)
    print(f"[saved] {path} ({len(raw)} bytes)")


def _extension_for_data_url(data_url: str) -> str:
    """Return the file extension (no dot) for an image data URL; default ``png``."""
    if data_url.startswith(("data:image/jpeg", "data:image/jpg")):
        return "jpg"
    if data_url.startswith("data:image/webp"):
        return "webp"
    return "png"


def save_images_from_message(message: dict[str, Any], out_dir: Path, prefix: str) -> None:
    """Save every image data URL found in ``message["images"]``.

    Files are named ``{prefix}_{idx}.{ext}`` where ``idx`` is the item's
    position in the list. Items that are not dicts, or whose URL is not an
    image data URL, are skipped.
    """
    for idx, item in enumerate(message.get("images") or []):
        if not isinstance(item, dict):
            continue
        image_url = (item.get("image_url") or {}).get("url", "")
        if not image_url.startswith("data:image/"):
            continue
        ext = _extension_for_data_url(image_url)
        save_data_url_to_file(image_url, out_dir / f"{prefix}_{idx}.{ext}")


def build_client(base_url: str, api_key: str) -> tuple[str, dict[str, str]]:
    """Return the chat-completions URL and the request headers."""
    chat_url = f"{base_url.rstrip('/')}/chat/completions"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    return chat_url, headers


def _post_chat(chat_url: str, headers: dict[str, str], payload: dict[str, Any]) -> dict[str, Any]:
    """POST a non-streaming request and return the decoded JSON body."""
    response = requests.post(
        chat_url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
    return response.json()


def _first_message(data: dict[str, Any]) -> dict[str, Any]:
    """Return the first choice's ``message`` dict, or ``{}`` when absent."""
    return ((data.get("choices") or [{}])[0]).get("message") or {}


def _print_message_content(message: dict[str, Any]) -> None:
    """Print the assistant's text content under a fixed header."""
    print("\n--- assistant content ---")
    print(message.get("content", ""))


def _run_image_generation(
    args: argparse.Namespace,
    out_dir: Path,
    timestamp: str,
    *,
    tag: str,
    user_content: Any,
) -> None:
    """Shared body of the t2i / it2i modes.

    Sends one non-streaming image request, stores the raw JSON response as
    ``{timestamp}_{tag}_response.json`` and saves the returned images with
    the ``{timestamp}_{tag}`` prefix.
    """
    chat_url, headers = build_client(args.url, args.api_key)
    payload = {
        "model": args.model,
        "messages": [
            {"role": "system", "content": GENERATION_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
        "modalities": ["image"],
        "stream": False,
        "n": 1,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "max_tokens": args.max_tokens,
        "chat_template_kwargs": {"enable_thinking": args.enable_thinking},
        "image_config": build_image_config(args),
    }
    data = _post_chat(chat_url, headers, payload)

    raw_path = out_dir / f"{timestamp}_{tag}_response.json"
    raw_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[saved] {raw_path}")

    message = _first_message(data)
    _print_message_content(message)
    save_images_from_message(message, out_dir=out_dir, prefix=f"{timestamp}_{tag}")


def run_t2i(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Text-to-image: send the prompt and save the generated image(s)."""
    _run_image_generation(args, out_dir, timestamp, tag="t2i", user_content=args.prompt)


def run_it2i(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Image+text-to-image: send an input image with the prompt.

    Raises:
        ValueError: If ``--image_path`` was not given.
    """
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if args.image_path is None:
        raise ValueError("image_path is required")
    user_content = [
        {"type": "image_url", "image_url": {"url": local_image_to_data_url(args.image_path)}},
        {"type": "text", "text": args.prompt},
    ]
    _run_image_generation(args, out_dir, timestamp, tag="it2i", user_content=user_content)


def _iter_sse_chunks(lines: Iterable[bytes]) -> Iterator[dict[str, Any]]:
    """Yield the JSON object of each ``data:`` line until ``[DONE]``.

    Blank lines, lines without the ``data:`` prefix, bodies that are not
    valid JSON and JSON values that are not objects are skipped.
    """
    for line in lines:
        if not line:
            continue
        decoded = line.decode("utf-8")
        if not decoded.startswith("data:"):
            continue
        body = decoded[len("data:"):].strip()
        if body == "[DONE]":
            return
        try:
            chunk = json.loads(body)
        except json.JSONDecodeError:
            # Best effort: ignore malformed chunks rather than abort the stream.
            continue
        if isinstance(chunk, dict):
            yield chunk


def run_interleave_stream(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Interleaved text+image generation over a streaming response.

    Text deltas are echoed as they arrive and written to
    ``{timestamp}_interleave_stream.txt`` at the end; each streamed image is
    saved as ``{timestamp}_interleave_stream_{n}.{ext}``.
    """
    chat_url, headers = build_client(args.url, args.api_key)

    user_content: list[dict[str, Any]] = []
    if args.image_path:
        user_content.append(
            {"type": "image_url", "image_url": {"url": local_image_to_data_url(args.image_path)}}
        )
    user_content.append({"type": "text", "text": args.prompt})

    payload = {
        "model": args.model,
        "messages": [
            {"role": "system", "content": INTERLEAVE_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
        "modalities": ["text", "image"],
        "stream": True,
        "n": 1,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "max_tokens": args.max_tokens,
        "chat_template_kwargs": {"enable_thinking": args.enable_thinking},
        "image_config": build_image_config(args),
        "seed": args.seed,
    }

    text_chunks: list[str] = []
    image_idx = 0
    # The context manager releases the connection even if the loop raises.
    with requests.post(
        chat_url,
        headers=headers,
        json=payload,
        stream=True,
        timeout=REQUEST_TIMEOUT_SECONDS,
    ) as response:
        response.raise_for_status()
        for chunk in _iter_sse_chunks(response.iter_lines()):
            choices = chunk.get("choices") or []
            if not choices:
                continue
            delta = choices[0].get("delta") or {}

            text_piece = delta.get("content")
            if text_piece:
                text_chunks.append(text_piece)
                print(text_piece, end="", flush=True)

            for image_item in delta.get("images") or []:
                if not isinstance(image_item, dict):
                    continue
                image_url = (image_item.get("image_url") or {}).get("url", "")
                if not image_url.startswith("data:image/"):
                    continue
                # Use the extension that matches the payload's declared type.
                ext = _extension_for_data_url(image_url)
                out_file = out_dir / f"{timestamp}_interleave_stream_{image_idx}.{ext}"
                save_data_url_to_file(image_url, out_file)
                image_idx += 1

    print("\n\n--- stream complete ---")
    text_path = out_dir / f"{timestamp}_interleave_stream.txt"
    text_path.write_text("".join(text_chunks), encoding="utf-8")
    print(f"[saved] {text_path}")


def run_vqa(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Visual question answering: print the model's text answer.

    Only ``model`` and ``messages`` are sent; nothing is written to
    ``out_dir`` (the parameters exist so all runners share one signature).
    """
    chat_url, headers = build_client(args.url, args.api_key)

    user_content: list[dict[str, Any]] = []
    if args.image_path:
        user_content.append(
            {"type": "image_url", "image_url": {"url": local_image_to_data_url(args.image_path)}}
        )
    user_content.append({"type": "text", "text": args.prompt})

    payload = {
        "model": args.model,
        "messages": [{"role": "user", "content": user_content}],
    }
    data = _post_chat(chat_url, headers, payload)
    _print_message_content(_first_message(data))


# Maps each --mode value to the function that runs it.
_MODE_RUNNERS = {
    "t2i": run_t2i,
    "it2i": run_it2i,
    "interleave": run_interleave_stream,
    "vqa": run_vqa,
}


def main(argv: Sequence[str] | None = None) -> None:
    """Entry point: parse options, create the output directory, run the mode.

    Raises:
        ValueError: If the selected mode has no registered runner.
    """
    args = parse_args(argv)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    print(f"[config] mode={args.mode}, model={args.model}, url={args.url}")
    if args.image_path is not None:
        print(f"[config] input image_path={args.image_path}")
    print(f"[config] output_dir={out_dir.resolve()}")

    runner = _MODE_RUNNERS.get(args.mode)
    if runner is None:
        raise ValueError(f"unknown mode: {args.mode}")
    runner(args, out_dir=out_dir, timestamp=timestamp)


if __name__ == "__main__":
    main()