from __future__ import annotations
import argparse
import base64
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any
import requests
# Default connection settings for the chat-completions endpoint. parse_args()
# uses them as the defaults of --url, --api-key and --model respectively.
DEFAULT_BASE_URL = "http://0.0.0.0:8000/v1"
DEFAULT_API_KEY = "dummy"
DEFAULT_MODEL = "sensenova-u1"
# System prompt sent as the "system" message by run_interleave_stream()
# (interleaved text + image output).
# NOTE(review): several tag names look like they are missing from this text
# ("MUST start with a block", "tags like , .", "between and") -- presumably
# angle-bracket markup that was stripped somewhere; confirm the exact tags
# against the model's expected prompt before relying on it.
INTERLEAVE_SYSTEM_PROMPT = (
"You are a multimodal assistant capable of reasoning with both text and images. "
"You support two modes:\n\n"
"Think Mode: When reasoning is needed, you MUST start with a block "
"and place all reasoning inside it. You MUST interleave text with generated images "
"using tags like , . Images can ONLY be generated between and "
", and may be referenced in the final answer.\n\n"
"Non-Think Mode: When no reasoning is needed, directly provide the answer without reasoning. "
"Do not use tags like , ; present any images naturally alongside the text.\n\n"
"After the think block, always provide a concise, user-facing final answer. "
"The answer may include text, images, or both. Match the user's language in both reasoning "
"and the final answer."
)
# System prompt sent as the "system" message by run_t2i() and run_it2i()
# (text-to-image generation and image editing).
# NOTE(review): "MUST start with a block" reads as if a tag name is missing
# before "block" -- presumably stripped markup; confirm against the model's
# expected prompt.
GENERATION_SYSTEM_PROMPT = (
"You are an image generation and editing assistant that accurately understands and executes "
"user intent.\n\nYou support two modes:\n\n1. Think Mode:\nIf the task requires reasoning, you "
"MUST start with a block. Put all reasoning inside the block using plain text. "
"DO NOT include any image tags. Keep it reasonable and directly useful for producing the final "
"image.\n\n2. Non-Think Mode:\nIf no reasoning is needed, directly produce the final image.\n\n"
"Task Types:\n\nA. Text-to-Image Generation:\n"
"- Generate a high-quality image based on the user's description.\n"
"- Ensure visual clarity, semantic consistency, and completeness.\n"
"- DO NOT introduce elements that contradict or override the user's intent.\n\n"
"B. Image Editing:\n"
"- Use the provided image(s) as input or reference for modification or transformation.\n"
"- The result can be an edited image or a new image based on the reference(s).\n"
"- Preserve all unspecified attributes unless explicitly changed.\n\n"
"General Rules:\n"
"- For any visible text in the image, follow the language specified for the rendered text in "
"the user's description, not the language of the prompt. If no language is specified, use the "
"user's input language."
)
# Reference table kept as an inert string literal: it is a bare expression
# statement, so nothing in it is executed or bound to a name. It lists, per
# aspect ratio, the image-size presets and a pair of pixel dimensions for each.
# NOTE(review): presumably mirrors the backend's supported (width, height)
# presets for the "aspect_ratio" / "image_size" settings -- confirm against the
# server before relying on it.
"""
_aspect_ratio_to_resolution: ClassVar[dict] = {
"1:1": {"1K": (1024, 1024), "1.5K": (1536, 1536), "2K": (2048, 2048)},
"16:9": {"1.5K": (2048, 1152), "2K": (2720, 1536)},
"9:16": {"1.5K": (1152, 2048), "2K": (1536, 2720)},
"3:2": {"1.5K": (1888, 1248), "2K": (2496, 1664)},
"2:3": {"1.5K": (1248, 1888), "2K": (1664, 2496)},
"4:3": {"1.5K": (1760, 1312), "2K": (2368, 1760)},
"3:4": {"1.5K": (1312, 1760), "2K": (1760, 2368)},
"1:2": {"1.5K": (1088, 2144), "2K": (1440, 2880)},
"2:1": {"1.5K": (2144, 1088), "2K": (2880, 1440)},
"1:3": {"1.5K": (864, 2592), "2K": (1152, 3456)},
"3:1": {"1.5K": (2592, 864), "2K": (3456, 1152)},
}
"""
# Baseline "image_config" request settings. build_image_config() copies this
# dict and overlays the values given on the command line; several CLI options
# also take their defaults from it.
IMAGE_CONFIG_DEFAULT = {
"aspect_ratio": "16:9",
"image_size": "2K",
"image_type": "jpeg",
"seed": 42,
# If True, the generated image has the same resolution as the input image.
# If False, the resolution of the generated image is determined by
# image_size and aspect_ratio.
"dynamic_resolution": True,
# To choose the resolution of the generated image yourself, set both
# height and width. The default value of each is -1.
"height": -1,
"width": -1,
}
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the API test client.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``, which is what the script entry point relies on.

    Returns:
        The populated namespace. ``--mode`` and ``--prompt`` are mandatory;
        every other option has a default.
    """
    parser = argparse.ArgumentParser(description="OpenAI-compatible API test client for LightLLM + LightX2V.")
    # --mode and --prompt are required: there is no interactive fallback, so the
    # help text must not promise one.
    parser.add_argument(
        "--mode",
        required=True,
        choices=["t2i", "it2i", "interleave", "vqa"],
        help="Test mode (required).",
    )
    parser.add_argument("--prompt", required=True, help="User prompt (required).")
    # The hyphenated spelling matches every other flag; the original underscore
    # spelling is kept as an alias so existing command lines keep working.
    parser.add_argument(
        "--image-path",
        "--image_path",
        dest="image_path",
        default=None,
        help="Input image path. Required for it2i; optional for interleave / vqa.",
    )
    parser.add_argument("--url", default=DEFAULT_BASE_URL)
    parser.add_argument("--api-key", default=DEFAULT_API_KEY)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument(
        "--out-dir",
        default="./api_test_outputs",
        help="Directory to save generated images and raw responses.",
    )
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top-p", type=float, default=0.95)
    parser.add_argument("--max-tokens", type=int, default=4096)
    parser.add_argument(
        "--enable-thinking",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Pass chat_template_kwargs.enable_thinking to backend.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=IMAGE_CONFIG_DEFAULT["seed"],
        help="Sampling seed for image config / streaming request.",
    )
    parser.add_argument(
        "--aspect-ratio",
        default=IMAGE_CONFIG_DEFAULT["aspect_ratio"],
        help="Aspect ratio for generated image (e.g. 16:9, 1:1).",
    )
    parser.add_argument(
        "--image-size",
        default=IMAGE_CONFIG_DEFAULT["image_size"],
        help="Image size preset for generation (e.g. 1.5K, 2K).",
    )
    parser.add_argument(
        "--height",
        type=int,
        default=IMAGE_CONFIG_DEFAULT["height"],
        help="Manual image height. Use with --width; keep -1 for auto resolution.",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=IMAGE_CONFIG_DEFAULT["width"],
        help="Manual image width. Use with --height; keep -1 for auto resolution.",
    )
    return parser.parse_args(argv)
def build_image_config(args: argparse.Namespace) -> dict[str, Any]:
    """Build the ``image_config`` payload from the parsed CLI options.

    Starts from a copy of ``IMAGE_CONFIG_DEFAULT`` and overlays the aspect
    ratio, image size, seed, height and width taken from ``args``. When both
    height and width are positive, ``dynamic_resolution`` is switched off.

    Args:
        args: Parsed command-line namespace.

    Returns:
        A new dict; ``IMAGE_CONFIG_DEFAULT`` itself is not modified.
    """
    config: dict[str, Any] = dict(IMAGE_CONFIG_DEFAULT)
    config.update(
        aspect_ratio=args.aspect_ratio,
        image_size=args.image_size,
        seed=args.seed,
        height=args.height,
        width=args.width,
    )
    # An explicit height/width pair takes over from dynamic resolution.
    explicit_size = args.height > 0 and args.width > 0
    if explicit_size:
        config["dynamic_resolution"] = False
    return config
def local_image_to_data_url(path: str) -> str:
    """Read a local image file and return it as a base64 ``data:`` URL.

    The MIME type is chosen from the (case-insensitive) file suffix:
    ``.jpg``/``.jpeg``, ``.png`` and ``.webp`` map to their own types and any
    other suffix falls back to ``image/jpeg``.

    Args:
        path: Filesystem path of the image.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"image not found: {source}")

    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".webp": "image/webp",
    }
    mime = mime_by_suffix.get(source.suffix.lower(), "image/jpeg")

    encoded = base64.b64encode(source.read_bytes())
    payload = encoded.decode("utf-8")
    return f"data:{mime};base64,{payload}"
def save_data_url_to_file(data_url: str, path: Path) -> None:
    """Decode a base64 image ``data:`` URL and write the bytes to ``path``.

    Args:
        data_url: String of the form ``data:image/<subtype>;base64,<payload>``.
        path: Destination file; it is overwritten if it already exists.

    Raises:
        ValueError: If ``data_url`` does not have the expected prefix.
    """
    # Both groups need names: "(?P[" is not valid regex syntax, and the payload
    # is read back below via group("b64").
    matched = re.match(r"data:image/(?P<ext>[\w+.-]+);base64,(?P<b64>.+)", data_url, re.DOTALL)
    if not matched:
        raise ValueError(f"unsupported data url prefix: {data_url[:80]}...")
    raw = base64.b64decode(matched.group("b64"))
    path.write_bytes(raw)
    print(f"[saved] {path} ({len(raw)} bytes)")
def save_images_from_message(message: dict[str, Any], out_dir: Path, prefix: str) -> None:
    """Save every inline image carried by an assistant message.

    Walks ``message["images"]``, skipping entries that are not dicts or whose
    ``image_url.url`` is not an image ``data:`` URL, and writes each remaining
    one to ``out_dir / f"{prefix}_{index}.{ext}"``. The index is the entry's
    position in the original list, so skipped entries leave gaps.

    Args:
        message: Assistant message dict; a missing or empty ``images`` key is fine.
        out_dir: Directory the files are written into.
        prefix: Filename prefix for the saved images.
    """
    for position, entry in enumerate(message.get("images") or []):
        if not isinstance(entry, dict):
            continue
        url = (entry.get("image_url") or {}).get("url", "")
        if not url.startswith("data:image/"):
            continue
        # Choose the file extension from the MIME prefix; png is the fallback.
        if url.startswith(("data:image/jpeg", "data:image/jpg")):
            extension = "jpg"
        elif url.startswith("data:image/webp"):
            extension = "webp"
        else:
            extension = "png"
        target = out_dir / f"{prefix}_{position}.{extension}"
        save_data_url_to_file(url, target)
def build_client(base_url: str, api_key: str) -> tuple[str, dict[str, str]]:
    """Derive the chat-completions URL and request headers.

    Args:
        base_url: API base URL; trailing slashes are ignored.
        api_key: Token placed in the ``Authorization: Bearer`` header.

    Returns:
        A ``(chat_url, headers)`` tuple.
    """
    trimmed_base = base_url.rstrip("/")
    endpoint = f"{trimmed_base}/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    return endpoint, request_headers
def run_t2i(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Run one non-streaming text-to-image request and save the results.

    Posts the prompt with ``GENERATION_SYSTEM_PROMPT``, writes the raw JSON
    response to ``<timestamp>_t2i_response.json``, prints the assistant text
    and saves any returned images with the ``<timestamp>_t2i`` prefix.

    Args:
        args: Parsed command-line namespace.
        out_dir: Directory that receives the response dump and images.
        timestamp: String used as the prefix of every output filename.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    endpoint, request_headers = build_client(args.url, args.api_key)
    image_config = build_image_config(args)

    conversation = [
        {"role": "system", "content": GENERATION_SYSTEM_PROMPT},
        {"role": "user", "content": args.prompt},
    ]
    request_body = {
        "model": args.model,
        "messages": conversation,
        "modalities": ["image"],
        "stream": False,
        "n": 1,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "max_tokens": args.max_tokens,
        "chat_template_kwargs": {"enable_thinking": args.enable_thinking},
        "image_config": image_config,
    }

    reply = requests.post(endpoint, headers=request_headers, json=request_body, timeout=600)
    reply.raise_for_status()
    parsed = reply.json()

    # Keep the untouched response on disk for later inspection.
    dump_path = out_dir / f"{timestamp}_t2i_response.json"
    dump_path.write_text(json.dumps(parsed, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[saved] {dump_path}")

    # Tolerate a missing/empty "choices" list or a missing "message".
    choice_list = parsed.get("choices") or [{}]
    assistant_message = choice_list[0].get("message") or {}
    print("\n--- assistant content ---")
    print(assistant_message.get("content", ""))
    save_images_from_message(assistant_message, out_dir=out_dir, prefix=f"{timestamp}_t2i")
def run_it2i(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Run one non-streaming image-editing (image + text -> image) request.

    Sends the input image as a base64 data URL together with the prompt and
    ``GENERATION_SYSTEM_PROMPT``, writes the raw JSON response to
    ``<timestamp>_it2i_response.json``, prints the assistant text and saves any
    returned images with the ``<timestamp>_it2i`` prefix.

    Args:
        args: Parsed command-line namespace; ``args.image_path`` must be set.
        out_dir: Directory that receives the response dump and images.
        timestamp: String used as the prefix of every output filename.

    Raises:
        ValueError: If no input image path was given.
        FileNotFoundError: If the input image does not exist.
        requests.HTTPError: If the server answers with an error status.
    """
    # Validate up front with a real exception: an ``assert`` disappears under
    # ``python -O``.
    if args.image_path is None:
        raise ValueError("image_path is required")

    chat_url, headers = build_client(args.url, args.api_key)
    image_config = build_image_config(args)
    # The CLI option is stored as ``image_path``; ``args.image`` does not exist.
    input_image_url = local_image_to_data_url(args.image_path)
    payload = {
        "model": args.model,
        "messages": [
            {"role": "system", "content": GENERATION_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": input_image_url}},
                    {"type": "text", "text": args.prompt},
                ],
            },
        ],
        "modalities": ["image"],
        "stream": False,
        "n": 1,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "max_tokens": args.max_tokens,
        "chat_template_kwargs": {"enable_thinking": args.enable_thinking},
        "image_config": image_config,
    }
    response = requests.post(chat_url, headers=headers, json=payload, timeout=600)
    response.raise_for_status()
    data = response.json()

    # Keep the untouched response on disk for later inspection.
    raw_path = out_dir / f"{timestamp}_it2i_response.json"
    raw_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[saved] {raw_path}")

    # Tolerate a missing/empty "choices" list or a missing "message".
    message = ((data.get("choices") or [{}])[0]).get("message") or {}
    print("\n--- assistant content ---")
    print(message.get("content", ""))
    save_images_from_message(message, out_dir=out_dir, prefix=f"{timestamp}_it2i")
def _stream_image_extension(image_url: str) -> str:
    """Return the file extension matching the MIME prefix of an image data URL."""
    if image_url.startswith(("data:image/jpeg", "data:image/jpg")):
        return "jpg"
    if image_url.startswith("data:image/webp"):
        return "webp"
    return "png"


def run_interleave_stream(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Run one streaming interleaved text + image request.

    Posts the prompt (and the optional input image) with
    ``INTERLEAVE_SYSTEM_PROMPT`` and ``stream=True``, then reads the response
    line by line. Only lines starting with ``"data: "`` are processed, and a
    body of ``[DONE]`` ends the loop. Text deltas are echoed to stdout as they
    arrive and collected; image deltas are saved as
    ``<timestamp>_interleave_stream_<n>.<ext>``. The collected text is finally
    written to ``<timestamp>_interleave_stream.txt``.

    Args:
        args: Parsed command-line namespace; ``args.image_path`` is optional.
        out_dir: Directory that receives the images and the text file.
        timestamp: String used as the prefix of every output filename.

    Raises:
        FileNotFoundError: If an input image path is given but does not exist.
        requests.HTTPError: If the server answers with an error status.
    """
    chat_url, headers = build_client(args.url, args.api_key)
    image_config = build_image_config(args)

    user_content = []
    if args.image_path:
        # The CLI option is stored as ``image_path``; ``args.image`` does not exist.
        input_image_url = local_image_to_data_url(args.image_path)
        user_content.append({"type": "image_url", "image_url": {"url": input_image_url}})
    user_content.append({"type": "text", "text": args.prompt})

    payload = {
        "model": args.model,
        "messages": [
            {"role": "system", "content": INTERLEAVE_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": user_content,
            },
        ],
        "modalities": ["text", "image"],
        "stream": True,
        "n": 1,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "max_tokens": args.max_tokens,
        "chat_template_kwargs": {"enable_thinking": args.enable_thinking},
        "image_config": image_config,
        "seed": args.seed,
    }

    data_prefix = "data: "
    text_chunks: list[str] = []
    image_idx = 0
    # A streamed response holds its connection until closed, so scope it with
    # ``with`` (this also covers the early ``break`` and any exception).
    with requests.post(chat_url, headers=headers, json=payload, stream=True, timeout=600) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            decoded = line.decode("utf-8")
            if not decoded.startswith(data_prefix):
                continue
            body = decoded[len(data_prefix):]
            if body.strip() == "[DONE]":
                break
            try:
                chunk = json.loads(body)
            except json.JSONDecodeError:
                # Best effort: skip a line that is not valid JSON and keep reading.
                continue
            choices = chunk.get("choices") or []
            if not choices:
                continue
            delta = choices[0].get("delta") or {}

            text_piece = delta.get("content")
            if text_piece:
                text_chunks.append(text_piece)
                print(text_piece, end="", flush=True)

            for image_item in delta.get("images") or []:
                if not isinstance(image_item, dict):
                    continue
                image_url = (image_item.get("image_url") or {}).get("url", "")
                if not image_url.startswith("data:image/"):
                    continue
                # Name the file after the actual image type instead of always ".png".
                extension = _stream_image_extension(image_url)
                out_file = out_dir / f"{timestamp}_interleave_stream_{image_idx}.{extension}"
                save_data_url_to_file(image_url, out_file)
                image_idx += 1

    print("\n\n--- stream complete ---")
    final_text = "".join(text_chunks)
    text_path = out_dir / f"{timestamp}_interleave_stream.txt"
    text_path.write_text(final_text, encoding="utf-8")
    print(f"[saved] {text_path}")
def run_vqa(args: argparse.Namespace, out_dir: Path, timestamp: str) -> None:
    """Run one non-streaming question-answering request and print the answer.

    The user message carries the optional input image (as a base64 data URL)
    followed by the prompt text. No system prompt or sampling options are sent,
    and nothing is written to disk.

    Args:
        args: Parsed command-line namespace; ``args.image_path`` is optional.
        out_dir: Unused here; accepted so all mode runners share one signature.
        timestamp: Unused here; accepted so all mode runners share one signature.

    Raises:
        FileNotFoundError: If an input image path is given but does not exist.
        requests.HTTPError: If the server answers with an error status.
    """
    endpoint, request_headers = build_client(args.url, args.api_key)

    user_parts = []
    if args.image_path:
        encoded_image = local_image_to_data_url(args.image_path)
        user_parts.append({"type": "image_url", "image_url": {"url": encoded_image}})
    user_parts.append({"type": "text", "text": args.prompt})

    request_body = {
        "model": args.model,
        "messages": [{"role": "user", "content": user_parts}],
    }
    reply = requests.post(endpoint, headers=request_headers, json=request_body, timeout=600)
    reply.raise_for_status()
    parsed = reply.json()

    # Tolerate a missing/empty "choices" list or a missing "message".
    choice_list = parsed.get("choices") or [{}]
    assistant_message = choice_list[0].get("message") or {}
    print("\n--- assistant content ---")
    print(assistant_message.get("content", ""))
def main() -> None:
    """Script entry point: parse options, prepare the output dir, run the mode.

    Creates the output directory (including parents) if needed, prints the
    effective configuration, and dispatches to the runner matching ``--mode``.

    Raises:
        ValueError: If the mode is not one of the known runners.
    """
    args = parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    # One timestamp per run, shared by every file the runner writes.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    print(f"[config] mode={args.mode}, model={args.model}, url={args.url}")
    if args.image_path is not None:
        print(f"[config] input image_path={args.image_path}")
    print(f"[config] output_dir={out_dir.resolve()}")

    runners = {
        "t2i": run_t2i,
        "it2i": run_it2i,
        "interleave": run_interleave_stream,
        "vqa": run_vqa,
    }
    runner = runners.get(args.mode)
    if runner is None:
        raise ValueError(f"unknown mode: {args.mode}")
    runner(args, out_dir=out_dir, timestamp=timestamp)


if __name__ == "__main__":
    main()