openai_chat_client.py

#!/usr/bin/env python3
"""
Bagel OpenAI-compatible chat client for image generation and multimodal tasks.

Usage:
    python openai_chat_client.py --prompt "A cute cat" --output output.png
    python openai_chat_client.py --prompt "Describe this image" --image-url https://example.com/image.png
"""

import argparse
import base64
from pathlib import Path

import requests


def generate_image(
    prompt: str,
    server_url: str = "http://localhost:8091",
    image_url: str | None = None,
    height: int | None = None,
    width: int | None = None,
    steps: int | None = None,
    seed: int | None = None,
    negative_prompt: str | None = None,
    modality: str = "text2img",  # "text2img" (default), "img2img", "img2text", "text2text"
) -> bytes | str | None:
    """Generate an image or text using the chat completions API.

    Args:
        prompt: Text description or prompt
        server_url: Server URL
        image_url: URL or path to input image (for img2img/img2text)
        height: Image height in pixels
        width: Image width in pixels
        steps: Number of inference steps
        seed: Random seed
        negative_prompt: Negative prompt
        modality: Task modality hint

    Returns:
        Image bytes (for image outputs) or Text string (for text outputs) or None if failed
    """

    # Construct Message Content
    content = [{"type": "text", "text": f"<|im_start|>{prompt}<|im_end|>"}]

    if image_url:
        # Check if local file
        if Path(image_url).exists():
            with open(image_url, "rb") as f:
                b64_data = base64.b64encode(f.read()).decode("utf-8")
                final_image_url = f"data:image/jpeg;base64,{b64_data}"
        else:
            final_image_url = image_url

        content.append({"type": "image_url", "image_url": {"url": final_image_url}})

    messages = [{"role": "user", "content": content}]

    # Build request payload with all parameters at top level
    # Note: vLLM ignores "extra_body", so we put parameters directly in the payload
    payload = {"messages": messages}

    # Set output modalities at top level
    if modality == "text2img" or modality == "img2img":
        payload["modalities"] = ["image"]
    elif modality == "img2text" or modality == "text2text":
        payload["modalities"] = ["text"]

    # Add generation parameters directly to payload
    if height is not None:
        payload["height"] = height
    if width is not None:
        payload["width"] = width
    if steps is not None:
        payload["num_inference_steps"] = steps
    if seed is not None:
        payload["seed"] = seed
    if negative_prompt:
        payload["negative_prompt"] = negative_prompt

    # Send request
    try:
        print(f"Sending request to {server_url} with modality {modality}...")
        response = requests.post(
            f"{server_url}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=300,
        )
        response.raise_for_status()
        data = response.json()

        # Extract content - check ALL choices since server may return multiple
        # (e.g., text in choices[0], image in choices[1])
        choices = data.get("choices", [])

        # First pass: look for image output in any choice
        for choice in choices:
            choice_content = choice.get("message", {}).get("content")

            # Handle Image Output
            if isinstance(choice_content, list) and len(choice_content) > 0:
                first_item = choice_content[0]
                if isinstance(first_item, dict) and "image_url" in first_item:
                    img_url_str = first_item["image_url"].get("url", "")
                    if img_url_str.startswith("data:image"):
                        _, b64_data = img_url_str.split(",", 1)
                        return base64.b64decode(b64_data)

        # Second pass: look for text output if no image found
        for choice in choices:
            choice_content = choice.get("message", {}).get("content")
            if isinstance(choice_content, str) and choice_content:
                return choice_content

        print(f"Unexpected response format: {choices}")
        return None

    except Exception as e:
        print(f"Error: {e}")
        return None


def main():
    parser = argparse.ArgumentParser(description="Bagel multimodal chat client")
    parser.add_argument("--prompt", "-p", default="<|im_start|>A cute cat<|im_end|>", help="Text prompt")
    parser.add_argument("--output", "-o", default="bagel_output.png", help="Output file (for image results)")
    parser.add_argument("--server", "-s", default="http://localhost:8091", help="Server URL")

    # Modality Control
    parser.add_argument("--image-url", "-i", type=str, help="Input image URL or local path")
    parser.add_argument(
        "--modality",
        "-m",
        default="text2img",
        choices=["text2img", "img2img", "img2text", "text2text"],
        help="Task modality",
    )

    # Generation Params
    parser.add_argument("--height", type=int, default=512, help="Image height")
    parser.add_argument("--width", type=int, default=512, help="Image width")
    parser.add_argument("--steps", type=int, default=25, help="Inference steps")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("--negative", help="Negative prompt")

    args = parser.parse_args()

    print(f"Mode: {args.modality}")
    if args.image_url:
        print(f"Input Image: {args.image_url}")

    result = generate_image(
        prompt=args.prompt,
        server_url=args.server,
        image_url=args.image_url,
        height=args.height,
        width=args.width,
        steps=args.steps,
        seed=args.seed,
        negative_prompt=args.negative,
        modality=args.modality,
    )

    if result:
        if isinstance(result, bytes):
            # It's an image
            output_path = Path(args.output)
            output_path.write_bytes(result)
            print(f"Image saved to: {output_path}")
            print(f"Size: {len(result) / 1024:.1f} KB")
        elif isinstance(result, str):
            # It's text
            print("Response:")
            print(result)
    else:
        print("Failed to generate response")
        exit(1)


if __name__ == "__main__":
    main()