test_vision.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import openai
import pytest
import pytest_asyncio
from transformers import AutoProcessor

from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform

from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer

# Model launched by the module-scoped `server` fixture; every test in this
# module is parametrized with it as `model_name`.
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
# Per-prompt image limit handed to the server through --limit-mm-per-prompt.
# test_multi_image_input expects requests carrying more images than this to
# be rejected with a BadRequestError.
MAXIMUM_IMAGES = 2

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
# Entries are asset file names (resolved through fixtures such as
# `local_asset_server`); the trailing comment on each line is presumably the
# public URL the asset originates from.
# NOTE: REQUIRED_BEAM_SEARCH_TERMS is indexed in parallel with this list, so
# keep both in the same order when adding or removing assets.
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    "Grayscale_8bits_palette_sample_image.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
    "1280px-Venn_diagram_rgb.svg.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
    "RGBA_comp.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]

# Required terms for beam search validation
# Each entry is a list of term groups - ALL groups must match
# Each group is a list of alternatives - at least ONE term in the group must appear
# This provides semantic validation while allowing wording variation
# Entries are positional: entry i is looked up with the same index as
# TEST_IMAGE_ASSETS[i] (see
# test_single_chat_session_image_base64encoded_beamsearch), and matching is
# done by check_output_matches_terms (case-insensitive substring search).
REQUIRED_BEAM_SEARCH_TERMS = [
    # Boardwalk image: must have "boardwalk" AND ("wooden" or "wood")
    [["boardwalk"], ["wooden", "wood"]],
    # Parrots image: must have ("parrot" or "bird") AND "two"
    [["parrot", "bird"], ["two"]],
    # Venn diagram: must have "venn" AND "diagram"
    [["venn"], ["diagram"]],
    # Gradient image: must have "gradient" AND ("color" or "spectrum")
    [["gradient"], ["color", "spectrum"]],
]


def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bool:
    """
    Return True when *content* satisfies every group in *term_groups*.

    A group is satisfied as soon as one of its terms occurs in *content* as a
    case-insensitive substring. With no groups at all the result is True; a
    group without any terms can never be satisfied.
    """
    haystack = content.lower()
    for alternatives in term_groups:
        for candidate in alternatives:
            if candidate.lower() in haystack:
                break
        else:
            # None of this group's alternatives appeared in the content.
            return False
    return True


def assert_non_empty_content(chat_completion, *, context: str = "") -> str:
    """Return the first choice's content after asserting it is a non-empty str.

    Three assertions run in order: the content is not None, it is a ``str``,
    and it has non-zero length. The None and empty-string failures report
    the choice's finish_reason, the full message and the response usage so
    that flaky / model-quality failures can be diagnosed from the test log.
    When *context* is non-empty, every failure message is prefixed with
    ``[context] ``.
    """
    tag = "" if not context else f"[{context}] "
    first_choice = chat_completion.choices[0]
    text = first_choice.message.content

    def _diagnostics() -> str:
        # Built lazily: the reprs are only evaluated when an assertion fails.
        return (
            f"finish_reason={first_choice.finish_reason!r}, "
            f"full message={first_choice.message!r}, "
            f"usage={chat_completion.usage!r}"
        )

    assert text is not None, (
        f"{tag}Expected non-None content but got None. " + _diagnostics()
    )
    assert isinstance(text, str), (
        f"{tag}Expected str content, got {type(text).__name__}: {text!r}"
    )
    assert len(text) > 0, (
        f"{tag}Expected non-empty content but got empty string. " + _diagnostics()
    )
    return text


@pytest.fixture(scope="module")
def server():
    """Start one RemoteOpenAIServer for the whole module and yield it.

    The server runs ``MODEL_NAME`` with the ``generate`` runner and accepts
    at most ``MAXIMUM_IMAGES`` images per prompt. It is shut down when the
    ``with`` block exits after the last test of the module.
    """
    cli_args = ["--runner", "generate"]
    cli_args += ["--max-model-len", "2048"]
    cli_args += ["--max-num-seqs", "5"]
    cli_args += ["--enforce-eager", "--trust-remote-code"]
    cli_args += ["--limit-mm-per-prompt", json.dumps({"image": MAXIMUM_IMAGES})]
    cli_args.extend(ROCM_EXTRA_ARGS)

    # Start from the shared ROCm overrides; on ROCm additionally raise the
    # media-fetch and engine-iteration timeouts to absorb network delays and
    # slower processing.
    # NOTE(review): the fetch timeout raised here is the *video* one, while
    # the tests visible in this module only send images - confirm whether an
    # image fetch timeout was intended instead.
    env = dict(ROCM_ENV_OVERRIDES)
    if current_platform.is_rocm():
        env["VLLM_VIDEO_FETCH_TIMEOUT"] = "120"
        env["VLLM_ENGINE_ITERATION_TIMEOUT_S"] = "300"

    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=env) as running_server:
        yield running_server


@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the module-scoped *server*.

    The client's async context is exited once the requesting test finishes.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client


@pytest.fixture(scope="session")
def url_encoded_image(local_asset_server) -> dict[str, str]:
    """Map every name in ``TEST_IMAGE_ASSETS`` to its encoded image URL.

    Each asset is loaded from *local_asset_server* and passed through
    ``encode_image_url`` (presumably yielding a base64 data URL - the tests
    using this fixture are the "base64encoded" ones). Computed once per
    test session.
    """
    encoded_by_asset: dict[str, str] = {}
    for asset_name in TEST_IMAGE_ASSETS:
        asset_image = local_asset_server.get_image_asset(asset_name)
        encoded_by_asset[asset_name] = encode_image_url(asset_image)
    return encoded_by_asset


def dummy_messages_from_image_url(
    image_urls: str | list[str],
    content_text: str = "What's in this image?",
):
    if isinstance(image_urls, str):
        image_urls = [image_urls]

    return [
        {
            "role": "user",
            "content": [
                *(
                    {"type": "image_url", "image_url": {"url": image_url}}
                    for image_url in image_urls
                ),
                {"type": "text", "text": content_text},
            ],
        }
    ]


def describe_image_messages(
    image_url: str, *, extra_image_fields: dict | None = None
) -> list[dict]:
    """Build the system + user messages used by the completions-with-image
    family of tests. *extra_image_fields* is merged into the top-level
    image content block (for uuid / bad-key tests)."""
    image_block: dict = {
        "type": "image_url",
        "image_url": {"url": image_url},
    }
    if extra_image_fields:
        image_block.update(extra_image_fields)

    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                image_block,
            ],
        },
    ]


async def complete_and_check(
    client: openai.AsyncOpenAI,
    model_name: str,
    messages: list[dict],
    *,
    context: str,
    max_completion_tokens: int = 50,
    temperature: float = 0.0,
) -> str:
    """Send one chat completion request and return its non-empty content.

    The response is validated with ``assert_non_empty_content``; *context*
    is forwarded so a failure message identifies the calling scenario.
    """
    request_kwargs = dict(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        temperature=temperature,
    )
    response = await client.chat.completions.create(**request_kwargs)
    return assert_non_empty_content(response, context=context)


def get_hf_prompt_tokens(model_name, content, image_url):
    """Return the prompt length the HF processor computes for text + image.

    Builds a single user turn made of the ``<|image_1|>`` placeholder
    followed by *content*, renders it with the tokenizer's chat template
    (generation prompt included), runs the processor on that prompt together
    with the image fetched from *image_url*, and returns the second
    dimension of the resulting ``input_ids``. Callers use the value as the
    expected ``prompt_tokens`` of a server response.
    """
    hf_processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
    )

    chat = [{"role": "user", "content": f"<|image_1|>\n{content}"}]

    fetched = fetch_image(image_url)
    # fetch_image may hand back a MediaWithBytes wrapper; the processor is
    # given the wrapped media object itself.
    image = fetched.media if isinstance(fetched, MediaWithBytes) else fetched

    prompt_text = hf_processor.tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    model_inputs = hf_processor(prompt_text, [image], return_tensors="pt")

    return model_inputs.input_ids.shape[1]


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Single-image chat: validate the first reply, then a follow-up turn.

    Checks that exactly one choice comes back, that generation stopped on
    the token cap, that the reported usage equals the HF-computed prompt
    length plus the cap, and that a second turn appended to the same
    conversation still yields non-empty content.
    """
    token_cap = 10
    question = "What's in this image?"
    conversation = dummy_messages_from_image_url(image_url, question)

    response = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_completion_tokens=token_cap,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(response.choices) == 1, (
        f"Expected 1 choice, got {len(response.choices)}"
    )

    first_choice = response.choices[0]
    assert first_choice.finish_reason == "length", (
        f"Expected finish_reason='length' (capped at {token_cap} "
        f"tokens), got {first_choice.finish_reason!r}. "
        f"content={first_choice.message.content!r}"
    )

    # Reference prompt length comes from running the HF processor locally.
    reference_prompt_tokens = get_hf_prompt_tokens(model_name, question, image_url)
    expected_usage = openai.types.CompletionUsage(
        completion_tokens=token_cap,
        prompt_tokens=reference_prompt_tokens,
        total_tokens=reference_prompt_tokens + token_cap,
    )
    assert response.usage == expected_usage, (
        f"Usage mismatch: got {response.usage!r}, expected {expected_usage!r}"
    )

    reply = first_choice.message
    assert reply.content is not None and len(reply.content) >= 10, (
        f"Expected content with >=10 chars, got {reply.content!r}"
    )
    assert reply.role == "assistant", (
        f"Expected role='assistant', got {reply.role!r}"
    )

    # test multi-turn dialogue: replay the reply and ask a follow-up question
    conversation.extend(
        [
            {"role": "assistant", "content": reply.content},
            {"role": "user", "content": "express your result in json"},
        ]
    )
    await complete_and_check(
        client,
        model_name,
        conversation,
        context=f"multi-turn follow-up for {image_url}",
        max_completion_tokens=10,
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_error_on_invalid_image_url_type(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """A bare-string ``image_url`` field must be rejected as a bad request."""
    # image_url should be a dict {"url": "some url"}, not directly a string
    malformed_image_part = {"type": "image_url", "image_url": image_url}
    text_part = {"type": "text", "text": "What's in this image?"}
    request_messages = [
        {"role": "user", "content": [malformed_image_part, text_part]},
    ]

    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=request_messages,
            max_completion_tokens=10,
            temperature=0.0,
        )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Beam search with n=2 on one image returns two distinct completions."""
    request_messages = dummy_messages_from_image_url(
        image_url, "What's in this image?"
    )

    response = await client.chat.completions.create(
        model=model_name,
        messages=request_messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body={"use_beam_search": True},
    )
    beams = response.choices
    assert len(beams) == 2, f"Expected 2 beam search choices, got {len(beams)}"

    first_text = beams[0].message.content
    second_text = beams[1].message.content
    assert first_text != second_text, (
        f"Beam search should produce different outputs for {image_url}, "
        f"but both returned: {first_text!r}"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    url_encoded_image: dict[str, str],
):
    """Single-image chat where the image is sent as an encoded URL.

    Mirrors ``test_single_chat_session_image`` but takes the image from the
    ``url_encoded_image`` fixture. Checks the choice count, the ``length``
    finish reason, the reported usage and a follow-up turn.

    The expected prompt-token count is computed from the very URL that is
    sent to the server. It used to be derived from a separately
    parametrized ``image_url``, which compared the usage against a
    different image in most parameter combinations; that redundant
    parametrization is removed.
    """
    content_text = "What's in this image?"
    # Keep a handle on the exact URL sent to the server so the reference
    # token count below is derived from the same image.
    encoded_image_url = url_encoded_image[raw_image_url]
    messages = dummy_messages_from_image_url(encoded_image_url, content_text)

    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1, (
        f"Expected 1 choice, got {len(chat_completion.choices)}"
    )

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length", (
        f"Expected finish_reason='length', got {choice.finish_reason!r}. "
        f"content={choice.message.content!r}"
    )

    hf_prompt_tokens = get_hf_prompt_tokens(
        model_name, content_text, encoded_image_url
    )
    expected_usage = openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )
    assert chat_completion.usage == expected_usage, (
        f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
    )

    message = choice.message
    assert message.content is not None and len(message.content) >= 10, (
        f"Expected content with >=10 chars, got {message.content!r}"
    )
    assert message.role == "assistant", (
        f"Expected role='assistant', got {message.role!r}"
    )

    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    await complete_and_check(
        client,
        model_name,
        messages,
        context=f"multi-turn base64 follow-up for {raw_image_url}",
        max_completion_tokens=10,
        temperature=0.0,
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
async def test_single_chat_session_image_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_idx: int,
    url_encoded_image: dict[str, str],
):
    """Beam search on an encoded image yields two distinct, on-topic outputs.

    *image_idx* selects both the asset in ``TEST_IMAGE_ASSETS`` and the term
    groups in ``REQUIRED_BEAM_SEARCH_TERMS`` that each beam must satisfy.
    """
    # NOTE: This test validates that we pass MM data through beam search
    asset_name = TEST_IMAGE_ASSETS[image_idx]
    expected_term_groups = REQUIRED_BEAM_SEARCH_TERMS[image_idx]
    # Shared fragment of every diagnostic message below.
    image_label = f"image {image_idx} ({asset_name})"

    response = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_image_url(url_encoded_image[asset_name]),
        n=2,
        max_completion_tokens=10,
        temperature=0.0,
        extra_body={"use_beam_search": True},
    )
    beams = response.choices
    assert len(beams) == 2, (
        f"Expected 2 beam search choices for {image_label}, got {len(beams)}"
    )

    # Verify beam search produces two different non-empty outputs
    first_text = beams[0].message.content
    second_text = beams[1].message.content

    # Emit beam search outputs for debugging
    print(
        f"Beam search outputs for {image_label}: "
        f"Output 0: {first_text!r}, Output 1: {second_text!r}"
    )

    for ordinal, beam in zip(("First", "Second"), beams):
        assert beam.message.content, (
            f"{ordinal} beam output is empty for {image_label}. "
            f"finish_reason={beam.finish_reason!r}"
        )
    assert first_text != second_text, (
        f"Beam search produced identical outputs for {image_label}: "
        f"{first_text!r}"
    )

    # Verify each output contains the required terms for this image
    for beam_no, beam_text in enumerate((first_text, second_text)):
        assert check_output_matches_terms(beam_text, expected_term_groups), (
            f"Beam output {beam_no} for {image_label} "
            f"doesn't match required terms.\n"
            f"  content: {beam_text!r}\n"
            f"  required (all groups, >=1 per group): {expected_term_groups}"
        )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_chat_streaming_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Streaming and non-streaming completions of one image must agree.

    Runs the same request twice (plain, then ``stream=True``) and checks
    that the stream reports exactly one finish_reason, that it sits on the
    last chunk and equals the non-streamed one, and that the concatenated
    deltas reproduce the non-streamed text.
    """
    messages = dummy_messages_from_image_url(image_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    # finish_reason of the most recent chunk. Tracked explicitly rather than
    # reading the loop variable after the loop, so that an empty stream
    # fails the assertions below instead of raising UnboundLocalError.
    last_finish_reason = None
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta
        if delta.role:
            assert delta.role == "assistant", (
                f"Expected role='assistant' in stream delta, got {delta.role!r}"
            )
        if delta.content:
            chunks.append(delta.content)
        last_finish_reason = streamed_choice.finish_reason
        if last_finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1, (
        f"Expected exactly 1 finish_reason across stream chunks, "
        f"got {finish_reason_count}"
    )
    assert last_finish_reason == stop_reason, (
        f"Stream finish_reason={last_finish_reason!r} "
        f"doesn't match non-stream finish_reason={stop_reason!r}"
    )

    streamed_text = "".join(chunks)
    assert streamed_text == output, (
        f"Streamed output doesn't match non-streamed for {image_url}.\n"
        f"  streamed:     {streamed_text!r}\n"
        f"  non-streamed: {output!r}"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_multi_image_input(
    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
):
    """Multi-image prompts succeed up to MAXIMUM_IMAGES and fail beyond it.

    At or below the limit the request must return non-empty content. Above
    the limit it must raise ``BadRequestError``, after which a plain
    completion request is issued to confirm the server still responds.
    """
    request_messages = dummy_messages_from_image_url(image_urls)

    if len(image_urls) <= MAXIMUM_IMAGES:
        await complete_and_check(
            client,
            model_name,
            request_messages,
            context=f"multi-image input ({len(image_urls)} images)",
            max_completion_tokens=10,
            temperature=0.0,
        )
        return

    # Over the limit: the multi-image request itself must be rejected.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=request_messages,
            max_completion_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    followup = await client.completions.create(
        model=model_name,
        prompt=[0] * 5,
        max_tokens=5,
        temperature=0.0,
    )
    assert followup.choices[0].text is not None, (
        "Server failed to produce output after rejecting over-limit "
        "multi-image request"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Each image, sent alone in a describe-image chat, gets a non-empty reply."""
    for single_url in image_urls:
        await complete_and_check(
            client,
            model_name,
            describe_image_messages(single_url),
            context=f"completions_with_image url={single_url}",
        )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_uuid(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """An image tagged with a uuid can be referenced again by uuid alone.

    For every image, the first request sends the image together with a
    ``uuid`` field (the URL itself is used as the uuid). The second request
    repeats the uuid with an empty ``image_url`` dict and must still return
    non-empty content - presumably served from the server's cache.
    """
    for single_url in image_urls:
        tagged_messages = describe_image_messages(
            single_url, extra_image_fields={"uuid": single_url}
        )
        await complete_and_check(
            client,
            model_name,
            tagged_messages,
            context=f"uuid first request url={single_url}",
        )

        # Same conversation shape, but the image part carries only the uuid.
        uuid_only_part = {"type": "image_url", "image_url": {}, "uuid": single_url}
        uuid_only_messages: list[dict] = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    uuid_only_part,
                ],
            },
        ]
        await complete_and_check(
            client,
            model_name,
            uuid_only_messages,
            context=f"uuid cached (empty image) uuid={single_url}",
        )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_empty_image_with_uuid_without_cache_hit(
    client: openai.AsyncOpenAI,
    model_name: str,
):
    """An empty image referenced by a never-seen uuid is a bad request."""
    unseen_image_part = {
        "type": "image_url",
        "image_url": {},
        "uuid": "uuid_not_previously_seen",
    }
    request_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                unseen_image_part,
            ],
        },
    ]

    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            messages=request_messages,
            model=model_name,
        )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_incorrect_uuid_format(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Unrecognized uuid-like keys do not prevent a non-empty completion.

    Each request carries one unknown key at the top level of the image part
    and another inside its ``image_url`` dict; the request is still expected
    to return non-empty content.
    """
    for single_url in image_urls:
        request_messages = describe_image_messages(
            single_url,
            extra_image_fields={"also_incorrect_uuid_key": single_url},
        )
        # Inject the bad key inside image_url dict too
        image_part = request_messages[1]["content"][1]
        image_part["image_url"]["incorrect_uuid_key"] = single_url

        await complete_and_check(
            client,
            model_name,
            request_messages,
            context=f"incorrect uuid format url={single_url}",
        )