test_mllama4.py 1.84 KB
Newer Older
1
2
3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Llama 4 (mllama4) multimodal preprocessing and profiling."""
4

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import pytest
from torch import prod
from transformers import Llama4Config

from vllm.multimodal import MULTIMODAL_REGISTRY

from ...utils import build_model_context


@pytest.mark.parametrize("model_id", ["meta-llama/Llama-Guard-4-12B"])
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
def test_profiling(model_id: str, max_model_len: int) -> None:
    """Check the image token accounting of the dummy multimodal inputs.

    Builds a model context for ``model_id`` limited to one image per prompt,
    requests the registry's dummy multimodal inputs for it, and compares the
    image placeholders against counts derived from the HF vision config.

    Args:
        model_id: Model identifier handed to ``build_model_context``.
        max_model_len: Forwarded as the ``max_model_len`` model config kwarg.
    """
    model_config_kwargs = {
        "max_model_len": max_model_len,
    }
    mm_counts = {"image": 1}
    ctx = build_model_context(
        model_id,
        model_config_kwargs=model_config_kwargs,
        limit_mm_per_prompt=mm_counts,
    )

    mm_inputs = MULTIMODAL_REGISTRY.get_dummy_mm_inputs(
        ctx.model_config,
        mm_counts=mm_counts,
    )

    hf_config = ctx.get_hf_config(Llama4Config)
    image_size = hf_config.vision_config.image_size
    patch_size = hf_config.vision_config.patch_size
    # Invert the squared pixel-shuffle ratio; round() before int() so a
    # floating-point result just below a whole number is not truncated.
    downsample_ratio = int(
        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
    )
    tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio

    mm_data = mm_inputs["mm_kwargs"].get_data()
    chunks_per_image = prod(mm_data["patches_per_image"])
    total_num_patches = chunks_per_image * tokens_per_patch

    # x-y separator tokens: product of the first image's two aspect-ratio
    # entries.
    num_tiles = (
        mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
    )
    # The extra 3 accounts for: image start, image, image end.
    total_tokens = (
        total_num_patches.item() + num_tiles.item() + 3
    )

    image_placeholders = mm_inputs["mm_placeholders"]["image"]

    # NOTE(review): get_num_embeds is read as an attribute, not called —
    # presumably a property on the placeholder type; confirm before changing.
    assert total_num_patches == sum(
        item.get_num_embeds for item in image_placeholders
    )
    assert total_tokens == sum(
        placeholder.length for placeholder in image_placeholders
    )