test_glm4_1v.py 4.19 KB
Newer Older
1
2
3
4
5
6
7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.assets.video import VideoAsset
from vllm.multimodal import MULTIMODAL_REGISTRY
8
from vllm.multimodal.inputs import batched_tensors_equal
9
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
10
11
12
13

from ...utils import build_model_context


14
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("expected_toks_per_frame", [299])
@pytest.mark.parametrize(
    "num_frames, fps, expected_grid_t",
    [
        # pre-sampled fixed frames (unexpected behavior,
        # but we still expect it to work without errors)
        (32, 1, 16),
        (32, 2, 16),
        (128, 1, 64),
        (128, 2, 64),
        # post-sampled frames (expected behavior)
        (-1, 1, 5),
        (-1, 2, 10),
    ],
)
def test_processor_override(
    model_id: str,
    expected_toks_per_frame: int,
    expected_grid_t: int,
    fps: int,
    num_frames: int,
):
    """Ensure GLM4vMultiModalProcessor can handle video frames properly.

    The "baby_reading" video asset is loaded with ``num_frames`` frames, run
    through the multimodal processor with ``fps`` as an HF processor kwarg,
    and the result is checked in two ways:

    * the temporal grid size reported in ``video_grid_thw`` equals
      ``expected_grid_t``;
    * the number of video placeholder tokens in the prompt equals
      ``expected_toks_per_frame * grid_t``.

    Args:
        model_id: Model whose processor is built for the test.
        expected_toks_per_frame: Expected placeholder tokens per temporal
            grid step.
        expected_grid_t: Expected temporal grid size for this case.
        fps: Frame rate passed to the HF processor and written into the
            video metadata.
        num_frames: Frame count requested from ``VideoAsset``; the ``-1``
            cases are the "post-sampled" ones (presumably "load every
            frame" - confirm against ``VideoAsset``).
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    tokenizer = processor.info.get_tokenizer()
    hf_processor_mm_kwargs = {"fps": fps}

    # Build the prompt with a single video placeholder and load the video
    video_asset = VideoAsset(name="baby_reading", num_frames=num_frames)
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    video, metadata = video_asset.np_ndarrays, video_asset.metadata
    # Keep the metadata fps in sync with the fps handed to the HF processor
    metadata["fps"] = fps
    mm_data = {"video": [(video, metadata)]}

    processed_inputs = processor(
        prompt,
        mm_items=processor.info.parse_mm_data(mm_data),
        hf_processor_mm_kwargs=hf_processor_mm_kwargs,
    )

    # Ensure we have the right number of video placeholders for the
    # temporal grid size that the processor produced
    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
    video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
    video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
    # Only the first component of the first video's grid entry is checked
    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]

    assert grid_t == expected_grid_t
    assert video_tok_count == expected_toks_per_frame * grid_t
69
70
71
72
73
74
75
76
77


@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
@pytest.mark.parametrize("fps", [2])
def test_video_loader_consistency(
    model_id: str,
    fps: int,
):
    """
    Ensure dynamic video loader (pre-sampled by loader) and normal video
    loader (post-sampled by processor) produce same video processing outputs.

    The same video bytes are decoded once with ``OpenCVVideoBackend`` and
    once with ``OpenCVDynamicVideoBackend`` (given ``fps``). Both results are
    then run through the multimodal processor, and the prompt token ids and
    multimodal kwargs of the two runs must match.

    Args:
        model_id: Model whose processor is built for the test.
        fps: Frame rate passed both to the dynamic loader and to the HF
            processor.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"video": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    hf_processor_mm_kwargs = {"fps": fps}

    # Build the prompt with a single video placeholder
    prompt = "<|begin_of_video|><|video|><|end_of_video|>"

    # Read the raw video bytes once so both loaders decode identical input
    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
    with open(video_path, "rb") as f:
        video_bytes = f.read()

    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
        video_bytes, fps=fps
    )

    # pre-sampled loader shouldn't read all frames
    assert len(dynamic_video) < len(static_video)

    static_mm_data = {"video": [(static_video, static_metadata)]}
    dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}

    static_outputs = processor(
        prompt,
        mm_items=processor.info.parse_mm_data(static_mm_data),
        hf_processor_mm_kwargs=hf_processor_mm_kwargs,
    )
    dynamic_outputs = processor(
        prompt,
        mm_items=processor.info.parse_mm_data(dynamic_mm_data),
        hf_processor_mm_kwargs=hf_processor_mm_kwargs,
    )

    # Both loading paths must yield the same tokens and the same tensors
    assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
    assert batched_tensors_equal(
        static_outputs["mm_kwargs"].get_data(),
        dynamic_outputs["mm_kwargs"].get_data(),
    )