test_chat_utils.py 80.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import warnings
5
from collections.abc import Mapping
6
from typing import Literal
7
8

import pytest
9
import os
10
import torch
11
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
12

13
from vllm.assets.audio import AudioAsset
14
from vllm.assets.image import ImageAsset
15
from vllm.assets.video import VideoAsset
16
from vllm.config import ModelConfig
17
18
from vllm.entrypoints.chat_utils import (
    _try_extract_ast,
19
    apply_mistral_chat_template,
20
21
22
23
24
25
26
    load_chat_template,
    parse_chat_messages,
    parse_chat_messages_futures,
    resolve_chat_template_content_format,
    resolve_chat_template_kwargs,
    resolve_hf_chat_template,
)
27
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
28
from vllm.multimodal.utils import (
29
30
31
    encode_audio_url,
    encode_image_url,
    encode_video_url,
32
)
33
34
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
35
from vllm.utils.serial_utils import tensor2base64
36

37
from ..models.registry import HF_EXAMPLE_MODELS
38
from ..utils import VLLM_PATH, models_path_prefix
39
40
41

EXAMPLES_DIR = VLLM_PATH / "examples"

# Model identifiers, each joined onto the shared ``models_path_prefix``.
PHI3V_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
ULTRAVOX_MODEL_ID = os.path.join(
    models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b"
)
QWEN2AUDIO_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")
QWEN2VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
QWEN25VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")
QWEN25OMNI_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-7B")
QWEN3_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen3-8B")
LLAMA_GUARD_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-Guard-3-1B")
HERMES_MODEL_ID = os.path.join(
    models_path_prefix, "NousResearch/Hermes-3-Llama-3.1-8B"
)
MISTRAL_MODEL_ID = os.path.join(
    models_path_prefix, "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
)
52
53


54
@pytest.fixture(scope="function")
def phi3v_model_config():
    """Return a ``ModelConfig`` for ``PHI3V_MODEL_ID``.

    Uses the ``generate`` runner with remote code trusted and
    ``limit_mm_per_prompt`` set to two images.
    """
    return ModelConfig(
        PHI3V_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt={
            "image": 2,
        },
    )
64
65


66
67
@pytest.fixture(scope="function")
def phi3v_model_config_mm_interleaved():
    """Return a ``ModelConfig`` for ``PHI3V_MODEL_ID`` with interleaving on.

    Same settings as the plain Phi-3.5-vision fixture (``generate`` runner,
    remote code trusted, two images per prompt) plus
    ``interleave_mm_strings=True``.
    """
    return ModelConfig(
        PHI3V_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        interleave_mm_strings=True,
        limit_mm_per_prompt={
            "image": 2,
        },
    )
77
78


79
80
81
82
83
84
85
86
87
88
89
90
91
@pytest.fixture(scope="function")
def phi3v_model_config_image_embeds():
    """Return a ``ModelConfig`` for ``PHI3V_MODEL_ID`` with embeds enabled.

    Uses the ``generate`` runner with remote code trusted, two images per
    prompt, and ``enable_mm_embeds=True``.
    """
    return ModelConfig(
        PHI3V_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt={
            "image": 2,
        },
        enable_mm_embeds=True,
    )


92
@pytest.fixture(scope="function")
def qwen2_audio_model_config():
    """Return a ``ModelConfig`` for ``QWEN2AUDIO_MODEL_ID``.

    Uses the ``generate`` runner with remote code trusted and
    ``limit_mm_per_prompt`` set to one audio item.
    """
    return ModelConfig(
        QWEN2AUDIO_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt={
            "audio": 1,
        },
    )
102
103


104
105
106
107
108
109
110
111
112
113
114
115
116
@pytest.fixture(scope="function")
def audio_embeds_model_config():
    """Return a ``ModelConfig`` for ``QWEN2AUDIO_MODEL_ID`` with embeds enabled.

    Uses the ``generate`` runner with remote code trusted, two audio items
    per prompt, and ``enable_mm_embeds=True``.
    """
    return ModelConfig(
        QWEN2AUDIO_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt={
            "audio": 2,
        },
        enable_mm_embeds=True,
    )


117
118
@pytest.fixture(scope="function")
def qwen25omni_model_config_mm_interleaved():
    """Return a ``ModelConfig`` for ``QWEN25OMNI_MODEL_ID`` with interleaving on.

    Uses the ``generate`` runner with ``interleave_mm_strings=True`` and
    per-prompt limits of two images, one audio item and one video.
    """
    return ModelConfig(
        QWEN25OMNI_MODEL_ID,
        runner="generate",
        interleave_mm_strings=True,
        limit_mm_per_prompt={
            "image": 2,
            "audio": 1,
            "video": 1,
        },
    )
129
130


131
132
@pytest.fixture(scope="function")
def mistral_model_config():
    """Return a ``ModelConfig`` for ``MISTRAL_MODEL_ID``.

    Uses the ``generate`` runner with ``limit_mm_per_prompt`` set to two
    images.
    """
    return ModelConfig(
        MISTRAL_MODEL_ID,
        runner="generate",
        limit_mm_per_prompt={
            "image": 2,
        },
    )
140
141


142
143
@pytest.fixture(scope="module")
def image_url():
    """Return the ``cherry_blossom`` image asset encoded by ``encode_image_url``."""
    image = ImageAsset("cherry_blossom")
    return encode_image_url(image.pil_image)
146
147


148
149
@pytest.fixture(scope="module")
def video_url():
    """Return the ``baby_reading`` video asset encoded by ``encode_video_url``."""
    video = VideoAsset("baby_reading", 1)
    return encode_video_url(video.np_ndarrays)
152
153
154
155


@pytest.fixture(scope="module")
def audio_url():
    """Return the ``mary_had_lamb`` audio asset encoded by ``encode_audio_url``."""
    audio = AudioAsset("mary_had_lamb")
    return encode_audio_url(*audio.audio_and_sample_rate)
158
159


160
def _assert_mm_data_is_image_input(
161
    mm_data: MultiModalDataDict | None,
162
    image_count: int,
163
    skipped_image_indices: list | None = None,
164
165
166
167
168
169
170
) -> None:
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}

    image_data = mm_data.get("image")
    assert image_data is not None

171
    assert isinstance(image_data, list) and len(image_data) == image_count
172
173
174
    if skipped_image_indices is not None:
        for i in skipped_image_indices:
            assert image_data[i] is None
175
176


177
def _assert_mm_uuids(
178
    mm_uuids: MultiModalUUIDDict | None,
179
    media_count: int,
180
    expected_uuids: list[str | None],
181
182
183
184
185
186
187
188
189
    modality: str = "image",
) -> None:
    if len(expected_uuids) > 0:
        assert mm_uuids is not None
        assert modality in mm_uuids

        image_uuids = mm_uuids.get(modality)
        assert image_uuids is not None

190
        assert isinstance(image_uuids, list) and len(image_uuids) == media_count
191
192
193
194
195
196

        assert image_uuids == expected_uuids
    else:
        assert mm_uuids is None


197
198
199
200
201
# Modality names for which a test may declare an expected item count.
ModalityType = Literal["image", "video", "audio"]
# Maps each modality name to the number of items expected for it.
MultiModalDataCounts = Mapping[ModalityType, int]


def _assert_mm_data_inputs(
202
    mm_data: MultiModalDataDict | None,
203
    data_count: MultiModalDataCounts,
204
    skipped_media_indices: dict[str, list] | None = None,  # modality -> list[int]
205
206
207
208
209
210
211
212
213
) -> None:
    assert mm_data is not None
    assert set(data_count.keys()) == (set(mm_data.keys()))

    for modality, n in data_count.items():
        modality_data = mm_data.get(modality)
        assert modality_data is not None
        assert isinstance(modality_data, list) and len(modality_data) == n

214
        if skipped_media_indices is not None:
215
            skipped_media_indices_for_modality = skipped_media_indices.get(modality)
216
217
218
219
            assert skipped_media_indices_for_modality is not None
            for i in skipped_media_indices_for_modality:
                assert modality_data[i] is None

220

221
222
223
224
def test_parse_chat_messages_single_image(
    phi3v_model_config,
    image_url,
):
    """One image URL plus text yields one placeholder, one image, no UUID."""
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
244
245


246
247
248
def test_parse_chat_messages_single_image_with_uuid(
    phi3v_model_config,
    image_url,
):
    """A top-level ``uuid`` on an image part is reported in ``mm_uuids``."""
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
276
277


278
279
280
281
282
283
def test_parse_chat_messages_single_empty_image_with_uuid(
    phi3v_model_config,
    image_url,
):
    """An image part with ``image_url=None`` and a UUID keeps a ``None`` slot.

    The placeholder is still emitted, the image entry is ``None``, and the
    UUID is recorded.
    """
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


308
def test_parse_chat_messages_single_image_with_bad_uuid_format(
    phi3v_model_config,
    image_url,
):
    """A UUID outside the part's top-level ``uuid`` key is not recorded.

    The UUID is supplied both nested inside ``image_url`` and under an
    unrecognized ``bad_uuid_key``; the image is still parsed but its UUID
    slot is expected to be ``None``.
    """
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                            "uuid": image_uuid,
                        },
                        "bad_uuid_key": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
339
340


341
def test_parse_chat_messages_multiple_images_with_uuids(
    phi3v_model_config,
    image_url,
):
    """Two image URLs with distinct UUIDs keep their UUIDs in input order."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
383
384


385
386
387
388
389
390
391
392
def test_parse_chat_messages_multiple_empty_images_with_uuids(
    phi3v_model_config,
    image_url,
):
    """Two UUID-only image parts yield two ``None`` slots and both UUIDs."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


def test_parse_chat_messages_mixed_empty_images_with_uuids(
    phi3v_model_config,
    image_url,
):
    """A real image followed by a UUID-only image leaves only slot 1 ``None``."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


467
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_with_uuid_async(
    phi3v_model_config,
    image_url,
):
    """Futures variant: one image URL with a UUID is parsed and the UUID kept."""
    image_uuid = str(hash(image_url))
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(await mm_future, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
496
497


498
499
500
501
502
503
504
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_with_uuid_async(
    phi3v_model_config,
    image_url,
):
    """Futures variant: a UUID-only image part resolves to a ``None`` slot."""
    image_uuid = str(hash(image_url))
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(await mm_future, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


529
530
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_async(
    phi3v_model_config,
    image_url,
):
    """Futures variant: an image URL and a PIL image both keep their UUIDs."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
568
569


570
571
572
573
574
575
576
577
578
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
    phi3v_model_config,
    image_url,
):
    """Futures variant: UUID-only ``image_url`` and ``image_pil`` parts.

    Both image slots are expected to be ``None`` while both UUIDs are kept.
    """
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_pil",
                        "image_pil": None,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_future, 2, skipped_image_indices=[0, 1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


611
612
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
    phi3v_model_config,
    image_url,
):
    """Futures variant: only the second image has a UUID.

    The first image's UUID slot is expected to be ``None``.
    """
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                    },
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2])
648
649


650
651
def test_parse_chat_messages_empty_system(
    mistral_model_config,
):
    """An empty system message is preserved in both content formats."""
    # Test string format
    conversation, _, _ = parse_chat_messages(
        [
            {"role": "system", "content": ""},
            {
                "role": "user",
                "content": [{"type": "text", "text": "Who are you?"}],
            },
        ],
        mistral_model_config,
        content_format="string",
    )
    assert conversation == [
        {"role": "system", "content": ""},
        {"role": "user", "content": "Who are you?"},
    ]

    # Test openai format
    conversation, _, _ = parse_chat_messages(
        [
            {"role": "system", "content": ""},
            {
                "role": "user",
                "content": [{"type": "text", "text": "Who are you?"}],
            },
        ],
        mistral_model_config,
        content_format="openai",
    )
    assert conversation == [
        {"role": "system", "content": [{"type": "text", "text": ""}]},
        {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
    ]


@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
    phi3v_model_config,
    image_url,
):
    """Futures variant: one image URL plus text, with no UUID supplied."""
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(await mm_future, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
712
713


714
715
716
717
def test_parse_chat_messages_multiple_images(
    phi3v_model_config,
    image_url,
):
    """An image URL and a PIL image without UUIDs yield two ``None`` UUID slots."""
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
744
745


746
747
748
749
750
def test_parse_chat_messages_empty_pil_image_with_uuid(
    phi3v_model_config,
):
    """A UUID-only ``image_pil`` part yields a ``None`` image slot and the UUID."""
    uuid = "abcd"
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_pil", "image_pil": None, "uuid": uuid},
                    {"type": "text", "text": "What's in this image?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


def test_parse_chat_messages_empty_image_embeds_with_uuid(
    phi3v_model_config_image_embeds,
):
    """A UUID-only ``image_embeds`` part yields a ``None`` image slot and the UUID."""
    uuid = "abcd"
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_embeds", "image_embeds": None, "uuid": uuid},
                    {"type": "text", "text": "What's in this image?"},
                ],
            }
        ],
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]

    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 1
    assert mm_data["image"][0] is None

    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
def test_parse_chat_messages_empty_audio_embeds_with_uuid(
    audio_embeds_model_config,
):
    """Test audio_embeds with UUID (no actual embeds data)."""
    uuid = "test-audio-uuid-123"

    # The rendered conversation is not inspected by this test.
    _, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this audio"},
                    {"type": "audio_embeds", "audio_embeds": None, "uuid": uuid},
                ],
            }
        ],
        audio_embeds_model_config,
        content_format="string",
    )

    # Should have audio in mm_data as None (UUID provided)
    assert mm_data is not None
    assert "audio" in mm_data
    assert isinstance(mm_data["audio"], list)
    assert len(mm_data["audio"]) == 1
    assert mm_data["audio"][0] is None

    # UUID should be recorded
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid])


def test_parse_chat_messages_audio_embeds_with_string(
    audio_embeds_model_config,
):
    """Test audio_embeds with base64 string embedding data."""
    # Create a sample audio embedding tensor
    audio_embedding = torch.randn(1, 128, 768)

    # Encode it as base64
    base64_audio_embedding = tensor2base64(audio_embedding)

    # The rendered conversation is not inspected by this test.
    _, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this audio"},
                    {
                        "type": "audio_embeds",
                        "audio_embeds": base64_audio_embedding,
                    },
                ],
            }
        ],
        audio_embeds_model_config,
        content_format="string",
    )

    # Should have audio embedding in mm_data (single tensor, not a list)
    assert mm_data is not None
    assert "audio" in mm_data
    assert isinstance(mm_data["audio"], torch.Tensor)
    assert mm_data["audio"].shape == audio_embedding.shape
    # No UUID provided
    assert mm_uuids is not None
    assert "audio" in mm_uuids
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


@pytest.mark.asyncio
async def test_parse_chat_messages_audio_embeds_async(
    audio_embeds_model_config,
):
    """Test audio_embeds with async futures."""
    # Create a sample audio embedding tensor
    audio_embedding = torch.randn(1, 128, 768)

    # Encode it as base64
    base64_audio_embedding = tensor2base64(audio_embedding)

    # The rendered conversation is not inspected by this test.
    _, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this audio"},
                    {
                        "type": "audio_embeds",
                        "audio_embeds": base64_audio_embedding,
                    },
                ],
            }
        ],
        audio_embeds_model_config,
        content_format="string",
    )

    # Should have audio embedding in mm_data (single tensor, not a list)
    mm_data = await mm_future
    assert mm_data is not None
    assert "audio" in mm_data
    assert isinstance(mm_data["audio"], torch.Tensor)
    assert mm_data["audio"].shape == audio_embedding.shape
    # No UUID provided
    assert mm_uuids is not None
    assert "audio" in mm_uuids
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
def test_parse_chat_messages_multiple_image_embeds(
    phi3v_model_config_image_embeds,
):
    """Several ``image_embeds`` parts in one message are all accepted.

    Two base64-encoded embedding tensors are sent in a single user message.
    The parsed conversation must contain one placeholder per embedding, and
    ``mm_data["image"]`` must be a list holding one tensor per embedding
    with the original shapes preserved.
    """
    # Two embeddings whose first dimensions differ.
    source_embeddings = [torch.randn(256, 1024), torch.randn(128, 1024)]

    # One ``image_embeds`` content part per tensor, serialized to base64.
    embed_parts = [
        {"type": "image_embeds", "image_embeds": tensor2base64(embedding)}
        for embedding in source_embeddings
    ]
    messages = [
        {
            "role": "user",
            "content": [
                *embed_parts,
                {"type": "text", "text": "Describe these two images."},
            ],
        }
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    # Placeholders are prepended in order, followed by the text.
    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
        }
    ]
    assert conversation == expected_conversation

    # The embeddings come back as a list, not as a single tensor.
    assert mm_data is not None
    assert "image" in mm_data
    parsed_embeddings = mm_data["image"]
    assert isinstance(parsed_embeddings, list)
    assert len(parsed_embeddings) == 2

    # Each decoded tensor keeps the shape of the tensor that was encoded.
    for parsed, source in zip(parsed_embeddings, source_embeddings):
        assert isinstance(parsed, torch.Tensor)
        assert parsed.shape == source.shape

    # No UUIDs were supplied, so every slot is None.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_image_embeds_with_uuids(
    phi3v_model_config_image_embeds,
):
    """Multiple ``image_embeds`` parts that carry only a UUID are tracked.

    Each part has ``image_embeds`` set to ``None`` and a ``uuid``. The test
    expects a ``None`` entry per part in ``mm_data["image"]`` and the UUIDs
    reported in the same order as the parts.
    """
    first_uuid = "image-uuid-1"
    second_uuid = "image-uuid-2"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_embeds", "image_embeds": None, "uuid": item_uuid}
                for item_uuid in (first_uuid, second_uuid)
            ]
            + [{"type": "text", "text": "Compare these images."}],
        }
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nCompare these images.",
        }
    ]
    assert conversation == expected_conversation

    # With no payload, the data list holds one None per referenced item.
    assert mm_data is not None
    assert "image" in mm_data
    image_items = mm_data["image"]
    assert isinstance(image_items, list)
    assert len(image_items) == 2
    assert all(item is None for item in image_items)

    # UUIDs are reported in the order the parts appeared.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[first_uuid, second_uuid])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_image_embeds_async(
    phi3v_model_config_image_embeds,
):
    """Async variant: several ``image_embeds`` parts in one message.

    Uses ``parse_chat_messages_futures`` and awaits the returned future
    before checking that ``mm_data["image"]`` is a list with one tensor per
    embedding, each with its original shape.
    """
    # Two embeddings whose first dimensions differ.
    source_embeddings = [torch.randn(200, 768), torch.randn(150, 768)]

    # One ``image_embeds`` content part per tensor, serialized to base64.
    embed_parts = [
        {"type": "image_embeds", "image_embeds": tensor2base64(embedding)}
        for embedding in source_embeddings
    ]
    messages = [
        {
            "role": "user",
            "content": [
                *embed_parts,
                {"type": "text", "text": "What do these images show?"},
            ],
        }
    ]

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat do these images show?",
        }
    ]
    assert conversation == expected_conversation

    # The multimodal data is only available once the future resolves.
    mm_data = await mm_future
    assert mm_data is not None
    assert "image" in mm_data
    parsed_embeddings = mm_data["image"]
    assert isinstance(parsed_embeddings, list)
    assert len(parsed_embeddings) == 2

    # Each decoded tensor keeps the shape of the tensor that was encoded.
    for parsed, source in zip(parsed_embeddings, source_embeddings):
        assert isinstance(parsed, torch.Tensor)
        assert parsed.shape == source.shape

    # No UUIDs were supplied, so every slot is None.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
    phi3v_model_config_image_embeds,
):
    """Async: an ``image_embeds`` part with no payload but a UUID is kept.

    The part has ``image_embeds`` set to ``None`` and only a ``uuid``. The
    test expects a placeholder in the conversation, a single ``None`` entry
    in ``mm_data["image"]``, and the UUID reported back.
    """
    uuid = "abcd"
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_embeds", "image_embeds": None, "uuid": uuid},
                    {"type": "text", "text": "What's in this image?"},
                ],
            }
        ],
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]

    # The multimodal data is only available once the future resolves.
    mm_data = await mm_future
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 1
    assert mm_data["image"][0] is None

    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


def test_parse_chat_messages_empty_dict_image_embeds(
    phi3v_model_config_image_embeds,
):
    """An empty dict passed as ``image_embeds`` parses without errors.

    The conversation must still receive an image placeholder, and
    ``mm_data["image"]`` must come back as an empty dict.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_embeds", "image_embeds": {}},
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]
    assert conversation == expected_conversation

    # The embeddings container stays a dict and has no entries.
    assert mm_data is not None
    assert "image" in mm_data
    image_embeds = mm_data["image"]
    assert isinstance(image_embeds, dict)
    assert len(image_embeds) == 0

    # No UUID was supplied for the single item.
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_multiple_dict_image_embeds(
    phi3v_model_config_image_embeds,
):
    """Several dict-valued ``image_embeds`` parts parse without errors.

    Each content part carries a dict with two base64-encoded tensors, one
    row from each batched source tensor. The test expects
    ``mm_data["image"]`` to be a dict whose entries have the shapes of the
    full batched source tensors.
    """
    batch_size = 2
    image_embedding_1 = torch.randn(batch_size, 256, 1024)
    image_embedding_2 = torch.randn(batch_size, 3)

    # Build one content part per batch row, pairing up rows of both tensors.
    content_parts = []
    for row_1, row_2 in zip(image_embedding_1, image_embedding_2):
        content_parts.append(
            {
                "type": "image_embeds",
                "image_embeds": {
                    "image_embedding_1": tensor2base64(row_1),
                    "image_embedding_2": tensor2base64(row_2),
                },
            }
        )
    content_parts.append({"type": "text", "text": "Describe these two images."})

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": content_parts}],
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
        }
    ]
    assert conversation == expected_conversation

    assert mm_data is not None
    assert "image" in mm_data
    merged_embeds = mm_data["image"]
    assert isinstance(merged_embeds, dict)
    # NOTE(review): this compares the number of dict keys with batch_size;
    # both happen to be 2 here — confirm that is the intended check.
    assert len(merged_embeds) == batch_size

    # Each entry has the shape of the corresponding batched source tensor.
    for key, source in (
        ("image_embedding_1", image_embedding_1),
        ("image_embedding_2", image_embedding_2),
    ):
        assert isinstance(merged_embeds[key], torch.Tensor)
        assert merged_embeds[key].shape == source.shape

    # No UUIDs were supplied, so every slot is None.
    _assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
    image_url,
):
    """Async: an ``image_url`` part and an ``image_pil`` part in one message.

    Both images must get their own placeholder, and the awaited multimodal
    data must contain two image inputs with no UUIDs.
    """
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    # The multimodal data is only available once the future resolves.
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_placeholder_already_in_prompt(
    phi3v_model_config,
    image_url,
):
    """Placeholders the user already wrote in the text are not duplicated.

    The text references both ``<|image_1|>`` and ``<|image_2|>``, so the
    expected conversation content is the text unchanged, with no extra
    placeholders prepended.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "text",
                        "text": "What's in <|image_1|> and how does it compare to <|image_2|>?",  # noqa: E501
                    },
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's in <|image_1|> and how does it compare to <|image_2|>?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_placeholder_one_already_in_prompt(
    phi3v_model_config,
    image_url,
):
    """Only the placeholder missing from the text is prepended.

    The text already mentions ``<|image_1|>`` but not ``<|image_2|>``, so
    the expected content is the original text with ``<|image_2|>`` added in
    front of it.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "text",
                        "text": "What's in <|image_1|> and how does it compare to "
                        "the other one?",
                    },
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
            "the other one?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_across_messages(
    phi3v_model_config,
    image_url,
):
    """Image placeholders keep counting across separate user messages.

    Two user messages each carry one image. The first gets ``<|image_1|>``
    and the second ``<|image_2|>``, and the plain-string assistant message
    between them is passed through unchanged.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What's in this image?"},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What about this one?"},
                ],
            },
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
    phi3v_model_config,
    image_url,
):
    """Per-image UUIDs are reported when images span several messages.

    Both user messages attach the same image with the same UUID. The test
    expects two image inputs and the UUID listed once per image.
    """
    # Both parts share one UUID derived from the URL.
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in this image?"},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What about this one?"},
                ],
            },
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])


def test_parse_chat_messages_context_text_format(
    phi3v_model_config,
):
    """Text-only messages are normalized to part lists in ``openai`` format.

    With ``content_format="openai"``, plain-string contents are expected to
    come back as a list with one ``text`` part. No multimodal data or UUIDs
    are produced because the conversation has no media.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [{"type": "text", "text": "What's in this text?"}],
            },
            {"role": "assistant", "content": "Some stuff."},
            {"role": "user", "content": "What about this one?"},
        ],
        phi3v_model_config,
        content_format="openai",
    )

    assert conversation == [
        {
            "role": "user",
            "content": [{"type": "text", "text": "What's in this text?"}],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "Some stuff."}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "What about this one?"}],
        },
    ]
    assert mm_data is None
    assert mm_uuids is None


def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    image_url,
):
    """Three images in a single message raise ``ValueError``.

    The error message must match ``"At most"``.
    NOTE(review): presumably the fixture caps images per prompt at two —
    confirm against the ``phi3v_model_config`` fixture.
    """
    with warnings.catch_warnings():
        # Parsing aborts early, so a pending image coroutine may never be
        # awaited; silence that specific warning.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited",
        )
        with pytest.raises(ValueError, match="At most"):
            parse_chat_messages(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {"type": "text", "text": "What's in these images?"},
                        ],
                    }
                ],
                phi3v_model_config,
                content_format="string",
            )


def test_parse_chat_messages_rejects_too_many_images_across_messages(
    phi3v_model_config,
    image_url,
):
    """Three images spread over two messages raise ``ValueError``.

    One image is in the first user message and two in the second. The error
    message must match ``"At most"``.
    NOTE(review): presumably the limit applies to the whole conversation and
    is two for this fixture — confirm against ``phi3v_model_config``.
    """
    with warnings.catch_warnings():
        # Parsing aborts early, so a pending image coroutine may never be
        # awaited; silence that specific warning.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited",
        )
        with pytest.raises(ValueError, match="At most"):
            parse_chat_messages(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {"type": "text", "text": "What's in this image?"},
                        ],
                    },
                    {"role": "assistant", "content": "Some stuff."},
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {"type": "text", "text": "What about these two?"},
                        ],
                    },
                ],
                phi3v_model_config,
                content_format="string",
            )


def test_parse_chat_messages_multiple_images_uncommon_input(
    phi3v_model_config,
    image_url,
):
    """Shorthand content parts are accepted.

    The content list mixes a bare string (instead of a ``text`` part) with
    dicts that have no ``type`` key and map ``image_url`` directly to a URL
    string. The result must match what the fully-specified form produces:
    two placeholders followed by the text.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    "What's in these images?",
                    {"image_url": image_url},
                    {"image_url": image_url},
                ],
            }
        ],
        phi3v_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_interleave(
    phi3v_model_config_mm_interleaved,
    image_url,
):
    """Interleaved mode keeps each placeholder where its image appeared.

    With the interleaved fixture, placeholders are expected inline between
    the text parts, in content order, rather than prepended to the message.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "I need you to compare this image",
                    },
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "and this one"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Do they have differences?"},
                ],
            }
        ],
        phi3v_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
            "Do they have differences?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_interleave_async(
    phi3v_model_config_mm_interleaved,
    image_url,
):
    """Async: interleaved mode keeps placeholders where images appeared.

    Same scenario as the synchronous interleave test, but parsed through
    ``parse_chat_messages_futures``; the multimodal data is awaited before
    it is checked.
    """
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "I need you to compare this image",
                    },
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "and this one"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Do they have differences?"},
                ],
            }
        ],
        phi3v_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
            "Do they have differences?",
        }
    ]
    # The multimodal data is only available once the future resolves.
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
    phi3v_model_config_mm_interleaved,
    image_url,
):
    """Async: interleaved images that carry UUIDs report those UUIDs.

    Both image parts share one UUID. Placeholders are expected inline, and
    the UUID must be listed once per image.
    """
    # Both parts share one UUID derived from the URL.
    image_uuid = str(hash(image_url))
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "I need you to compare this image",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "and this one"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "Do they have differences?"},
                ],
            }
        ],
        phi3v_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
            "Do they have differences?",
        }
    ]
    # The multimodal data is only available once the future resolves.
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])


def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
    phi3v_model_config_mm_interleaved,
    image_url,
):
    """Interleaved placeholders keep counting across user messages.

    Each user message has one image placed after its first text part. The
    first message gets ``<|image_1|>`` inline and the second ``<|image_2|>``.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Be accurate."},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            },
        ],
        phi3v_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
        },
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "What's on this image?\n<|image_2|>"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
    phi3v_model_config_mm_interleaved,
    image_url,
):
    """Interleaved images with UUIDs across messages report their UUIDs.

    Both user messages attach the same image with the same UUID.
    Placeholders are expected inline, and the UUID must be listed once per
    image.
    """
    # Both parts share one UUID derived from the URL.
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "Be accurate."},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                ],
            },
        ],
        phi3v_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
        },
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "What's on this image?\n<|image_2|>"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])


def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
    qwen25omni_model_config_mm_interleaved,
    image_url,
    video_url,
    audio_url,
):
    """Interleaved image, audio and video parts across two user messages.

    Each modality's placeholder is expected inline at the position of its
    content part. The multimodal data must hold two images, one video and
    one audio item, all without UUIDs.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Now listen to this audio"},
                    {"type": "audio_url", "audio_url": {"url": audio_url}},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "And what's in the video?"},
                    {"type": "video_url", "video_url": {"url": video_url}},
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    # Item counts and UUIDs are checked per modality.
    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=[None, None])
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


1801
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
    qwen25omni_model_config_mm_interleaved,
    image_url,
    video_url,
    audio_url,
):
    """Interleaved image/audio/video parts that all carry an explicit uuid.

    Two user turns (separated by an assistant turn) mix text with media
    parts. The parsed string content must contain the media placeholders at
    the positions of the media parts, and every uuid supplied in the request
    must be reported back for its modality.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "Now listen to this audio"},
                    {
                        "type": "audio_url",
                        "audio_url": {"url": audio_url},
                        "uuid": "audio_123",
                    },
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        # Same uuid as the first image on purpose: both
                        # entries are expected back below.
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "And what's in the video?"},
                    {
                        "type": "video_url",
                        "video_url": {"url": video_url},
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(
        mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"]
    )
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"])
1869
1870


1871
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    image_url,
    video_url,
    audio_url,
):
    """Interleaved media parts whose payload is ``None`` but that carry a uuid.

    Every media part sets its ``*_url`` field to ``None`` and only provides a
    uuid. The placeholders must still appear in the string content, the
    uuids must be reported per modality, and all media items are passed to
    ``_assert_mm_data_inputs`` as skipped indices.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "Now listen to this audio"},
                    {
                        "type": "audio_url",
                        "audio_url": None,
                        "uuid": "audio_123",
                    },
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "And what's in the video?"},
                    {
                        "type": "video_url",
                        "video_url": None,
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    # Item counts are unchanged; every index of every modality is listed as
    # skipped because no media payload was supplied.
    _assert_mm_data_inputs(
        mm_data,
        {"image": 2, "video": 1, "audio": 1},
        skipped_media_indices={"image": [0, 1], "video": [0], "audio": [0]},
    )
    _assert_mm_uuids(
        mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"]
    )
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"])
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953


def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    image_url,
    video_url,
    audio_url,
):
    """Interleaved media parts where only some of them carry a uuid.

    The first image and the video have a uuid; the audio and the second image
    do not. Parts without a uuid must be reported as ``None`` at their
    position in the per-modality uuid list.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "Now listen to this audio"},
                    # No uuid on the audio part.
                    {"type": "audio_url", "audio_url": {"url": audio_url}},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    # No uuid on the second image part.
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "And what's in the video?"},
                    {
                        "type": "video_url",
                        "video_url": {"url": video_url},
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=["image_123", None])
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
2003
2004
2005
2006
2007
2008
2009


def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
    phi3v_model_config_mm_interleaved,
    image_url,
):
    """Manually written image placeholders must be rejected when interleaving.

    The text part already contains ``<|image_1|>`` / ``<|image_2|>`` while two
    image parts are also supplied; with the interleaved config,
    ``parse_chat_messages`` is expected to raise ``ValueError``.
    """
    with pytest.raises(
        ValueError,
        # `match` is applied as a regular expression, so `|` and `.` are
        # escaped; unescaped, `<|image_1|>` would split the pattern into
        # alternatives and any message containing "image_1" would pass.
        match=r"Found more '<\|image_1\|>' placeholders in input prompt "
        r"than actual multimodal data items\.",
    ):
        parse_chat_messages(
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {
                            "type": "text",
                            "text": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
                            "Do they have differences?",
                        },
                    ],
                }
            ],
            phi3v_model_config_mm_interleaved,
            content_format="string",
        )


2034
2035
2036
2037
2038
@pytest.mark.parametrize(
    "model",
    [
        QWEN2VL_MODEL_ID,  # tokenizer.chat_template is of type str
        HERMES_MODEL_ID,  # tokenizer.chat_template is of type dict
    ],
)
@pytest.mark.parametrize("use_tools", [True, False])
def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
    """Check that the resolved chat template is always a ``str``.

    Covers tokenizers whose ``chat_template`` attribute is a plain string as
    well as ones where it is a dict, with and without tools being passed.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Build the tokenizer
    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    tools = (
        [
            {
                "type": "function",
                "function": {
                    "name": "dummy_function_name",
                    "description": "This is a dummy function",
                    "parameters": sample_json_schema,
                },
            }
        ]
        if use_tools
        else None
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        model_config=model_config,
    )
    assert isinstance(chat_template, str)


2092
2093
2094
2095
2096
2097
@pytest.mark.parametrize(
    "model, expected_kwargs",
    [
        (
            QWEN2VL_MODEL_ID,
            {
                "add_vision_id",
                "add_generation_prompt",
                "continue_final_message",
                "tools",
            },
        ),
        (
            QWEN3_MODEL_ID,
            {
                "enable_thinking",
                "add_generation_prompt",
                "continue_final_message",
                "tools",
            },
        ),
    ],
)
def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwargs):
    """Check how request-supplied chat template kwargs are validated/filtered.

    * With default settings, passing ``chat_template`` / ``tokenize`` in the
      request kwargs must raise ``ValueError``.
    * With ``raise_on_unexpected=False``, the resolved kwargs must be exactly
      ``expected_kwargs`` for the given model.
    * A tokenizer whose ``apply_chat_template`` only declares ``**kwargs``
      must still keep the HF base parameters and drop unknown ones.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    tools = [
        {
            "type": "function",
            "function": {
                "name": "dummy_function_name",
                "description": "This is a dummy function",
                "parameters": sample_json_schema,
            },
        }
    ]

    chat_template_kwargs = {
        # both unused
        "unsed_kwargs_1": 123,
        "unsed_kwargs_2": "abc",
        # should not appear
        "chat_template": "{% Hello world! %}",
        "tokenize": True,
        # used by tokenizer
        "continue_final_message": True,
        "tools": tools,
        # both used by Qwen2-VL and Qwen3
        "add_generation_prompt": True,
        # only used by Qwen2-VL
        "add_vision_id": True,
        # only used by Qwen3
        "enable_thinking": True,
    }

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Build the tokenizer
    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        model_config=model_config,
    )
    with pytest.raises(
        ValueError, match="Found unexpected chat template kwargs from request"
    ):
        # should raise error if `chat_template_kwargs` contains
        # `chat_template` or `tokenize`
        resolve_chat_template_kwargs(
            tokenizer,
            chat_template=chat_template,
            chat_template_kwargs=chat_template_kwargs,
        )
    resolved_chat_template_kwargs = resolve_chat_template_kwargs(
        tokenizer,
        chat_template=chat_template,
        chat_template_kwargs=chat_template_kwargs,
        raise_on_unexpected=False,
    )
    assert set(resolved_chat_template_kwargs.keys()) == expected_kwargs

    # Additional test: Verify HF base parameters work with **kwargs tokenizers
    # This validates the fix for tokenizers like Kimi K2 that use **kwargs
    # to receive standard HuggingFace parameters instead of declaring them explicitly
    from vllm.entrypoints.chat_utils import _get_hf_base_chat_template_params

    hf_base_params = _get_hf_base_chat_template_params()
    # Verify common HF parameters are in the base class
    assert {"add_generation_prompt", "tools", "continue_final_message"}.issubset(
        hf_base_params
    ), f"Expected HF base params not found in {hf_base_params}"

    # Test with a mock tokenizer that uses **kwargs (like Kimi K2)
    class MockTokenizerWithKwargs:
        def apply_chat_template(self, conversation, **kwargs):
            return "mocked_output"

    mock_tokenizer = MockTokenizerWithKwargs()
    mock_kwargs = {
        "add_generation_prompt": True,
        "tools": tools,
        "continue_final_message": False,
        "unknown_param": "should_be_filtered",
    }
    resolved_mock = resolve_chat_template_kwargs(
        mock_tokenizer, chat_template, mock_kwargs, raise_on_unexpected=False
    )
    # HF base params should pass through even with **kwargs tokenizer
    assert "add_generation_prompt" in resolved_mock
    assert "tools" in resolved_mock
    assert "continue_final_message" in resolved_mock
    # Unknown params should be filtered out
    assert "unknown_param" not in resolved_mock

2228

2229
2230
# NOTE: Qwen2-Audio default chat template is specially defined inside
# processor class instead of using `tokenizer_config.json`
@pytest.mark.parametrize(
    ("model", "expected_format"),
    [
        (PHI3V_MODEL_ID, "string"),
        (QWEN2VL_MODEL_ID, "openai"),
        (QWEN25VL_MODEL_ID, "openai"),
        (ULTRAVOX_MODEL_ID, "string"),
        (QWEN2AUDIO_MODEL_ID, "openai"),
        (LLAMA_GUARD_MODEL_ID, "openai"),
    ],
)
def test_resolve_content_format_hf_defined(model, expected_format):
    """Check content-format auto-detection from the model's own chat template.

    No chat template is passed explicitly; the format resolved with
    ``"auto"`` must equal ``expected_format`` for each listed model.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,
        model_config=model_config,
    )
    assert isinstance(chat_template, str)

    # Dump the template and its parsed form to ease debugging on failure.
    print("[TEXT]")
    print(chat_template)
    print("[AST]")
    print(_try_extract_ast(chat_template))

    resolved_format = resolve_chat_template_content_format(
        None,  # Test detecting the tokenizer's chat_template
        None,
        "auto",
        tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format


@pytest.mark.parametrize(
    ("model", "expected_format"),
    [
        (os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b"), "string"),
        (os.path.join(models_path_prefix, "facebook/chameleon-7b"), "string"),
        (os.path.join(models_path_prefix, "deepseek-ai/deepseek-vl2-tiny"), "string"),
        (os.path.join(models_path_prefix, "adept/fuyu-8b"), "string"),
        (os.path.join(models_path_prefix, "google/paligemma-3b-mix-224"), "string"),
        (os.path.join(models_path_prefix, "Qwen/Qwen-VL"), "string"),
        (os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"), "string"),
    ],
)
def test_resolve_content_format_fallbacks(model, expected_format):
    """Check content-format auto-detection for the listed fallback models.

    No chat template is passed explicitly; a template must still be resolved
    as a ``str`` and the format resolved with ``"auto"`` must equal
    ``expected_format``.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Uses the tokenizer configured on the model config, which may differ
    # from `model` when `model_info.tokenizer` is set.
    tokenizer = get_tokenizer(
        model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,
        model_config=model_config,
    )
    assert isinstance(chat_template, str)

    # Dump the template and its parsed form to ease debugging on failure.
    print("[TEXT]")
    print(chat_template)
    print("[AST]")
    print(_try_extract_ast(chat_template))

    resolved_format = resolve_chat_template_content_format(
        None,  # Test detecting the tokenizer's chat_template
        None,
        "auto",
        tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format


@pytest.mark.parametrize(
    ("template_path", "expected_format"),
    [
        ("template_alpaca.jinja", "string"),
        ("template_baichuan.jinja", "string"),
        ("template_chatglm.jinja", "string"),
        ("template_chatglm2.jinja", "string"),
        ("template_chatml.jinja", "string"),
        ("template_dse_qwen2_vl.jinja", "openai"),
        ("template_falcon_180b.jinja", "string"),
        ("template_falcon.jinja", "string"),
        ("template_inkbot.jinja", "string"),
        ("template_teleflm.jinja", "string"),
        ("template_vlm2vec_phi3v.jinja", "openai"),
        ("template_vlm2vec_qwen2vl.jinja", "openai"),
        ("tool_chat_template_granite_20b_fc.jinja", "string"),
        ("tool_chat_template_hermes.jinja", "string"),
        ("tool_chat_template_internlm2_tool.jinja", "string"),
        ("tool_chat_template_llama3.1_json.jinja", "openai"),
        ("tool_chat_template_llama3.2_json.jinja", "openai"),
        ("tool_chat_template_mistral_parallel.jinja", "string"),
        ("tool_chat_template_mistral.jinja", "string"),
    ],
)
def test_resolve_content_format_examples(template_path, expected_format):
    """Check content-format auto-detection for the example template files.

    Each template is loaded from ``EXAMPLES_DIR`` and passed explicitly; the
    tokenizer's own ``chat_template`` is cleared so only the loaded template
    is available on the tokenizer side.
    """
    model_config = ModelConfig(
        PHI3V_MODEL_ID,  # Dummy
        tokenizer=PHI3V_MODEL_ID,  # Dummy
        trust_remote_code=True,
    )

    dummy_tokenizer = get_tokenizer(
        PHI3V_MODEL_ID,  # Dummy
        trust_remote_code=model_config.trust_remote_code,
    )
    dummy_tokenizer.chat_template = None

    chat_template = load_chat_template(EXAMPLES_DIR / template_path)
    assert isinstance(chat_template, str)

    # Dump the template and its parsed form to ease debugging on failure.
    print("[TEXT]")
    print(chat_template)
    print("[AST]")
    print(_try_extract_ast(chat_template))

    resolved_format = resolve_chat_template_content_format(
        chat_template,
        None,
        "auto",
        dummy_tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format
2403
2404


2405
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
    """Check that ``thinking`` content parts are parsed into text parts.

    With ``content_format="openai"``, each ``{"type": "thinking", ...}`` part
    is expected to come back as a ``{"type": "text"}`` part holding the
    thinking string, and plain string content is expected as a one-element
    list of text parts.
    """
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {
                    "type": "thinking",
                    "closed": True,
                    "thinking": "Only return the answer when you are confident.",
                },
            ],
        },
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "Let me think about it."},
                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
                {
                    "type": "text",
                    "text": "The answer is 4.",
                },
            ],
        },
    ]

    conversation_with_thinking, _, _ = parse_chat_messages(
        messages,
        mistral_model_config,
        content_format="openai",
    )

    expected_conversation = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {
                    "type": "text",
                    "text": "Only return the answer when you are confident.",
                },
            ],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "What is 2+2?"}],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "Let me think about it."},
                {"type": "text", "text": "2+2 = 4"},
                {"type": "text", "text": "The answer is 4."},
            ],
        },
    ]

    assert conversation_with_thinking == expected_conversation


def test_apply_mistral_chat_template_thinking_chunk():
    """Check that thinking chunks are rendered as ``[THINK]...[/THINK]``.

    The messages are tokenized with the Mistral chat template, decoded back
    while keeping special tokens, and compared against the exact expected
    prompt string.
    """
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {
                    "type": "thinking",
                    "closed": True,
                    "thinking": "Only return the answer when you are confident.",
                },
            ],
        },
        {"role": "user", "content": "What is 2+2?"},
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "Let me think about it."},
                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
                {
                    "type": "text",
                    "text": "The answer is 4.",
                },
            ],
        },
        {"role": "user", "content": "Thanks, what is 3+3?"},
    ]
    # NOTE(review): this id is not joined with `models_path_prefix` like the
    # module-level model ids are -- confirm that is intentional.
    mistral_tokenizer = MistralTokenizer.from_pretrained(
        "mistralai/Magistral-Small-2509"
    )

    tokens_ids = apply_mistral_chat_template(
        mistral_tokenizer, messages, chat_template=None, tools=None
    )

    # Keep special tokens so control markers show up in the decoded string.
    string_tokens = mistral_tokenizer.mistral.decode(
        tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
    )

    expected_tokens = (
        r"<s>[SYSTEM_PROMPT]You are a helpful assistant.[THINK]Only return the"
        r" answer when you are confident.[/THINK][/SYSTEM_PROMPT]"
        r"[INST]What is 2+2?[/INST]"
        r"Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>"
        r"[INST]Thanks, what is 3+3?[/INST]"
    )

    assert string_tokens == expected_tokens
2514
2515
2516
2517
2518
2519
2520


def test_parse_chat_messages_single_empty_audio_with_uuid(
    qwen2_audio_model_config,
):
    """Check an ``input_audio`` part with an empty payload and a uuid.

    The audio placeholder must still be inserted into the string content,
    one audio item must be counted, and the supplied uuid must be reported.
    """
    audio_uuid = "abcd"
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {},
                        "uuid": audio_uuid,
                    },
                    {"type": "text", "text": "What does the audio say?"},
                ],
            }
        ],
        qwen2_audio_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
            "audio say?",
        }
    ]
    _assert_mm_data_inputs(mm_data, {"audio": 1})
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
2547
2548
2549
2550
2551
2552
2553
2554


@pytest.mark.asyncio
async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
    qwen2_audio_model_config,
):
    """Async variant: ``input_audio`` part with an empty payload and a uuid.

    Uses ``parse_chat_messages_futures``; the multimodal data is awaited
    before being checked. Expectations match the synchronous test.
    """
    audio_uuid = "abcd"
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {},
                        "uuid": audio_uuid,
                    },
                    {"type": "text", "text": "What does the audio say?"},
                ],
            }
        ],
        qwen2_audio_model_config,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
            "audio say?",
        }
    ]
    _assert_mm_data_inputs(await mm_future, {"audio": 1})
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])