test_chat_utils.py 68.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import warnings
5
from collections.abc import Mapping
6
from typing import Literal
7
8

import pytest
9
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
10

11
from vllm.assets.audio import AudioAsset
12
from vllm.assets.image import ImageAsset
13
from vllm.assets.video import VideoAsset
14
from vllm.config import ModelConfig
15
16
from vllm.entrypoints.chat_utils import (
    _try_extract_ast,
17
    apply_mistral_chat_template,
18
19
20
21
22
23
24
    load_chat_template,
    parse_chat_messages,
    parse_chat_messages_futures,
    resolve_chat_template_content_format,
    resolve_chat_template_kwargs,
    resolve_hf_chat_template,
)
25
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
26
27
28
29
30
from vllm.multimodal.utils import (
    encode_audio_base64,
    encode_image_base64,
    encode_video_base64,
)
31
from vllm.transformers_utils.tokenizer import get_tokenizer
Julien Denize's avatar
Julien Denize committed
32
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
33

34
from ..models.registry import HF_EXAMPLE_MODELS
35
36
37
38
from ..utils import VLLM_PATH

# The ``examples`` directory located directly under ``VLLM_PATH``.
EXAMPLES_DIR = VLLM_PATH / "examples"

39
# Model identifier strings used by this test module (presumably Hugging Face
# repository IDs -- confirm against the model registry).
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
QWEN3_MODEL_ID = "Qwen/Qwen3-8B"
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
49
50


51
@pytest.fixture(scope="function")
def phi3v_model_config():
    """Return a ``ModelConfig`` for ``PHI3V_MODEL_ID``.

    The config uses the ``generate`` runner, trusts remote code and sets
    ``limit_mm_per_prompt`` to two images.
    """
    return ModelConfig(
        PHI3V_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt={
            "image": 2,
        },
    )
61
62


63
64
@pytest.fixture(scope="function")
def phi3v_model_config_mm_interleaved():
    """Return a ``ModelConfig`` for ``PHI3V_MODEL_ID`` with interleaving enabled.

    Same settings as the plain Phi-3.5-vision config in this module, plus
    ``interleave_mm_strings=True``.
    """
    return ModelConfig(
        PHI3V_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        interleave_mm_strings=True,
        limit_mm_per_prompt={
            "image": 2,
        },
    )
74
75


76
77
78
79
80
81
82
83
84
85
86
87
88
@pytest.fixture(scope="function")
def phi3v_model_config_image_embeds():
    """Return a ``ModelConfig`` for ``PHI3V_MODEL_ID`` with ``enable_mm_embeds``.

    The config uses the ``generate`` runner, trusts remote code and sets
    ``limit_mm_per_prompt`` to two images.
    """
    mm_limits = {"image": 2}
    config = ModelConfig(
        PHI3V_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt=mm_limits,
        enable_mm_embeds=True,
    )
    return config


89
90
@pytest.fixture(scope="module")
def phi3v_tokenizer():
    """Return the tokenizer loaded via ``get_tokenizer`` for ``PHI3V_MODEL_ID``."""
    return get_tokenizer(PHI3V_MODEL_ID)
92
93


94
95
96
97
98
99
100
101
102
103
104
105
106
107
@pytest.fixture(scope="function")
def qwen2_audio_model_config():
    """Return a ``ModelConfig`` for ``QWEN2AUDIO_MODEL_ID``.

    The config uses the ``generate`` runner, trusts remote code and sets
    ``limit_mm_per_prompt`` to one audio item.
    """
    mm_limits = {"audio": 1}
    config = ModelConfig(
        QWEN2AUDIO_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt=mm_limits,
    )
    return config


@pytest.fixture(scope="module")
def qwen2_audio_tokenizer():
    """Return the tokenizer loaded via ``get_tokenizer`` for ``QWEN2AUDIO_MODEL_ID``."""
    return get_tokenizer(QWEN2AUDIO_MODEL_ID)
109
110


111
112
@pytest.fixture(scope="function")
def qwen25omni_model_config_mm_interleaved():
    """Return a ``ModelConfig`` for ``QWEN25OMNI_MODEL_ID`` with interleaving.

    The config uses the ``generate`` runner, ``interleave_mm_strings=True``
    and sets ``limit_mm_per_prompt`` to two images, one audio and one video.
    """
    return ModelConfig(
        QWEN25OMNI_MODEL_ID,
        runner="generate",
        interleave_mm_strings=True,
        limit_mm_per_prompt={
            "image": 2,
            "audio": 1,
            "video": 1,
        },
    )
123
124
125
126


@pytest.fixture(scope="module")
def qwen25omni_tokenizer():
    """Return the tokenizer loaded via ``get_tokenizer`` for ``QWEN25OMNI_MODEL_ID``."""
    return get_tokenizer(QWEN25OMNI_MODEL_ID)
128
129


130
131
@pytest.fixture(scope="function")
def mistral_model_config():
    """Return a ``ModelConfig`` for ``MISTRAL_MODEL_ID``.

    The config uses the ``generate`` runner and sets ``limit_mm_per_prompt``
    to two images.
    """
    return ModelConfig(
        MISTRAL_MODEL_ID,
        runner="generate",
        limit_mm_per_prompt={
            "image": 2,
        },
    )
139
140
141
142


@pytest.fixture(scope="module")
def mistral_tokenizer():
    """Return the tokenizer loaded via ``get_tokenizer`` for ``MISTRAL_MODEL_ID``."""
    return get_tokenizer(MISTRAL_MODEL_ID)
144
145


146
147
@pytest.fixture(scope="module")
def image_url():
    """Return a ``data:image/jpeg;base64,...`` URL for the ``cherry_blossom`` asset."""
    image = ImageAsset("cherry_blossom")
    # Named ``encoded`` rather than ``base64`` to avoid shadowing the stdlib
    # module name.
    encoded = encode_image_base64(image.pil_image)
    return f"data:image/jpeg;base64,{encoded}"


153
154
@pytest.fixture(scope="module")
def video_url():
    """Return a ``data:video/jpeg;base64,...`` URL for the ``baby_reading`` asset."""
    video = VideoAsset("baby_reading", 1)
    # Named ``encoded`` rather than ``base64`` to avoid shadowing the stdlib
    # module name.
    encoded = encode_video_base64(video.np_ndarrays)
    return f"data:video/jpeg;base64,{encoded}"


@pytest.fixture(scope="module")
def audio_url():
    """Return a ``data:audio/ogg;base64,...`` URL for the ``mary_had_lamb`` asset."""
    audio = AudioAsset("mary_had_lamb")
    # Named ``encoded`` rather than ``base64`` to avoid shadowing the stdlib
    # module name.
    encoded = encode_audio_base64(*audio.audio_and_sample_rate)
    return f"data:audio/ogg;base64,{encoded}"


167
def _assert_mm_data_is_image_input(
168
    mm_data: MultiModalDataDict | None,
169
    image_count: int,
170
    skipped_image_indices: list | None = None,
171
172
173
174
175
176
177
) -> None:
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}

    image_data = mm_data.get("image")
    assert image_data is not None

178
    assert isinstance(image_data, list) and len(image_data) == image_count
179
180
181
    if skipped_image_indices is not None:
        for i in skipped_image_indices:
            assert image_data[i] is None
182
183


184
def _assert_mm_uuids(
185
    mm_uuids: MultiModalUUIDDict | None,
186
    media_count: int,
187
    expected_uuids: list[str | None],
188
189
190
191
192
193
194
195
196
    modality: str = "image",
) -> None:
    if len(expected_uuids) > 0:
        assert mm_uuids is not None
        assert modality in mm_uuids

        image_uuids = mm_uuids.get(modality)
        assert image_uuids is not None

197
        assert isinstance(image_uuids, list) and len(image_uuids) == media_count
198
199
200
201
202
203

        assert image_uuids == expected_uuids
    else:
        assert mm_uuids is None


204
205
206
207
208
# Names of the modalities that the count-based assertion helper accepts.
ModalityType = Literal[
    "image",
    "video",
    "audio",
]
# Mapping from modality name to the number of items expected for it.
MultiModalDataCounts = Mapping[ModalityType, int]


def _assert_mm_data_inputs(
209
    mm_data: MultiModalDataDict | None,
210
    data_count: MultiModalDataCounts,
211
    skipped_media_indices: dict[str, list] | None = None,  # modality -> list[int]
212
213
214
215
216
217
218
219
220
) -> None:
    assert mm_data is not None
    assert set(data_count.keys()) == (set(mm_data.keys()))

    for modality, n in data_count.items():
        modality_data = mm_data.get(modality)
        assert modality_data is not None
        assert isinstance(modality_data, list) and len(modality_data) == n

221
        if skipped_media_indices is not None:
222
            skipped_media_indices_for_modality = skipped_media_indices.get(modality)
223
224
225
226
            assert skipped_media_indices_for_modality is not None
            for i in skipped_media_indices_for_modality:
                assert modality_data[i] is None

227

228
229
230
231
232
def test_parse_chat_messages_single_image(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """One image URL plus text yields one ``<|image_1|>`` placeholder, no UUID."""
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_single_image_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A ``uuid`` given next to ``image_url`` is returned in ``mm_uuids``."""
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


289
290
291
292
293
294
295
def test_parse_chat_messages_single_empty_image_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """``image_url: None`` with a ``uuid`` yields a ``None`` image entry."""
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


321
322
323
324
325
326
327
def test_parse_chat_messages_single_image_with_bad_uuid_format(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A UUID nested in ``image_url`` or under an unknown key is not picked up."""
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                            "uuid": image_uuid,
                        },
                        "bad_uuid_key": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])


def test_parse_chat_messages_multiple_images_with_uuids(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Two images with UUIDs yield two placeholders and both UUIDs in order."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


402
403
404
405
406
407
408
409
410
def test_parse_chat_messages_multiple_empty_images_with_uuids(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Two ``image_url: None`` parts with UUIDs yield two ``None`` image entries."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


def test_parse_chat_messages_mixed_empty_images_with_uuids(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A real image followed by an empty one leaves only index 1 as ``None``."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


488
489
490
491
492
493
494
495
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_with_uuid_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: a ``uuid`` next to ``image_url`` is returned in ``mm_uuids``."""
    image_uuid = str(hash(image_url))
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(await mm_future, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


521
522
523
524
525
526
527
528
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_with_uuid_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: ``image_url: None`` with a ``uuid`` gives a ``None`` entry."""
    image_uuid = str(hash(image_url))
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(await mm_future, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


554
555
556
557
558
559
560
561
562
563
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: URL and PIL images with UUIDs keep both UUIDs in order."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


597
598
599
600
601
602
603
604
605
606
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: empty URL and PIL images with UUIDs give ``None`` entries."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": image_uuid1,
                    },
                    {
                        "type": "image_pil",
                        "image_pil": None,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_future, 2, skipped_image_indices=[0, 1])
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


640
641
642
643
644
645
646
647
648
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: an image without a UUID is reported as ``None``."""
    image_uuid2 = "my_uuid_2"

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                    },
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                        "uuid": image_uuid2,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2])
679
680


681
682
683
684
685
def test_parse_chat_messages_empty_system(
    mistral_model_config,
    mistral_tokenizer,
):
    """An empty system message is kept in both ``string`` and ``openai`` formats."""
    # Test string format
    conversation, _, _ = parse_chat_messages(
        [
            {"role": "system", "content": ""},
            {
                "role": "user",
                "content": [{"type": "text", "text": "Who are you?"}],
            },
        ],
        mistral_model_config,
        mistral_tokenizer,
        content_format="string",
    )
    assert conversation == [
        {"role": "system", "content": ""},
        {"role": "user", "content": "Who are you?"},
    ]

    # Test openai format
    conversation, _, _ = parse_chat_messages(
        [
            {"role": "system", "content": ""},
            {
                "role": "user",
                "content": [{"type": "text", "text": "Who are you?"}],
            },
        ],
        mistral_model_config,
        mistral_tokenizer,
        content_format="openai",
    )
    assert conversation == [
        {"role": "system", "content": [{"type": "text", "text": ""}]},
        {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
    ]
720
721


722
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: one image URL plus text yields one placeholder, no UUID."""
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What's in the image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in the image?"}
    ]
    _assert_mm_data_is_image_input(await mm_future, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
748
749
750
751
752
753
754


def test_parse_chat_messages_multiple_images(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A URL image and a PIL image yield two placeholders and two ``None`` UUIDs."""
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
782
783


784
785
786
787
788
789
def test_parse_chat_messages_empty_pil_image_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
):
    """``image_pil: None`` with a ``uuid`` yields a ``None`` image entry."""
    uuid = "abcd"
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_pil", "image_pil": None, "uuid": uuid},
                    {"type": "text", "text": "What's in this image?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


def test_parse_chat_messages_empty_image_embeds_with_uuid(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """``image_embeds: None`` with a ``uuid`` leaves ``mm_data["image"]`` as ``None``."""
    uuid = "abcd"
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_embeds", "image_embeds": None, "uuid": uuid},
                    {"type": "text", "text": "What's in this image?"},
                ],
            }
        ],
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]
    assert mm_data is not None
    assert "image" in mm_data
    assert mm_data["image"] is None
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """Futures variant: ``image_embeds: None`` leaves ``mm_data["image"]`` as ``None``."""
    uuid = "abcd"
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_embeds", "image_embeds": None, "uuid": uuid},
                    {"type": "text", "text": "What's in this image?"},
                ],
            }
        ],
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]
    mm_data = await mm_future
    assert mm_data is not None
    assert "image" in mm_data
    assert mm_data["image"] is None
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])


880
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: URL and PIL images yield two placeholders, no UUIDs."""
    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "image_pil",
                        "image_pil": ImageAsset("cherry_blossom").pil_image,
                    },
                    {"type": "text", "text": "What's in these images?"},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_future, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
913
914
915
916
917
918
919


def test_parse_chat_messages_placeholder_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Placeholders already present in the text are not prepended again."""
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "text",
                        "text": "What's in <|image_1|> and how does it compare to <|image_2|>?",  # noqa: E501
                    },
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )
    assert conversation == [
        {
            "role": "user",
            "content": "What's in <|image_1|> and how does it compare to <|image_2|>?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
946
947


948
949
950
951
952
def test_parse_chat_messages_placeholder_one_already_in_prompt(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Only the placeholder missing from the text (``<|image_2|>``) is prepended."""
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "text",
                        "text": "What's in <|image_1|> and how does it compare to "
                        "the other one?",
                    },
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
            "the other one?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
982
983


984
985
986
987
988
def test_parse_chat_messages_multiple_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Image placeholders are numbered across the whole conversation.

    Each of the two user turns carries one image; the expected placeholders
    are ``<|image_1|>`` in the first turn and ``<|image_2|>`` in the second,
    while the plain-string assistant turn passes through unchanged.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What's in this image?"},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What about this one?"},
                ],
            },
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
1019
1020


1021
def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """User-supplied image UUIDs are reported back across multiple turns.

    Both user turns attach the same image with the same ``uuid``; the
    conversation text is expected to match the no-UUID variant while
    ``mm_uuids`` carries the UUID once per image.
    """
    # hash() of a str is only stable within one interpreter process, which is
    # sufficient here because the value is produced and compared in this test.
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What's in this image?"},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "What about this one?"},
                ],
            },
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {"role": "user", "content": "<|image_1|>\nWhat's in this image?"},
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "<|image_2|>\nWhat about this one?"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])


def test_parse_chat_messages_context_text_format(
    phi3v_model_config,
    phi3v_tokenizer,
):
    """With ``content_format="openai"`` every message becomes a list of parts.

    The input mixes a list-of-parts message with two plain-string messages;
    all three are expected to come back as ``[{"type": "text", ...}]`` lists.
    No media is supplied, so both multimodal outputs are expected to be None.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [{"type": "text", "text": "What's in this text?"}],
            },
            {"role": "assistant", "content": "Some stuff."},
            {"role": "user", "content": "What about this one?"},
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="openai",
    )

    assert conversation == [
        {
            "role": "user",
            "content": [{"type": "text", "text": "What's in this text?"}],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "Some stuff."}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": "What about this one?"}],
        },
    ]
    assert mm_data is None
    assert mm_uuids is None
1101
1102


1103
1104
1105
1106
1107
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images in a single message must raise ``ValueError("At most ...")``."""
    with warnings.catch_warnings():
        # Silence the "never awaited" RuntimeWarning for the image-loading
        # coroutine; presumably it is left pending when parsing aborts with
        # the expected error -- TODO confirm against chat_utils.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited",
        )
        with pytest.raises(ValueError, match="At most"):
            parse_chat_messages(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {"type": "text", "text": "What's in these images?"},
                        ],
                    }
                ],
                phi3v_model_config,
                phi3v_tokenizer,
                content_format="string",
            )
1139
1140


1141
1142
1143
1144
1145
def test_parse_chat_messages_rejects_too_many_images_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Three images spread over two user turns must raise ``ValueError``.

    The first turn carries one image and the second carries two, so the
    "At most" error is expected to be triggered by the conversation total
    rather than by any single message.
    """
    with warnings.catch_warnings():
        # Silence the "never awaited" RuntimeWarning for the image-loading
        # coroutine; presumably it is left pending when parsing aborts with
        # the expected error -- TODO confirm against chat_utils.
        warnings.filterwarnings(
            "ignore",
            message="coroutine 'async_get_and_parse_image' was never awaited",
        )
        with pytest.raises(ValueError, match="At most"):
            parse_chat_messages(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {"type": "text", "text": "What's in this image?"},
                        ],
                    },
                    {"role": "assistant", "content": "Some stuff."},
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": image_url},
                            },
                            {"type": "text", "text": "What about these two?"},
                        ],
                    },
                ],
                phi3v_model_config,
                phi3v_tokenizer,
                content_format="string",
            )
1184
1185
1186
1187
1188
1189
1190


def test_parse_chat_messages_multiple_images_uncommon_input(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Shorthand content parts (bare string, untyped image dict) are accepted.

    The content list holds a bare ``str`` instead of a ``{"type": "text"}``
    part and ``{"image_url": <url>}`` dicts without a ``"type"`` key. The
    expected output is the same as for the fully spelled-out form: both
    image placeholders first, then the text.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    "What's in these images?",
                    {"image_url": image_url},
                    {"image_url": image_url},
                ],
            }
        ],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
1215
1216


1217
1218
1219
1220
1221
def test_parse_chat_messages_multiple_images_interleave(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """With the interleaved config, placeholders follow the content order.

    Text and image parts alternate in the request; the expected string keeps
    that order, with each image replaced by its numbered placeholder, instead
    of moving all placeholders to the front.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "I need you to compare this image",
                    },
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "and this one"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Do they have differences?"},
                ],
            }
        ],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
            "Do they have differences?",
        }
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
1252
1253
1254
1255
1256
1257
1258
1259


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_interleave_async(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """Async counterpart of the interleaved multi-image test.

    Uses ``parse_chat_messages_futures``; the conversation and UUIDs are
    available immediately, while ``mm_data`` must be awaited before it can
    be inspected.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "I need you to compare this image",
                    },
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "and this one"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Do they have differences?"},
                ],
            }
        ],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
            "Do they have differences?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """Async interleaved multi-image parsing reports user-supplied UUIDs.

    Same request shape as the no-UUID async test, but each image part carries
    a ``uuid``; the text output is expected to be unchanged and ``mm_uuids``
    to list that UUID once per image.
    """
    # hash() of a str is only stable within one interpreter process, which is
    # sufficient here because the value is produced and compared in this test.
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages_futures(
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "I need you to compare this image",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "and this one"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "Do they have differences?"},
                ],
            }
        ],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
            "Do they have differences?",
        }
    ]
    _assert_mm_data_is_image_input(await mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
1337
1338
1339
1340
1341
1342
1343


def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """Interleaved placeholders keep their position across several turns.

    Each user turn has one image placed after its first text part; the
    expected placeholders are ``<|image_1|>`` and ``<|image_2|>`` at those
    positions, including the case where the image is the last part.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Be accurate."},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            },
        ],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
        },
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "What's on this image?\n<|image_2|>"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


1380
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """Interleaved multi-turn parsing reports the UUID given for each image.

    Same conversation as the no-UUID multi-turn interleave test, with a
    ``uuid`` attached to both image parts; only ``mm_uuids`` is expected to
    differ.
    """
    # hash() of a str is only stable within one interpreter process, which is
    # sufficient here because the value is produced and compared in this test.
    image_uuid = str(hash(image_url))
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                    {"type": "text", "text": "Be accurate."},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": image_uuid,
                    },
                ],
            },
        ],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
        },
        {"role": "assistant", "content": "Some stuff."},
        {"role": "user", "content": "What's on this image?\n<|image_2|>"},
    ]
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
1428
1429
1430


def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    """Image, audio and video parts are interleaved in place over two turns.

    The first user turn mixes text with an image and an audio clip, the
    second mixes text with an image and a video. Each media part is expected
    to be replaced, at its original position, by the placeholder string shown
    in the assertion. No UUIDs are supplied, so all are expected to be None.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "Now listen to this audio"},
                    {"type": "audio_url", "audio_url": {"url": audio_url}},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "And what's in the video?"},
                    {"type": "video_url", "video_url": {"url": video_url}},
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=[None, None])
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


1484
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    """Per-modality UUIDs are reported for interleaved mixed-media turns.

    Same conversation as the no-UUID mixed-modality test, but every media
    part carries a ``uuid``. The text output is expected to be unchanged and
    ``mm_uuids`` to hold the given UUIDs grouped by modality.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "Now listen to this audio"},
                    {
                        "type": "audio_url",
                        "audio_url": {"url": audio_url},
                        "uuid": "audio_123",
                    },
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "And what's in the video?"},
                    {
                        "type": "video_url",
                        "video_url": {"url": video_url},
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(
        mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"]
    )
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"])
1554
1555


1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    """Media parts given as ``None`` plus a UUID still yield placeholders.

    Every media part passes ``None`` for its payload and only a ``uuid``.
    The expected text is identical to the test that supplies real URLs, the
    UUIDs are expected to be reported per modality, and every media index is
    passed to ``_assert_mm_data_inputs`` as skipped.

    The ``image_url``, ``video_url`` and ``audio_url`` fixtures are requested
    by the signature but not referenced in the body.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "Now listen to this audio"},
                    {
                        "type": "audio_url",
                        "audio_url": None,
                        "uuid": "audio_123",
                    },
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": None,
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "And what's in the video?"},
                    {
                        "type": "video_url",
                        "video_url": None,
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(
        mm_data,
        {"image": 2, "video": 1, "audio": 1},
        skipped_media_indices={"image": [0, 1], "video": [0], "audio": [0]},
    )
    _assert_mm_uuids(
        mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"]
    )
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"])
1630
1631


1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    """Media parts without a ``uuid`` are reported as ``None`` next to given ones.

    Only the first image and the video carry a ``uuid``; the second image and
    the audio clip do not. ``mm_uuids`` is expected to keep one slot per media
    item, with ``None`` where no UUID was supplied.
    """
    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                        "uuid": "image_123",
                    },
                    {"type": "text", "text": "Now listen to this audio"},
                    {"type": "audio_url", "audio_url": {"url": audio_url}},
                ],
            },
            {"role": "assistant", "content": "Some stuff."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's on this image?"},
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "And what's in the video?"},
                    {
                        "type": "video_url",
                        "video_url": {"url": video_url},
                        "uuid": "video_123",
                    },
                ],
            },
        ],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    assert conversation == [
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
        },
        {"role": "assistant", "content": "Some stuff."},
        {
            "role": "user",
            "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
            "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=["image_123", None])
    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
1692
1693
1694
1695
1696
1697
1698
1699


def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """Hand-written placeholders combined with interleaving must be rejected.

    Two images are supplied and the text part also contains ``<|image_1|>``
    and ``<|image_2|>`` literally; with the interleaved config the call is
    expected to raise ``ValueError`` about having more placeholders than
    multimodal items.
    """
    # ``match`` is applied with ``re.search``, so the "|" characters of the
    # placeholder and the trailing "." are escaped. Unescaped, "|" acts as
    # regex alternation and the check would pass on any message that merely
    # contains "image_1".
    with pytest.raises(
        ValueError,
        match=r"Found more '<\|image_1\|>' placeholders in input prompt "
        r"than actual multimodal data items\.",
    ):
        parse_chat_messages(
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {
                            "type": "text",
                            "text": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
                            "Do they have differences?",
                        },
                    ],
                }
            ],
            phi3v_model_config_mm_interleaved,
            phi3v_tokenizer,
            content_format="string",
        )


1725
1726
1727
1728
1729
@pytest.mark.parametrize(
    "model",
    [
        QWEN2VL_MODEL_ID,  # tokenizer.chat_template is of type str
        HERMES_MODEL_ID,  # tokenizer.chat_template is of type dict
    ],
)
@pytest.mark.parametrize("use_tools", [True, False])
def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
    """Check that the resolved chat template is always a ``str``.

    The two parametrized models cover a tokenizer whose ``chat_template`` is
    a plain string and one whose ``chat_template`` is a dict; both are run
    with and without a tool definition. The test is skipped when the model
    is not available online.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Build the tokenizer
    tokenizer = get_tokenizer(
        model,
        trust_remote_code=model_config.trust_remote_code,
    )

    # A single dummy function tool, or no tools at all, depending on the
    # ``use_tools`` parameter.
    tools = (
        [
            {
                "type": "function",
                "function": {
                    "name": "dummy_function_name",
                    "description": "This is a dummy function",
                    "parameters": sample_json_schema,
                },
            }
        ]
        if use_tools
        else None
    )

    # Test detecting the tokenizer's chat_template
    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        model_config=model_config,
    )
    assert isinstance(chat_template, str)


1783
1784
1785
1786
1787
1788
@pytest.mark.parametrize(
    "model, expected_kwargs",
    [
        (
            QWEN2VL_MODEL_ID,
            {
                "tools",
                "continue_final_message",
                "add_generation_prompt",
                "add_vision_id",
            },
        ),
        (
            QWEN3_MODEL_ID,
            {
                "tools",
                "continue_final_message",
                "add_generation_prompt",
                "enable_thinking",
            },
        ),
    ],
)
def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwargs):
    """Check which request kwargs survive chat-template kwarg resolution.

    With default settings, a request carrying `chat_template` or `tokenize`
    must be rejected with a `ValueError`. With `raise_on_unexpected=False`,
    the resolved kwargs must have exactly the keys in `expected_kwargs`.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail="skip")

    dummy_tool = {
        "type": "function",
        "function": {
            "name": "dummy_function_name",
            "description": "This is a dummy function",
            "parameters": sample_json_schema,
        },
    }
    tools = [dummy_tool]

    request_kwargs = {
        # neither of these is used
        "unsed_kwargs_1": 123,
        "unsed_kwargs_2": "abc",
        # these two should not appear in a request
        "chat_template": "{% Hello world! %}",
        "tokenize": True,
        # used by the tokenizer
        "continue_final_message": True,
        "tools": tools,
        # used by both Qwen2-VL and Qwen3
        "add_generation_prompt": True,
        # used by Qwen2-VL only
        "add_vision_id": True,
        # used by Qwen3 only
        "enable_thinking": True,
    }

    embed_inputs_only = hf_info.require_embed_inputs
    model_config = ModelConfig(
        model,
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        revision=hf_info.revision,
        trust_remote_code=hf_info.trust_remote_code,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=embed_inputs_only,
        enable_prompt_embeds=embed_inputs_only,
        enable_mm_embeds=embed_inputs_only,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )

    # Build the tokenizer
    tokenizer = get_tokenizer(
        model, trust_remote_code=model_config.trust_remote_code
    )

    # Let the tokenizer supply its own chat template
    detected_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        model_config=model_config,
    )

    # `chat_template` / `tokenize` inside the request kwargs must be rejected
    expected_error = "Found unexpected chat template kwargs from request"
    with pytest.raises(ValueError, match=expected_error):
        resolve_chat_template_kwargs(
            tokenizer,
            chat_template=detected_template,
            chat_template_kwargs=request_kwargs,
        )

    # Same call again, this time with raising disabled
    resolved = resolve_chat_template_kwargs(
        tokenizer,
        chat_template=detected_template,
        chat_template_kwargs=request_kwargs,
        raise_on_unexpected=False,
    )
    assert set(resolved) == expected_kwargs


1886
1887
# NOTE: Qwen2-Audio's default chat template is defined inside the
# processor class rather than in `tokenizer_config.json`.
1888
1889
@pytest.mark.parametrize(
    ("model", "expected_format"),
    [
        (PHI3V_MODEL_ID, "string"),
        (QWEN2VL_MODEL_ID, "openai"),
        (QWEN25VL_MODEL_ID, "openai"),
        (ULTRAVOX_MODEL_ID, "string"),
        (QWEN2AUDIO_MODEL_ID, "openai"),
        (LLAMA_GUARD_MODEL_ID, "openai"),
    ],
)
def test_resolve_content_format_hf_defined(model, expected_format):
    """Check the content format resolved in "auto" mode for each model.

    No explicit chat template is passed, so resolution has to work from the
    tokenizer and model config alone. The detected template and its AST are
    printed to help debug failures.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail="skip")

    embed_inputs_only = hf_info.require_embed_inputs
    model_config = ModelConfig(
        model,
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        revision=hf_info.revision,
        trust_remote_code=hf_info.trust_remote_code,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=embed_inputs_only,
        enable_prompt_embeds=embed_inputs_only,
        enable_mm_embeds=embed_inputs_only,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )

    tokenizer = get_tokenizer(
        model, trust_remote_code=model_config.trust_remote_code
    )

    # Test detecting the tokenizer's chat_template
    detected_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,
        model_config=model_config,
    )
    assert isinstance(detected_template, str)

    # Debug output, shown by pytest when the test fails
    print("[TEXT]")
    print(detected_template)
    print("[AST]")
    print(_try_extract_ast(detected_template))

    no_template = None  # Test detecting the tokenizer's chat_template
    no_tools = None
    resolved_format = resolve_chat_template_content_format(
        no_template,
        no_tools,
        "auto",
        tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format


@pytest.mark.parametrize(
    ("model", "expected_format"),
    [
        ("Salesforce/blip2-opt-2.7b", "string"),
        ("facebook/chameleon-7b", "string"),
        ("deepseek-ai/deepseek-vl2-tiny", "string"),
        ("adept/fuyu-8b", "string"),
        ("google/paligemma-3b-mix-224", "string"),
        ("Qwen/Qwen-VL", "string"),
        ("Qwen/Qwen-VL-Chat", "string"),
    ],
)
def test_resolve_content_format_fallbacks(model, expected_format):
    """Check the content format resolved in "auto" mode for the listed models.

    The tokenizer is loaded from `model_config.tokenizer`. A chat template
    must still resolve to a string, and the resolved content format must
    equal `expected_format`.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail="skip")

    embed_inputs_only = hf_info.require_embed_inputs
    model_config = ModelConfig(
        model,
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        revision=hf_info.revision,
        trust_remote_code=hf_info.trust_remote_code,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=embed_inputs_only,
        enable_prompt_embeds=embed_inputs_only,
        enable_mm_embeds=embed_inputs_only,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )

    tokenizer = get_tokenizer(
        model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )

    # Test detecting the tokenizer's chat_template
    detected_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,
        model_config=model_config,
    )
    assert isinstance(detected_template, str)

    # Debug output, shown by pytest when the test fails
    print("[TEXT]")
    print(detected_template)
    print("[AST]")
    print(_try_extract_ast(detected_template))

    no_template = None  # Test detecting the tokenizer's chat_template
    no_tools = None
    resolved_format = resolve_chat_template_content_format(
        no_template,
        no_tools,
        "auto",
        tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format


@pytest.mark.parametrize(
    ("template_path", "expected_format"),
    [
        ("template_alpaca.jinja", "string"),
        ("template_baichuan.jinja", "string"),
        ("template_chatglm.jinja", "string"),
        ("template_chatglm2.jinja", "string"),
        ("template_chatml.jinja", "string"),
        ("template_dse_qwen2_vl.jinja", "openai"),
        ("template_falcon_180b.jinja", "string"),
        ("template_falcon.jinja", "string"),
        ("template_inkbot.jinja", "string"),
        ("template_teleflm.jinja", "string"),
        ("template_vlm2vec_phi3v.jinja", "openai"),
        ("template_vlm2vec_qwen2vl.jinja", "openai"),
        ("tool_chat_template_granite_20b_fc.jinja", "string"),
        ("tool_chat_template_hermes.jinja", "string"),
        ("tool_chat_template_internlm2_tool.jinja", "string"),
        ("tool_chat_template_llama3.1_json.jinja", "openai"),
        ("tool_chat_template_llama3.2_json.jinja", "openai"),
        ("tool_chat_template_mistral_parallel.jinja", "string"),
        ("tool_chat_template_mistral.jinja", "string"),
    ],
)
def test_resolve_content_format_examples(template_path, expected_format):
    """Check the content format resolved for each example Jinja template.

    Each template is loaded from `EXAMPLES_DIR` and passed in explicitly.
    The Phi-3.5-vision model is only a stand-in for building a config and a
    tokenizer, and the tokenizer's own chat template is cleared.
    """
    stand_in_model = PHI3V_MODEL_ID  # Dummy

    model_config = ModelConfig(
        stand_in_model,
        tokenizer=stand_in_model,
        trust_remote_code=True,
    )

    dummy_tokenizer = get_tokenizer(
        stand_in_model,
        trust_remote_code=model_config.trust_remote_code,
    )
    # Clear the tokenizer's own template; the example template is passed in below
    dummy_tokenizer.chat_template = None

    example_template = load_chat_template(EXAMPLES_DIR / template_path)
    assert isinstance(example_template, str)

    # Debug output, shown by pytest when the test fails
    print("[TEXT]")
    print(example_template)
    print("[AST]")
    print(_try_extract_ast(example_template))

    no_tools = None
    resolved_format = resolve_chat_template_content_format(
        example_template,
        no_tools,
        "auto",
        dummy_tokenizer,
        model_config=model_config,
    )

    assert resolved_format == expected_format
Julien Denize's avatar
Julien Denize committed
2061
2062


2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
def test_parse_chat_messages_include_thinking_chunk(
    mistral_model_config, mistral_tokenizer
):
    """Check that `thinking` chunks are parsed into plain text parts.

    With `content_format="openai"`, each `thinking` item is expected to come
    back as a `text` item holding the thinking string. The plain-string user
    message is expected to come back as a one-item list of text parts.
    """
    system_text = "You are a helpful assistant."
    system_thought = "Only return the answer when you are confident."
    question = "What is 2+2?"
    preamble = "Let me think about it."
    assistant_thought = "2+2 = 4"
    answer = "The answer is 4."

    def text_part(value):
        return {"type": "text", "text": value}

    def thinking_part(value):
        return {"type": "thinking", "closed": True, "thinking": value}

    messages = [
        {
            "role": "system",
            "content": [text_part(system_text), thinking_part(system_thought)],
        },
        {"role": "user", "content": question},
        {
            "role": "assistant",
            "content": [
                text_part(preamble),
                thinking_part(assistant_thought),
                text_part(answer),
            ],
        },
    ]

    parsed = parse_chat_messages(
        messages,
        mistral_model_config,
        mistral_tokenizer,
        content_format="openai",
    )
    # Only the conversation matters here; the other two results are ignored
    conversation_with_thinking, _, _ = parsed

    expected_conversation = [
        {
            "role": "system",
            "content": [text_part(system_text), text_part(system_thought)],
        },
        {
            "role": "user",
            "content": [text_part(question)],
        },
        {
            "role": "assistant",
            "content": [
                text_part(preamble),
                text_part(assistant_thought),
                text_part(answer),
            ],
        },
    ]

    assert conversation_with_thinking == expected_conversation


def test_apply_mistral_chat_template_thinking_chunk():
    """Check how the Mistral chat template renders `thinking` chunks.

    The messages are tokenized with the Magistral tokenizer, then decoded
    with special tokens kept. The decoded string is expected to wrap each
    thinking chunk in `[THINK]...[/THINK]`.
    """
    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a helpful assistant."},
            {
                "type": "thinking",
                "closed": True,
                "thinking": "Only return the answer when you are confident.",
            },
        ],
    }
    first_question = {"role": "user", "content": "What is 2+2?"}
    assistant_reply = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": "Let me think about it."},
            {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
            {"type": "text", "text": "The answer is 4."},
        ],
    }
    follow_up = {"role": "user", "content": "Thanks, what is 3+3?"}
    messages = [system_message, first_question, assistant_reply, follow_up]

    mistral_tokenizer = MistralTokenizer.from_pretrained(
        "mistralai/Magistral-Small-2509"
    )

    tokens_ids = apply_mistral_chat_template(
        mistral_tokenizer, messages, chat_template=None, tools=None
    )

    # Keep special tokens so the control markers show up in the decoded text
    string_tokens = mistral_tokenizer.mistral.decode(
        tokens_ids, special_token_policy=SpecialTokenPolicy.KEEP
    )

    # One entry per rendered turn, in conversation order
    expected_tokens = "".join(
        [
            "<s>[SYSTEM_PROMPT]You are a helpful assistant."
            "[THINK]Only return the answer when you are confident.[/THINK]"
            "[/SYSTEM_PROMPT]",
            "[INST]What is 2+2?[/INST]",
            "Let me think about it.[THINK]2+2 = 4[/THINK]The answer is 4.</s>",
            "[INST]Thanks, what is 3+3?[/INST]",
        ]
    )

    assert string_tokens == expected_tokens
2175
2176
2177
2178
2179
2180
2181
2182


def test_parse_chat_messages_single_empty_audio_with_uuid(
    qwen2_audio_model_config,
    qwen2_audio_tokenizer,
):
    """Parse one user message whose audio part has an empty payload and a UUID.

    The rendered string content is expected to include the audio placeholder
    before the question. The multimodal data is expected to report one audio
    item, and the UUIDs are expected to carry the supplied UUID.
    """
    audio_uuid = "abcd"
    question = "What does the audio say?"

    empty_audio_part = {
        "type": "input_audio",
        "input_audio": {},
        "uuid": audio_uuid,
    }
    user_message = {
        "role": "user",
        "content": [empty_audio_part, {"type": "text", "text": question}],
    }

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [user_message],
        qwen2_audio_model_config,
        qwen2_audio_tokenizer,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": f"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n{question}",
        }
    ]
    assert conversation == expected_conversation

    _assert_mm_data_inputs(mm_data, {"audio": 1})
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
2210
2211
2212
2213
2214
2215
2216
2217
2218


@pytest.mark.asyncio
async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
    qwen2_audio_model_config,
    qwen2_audio_tokenizer,
):
    """Async variant: parse one empty-payload audio part that carries a UUID.

    Uses `parse_chat_messages_futures`, so the multimodal data is awaited
    before it is checked. The expected conversation string and UUIDs are the
    same as asserted for the synchronous parser.
    """
    audio_uuid = "abcd"
    question = "What does the audio say?"

    empty_audio_part = {
        "type": "input_audio",
        "input_audio": {},
        "uuid": audio_uuid,
    }
    user_message = {
        "role": "user",
        "content": [empty_audio_part, {"type": "text", "text": question}],
    }

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [user_message],
        qwen2_audio_model_config,
        qwen2_audio_tokenizer,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": f"Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n{question}",
        }
    ]
    assert conversation == expected_conversation

    # Await the multimodal data before checking it
    mm_data = await mm_future
    _assert_mm_data_inputs(mm_data, {"audio": 1})
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])