vision_language_multi_image.py 34.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
8

9
import os
10
from argparse import Namespace
11
from dataclasses import asdict
12
from typing import NamedTuple, Optional
13

14
from huggingface_hub import snapshot_download
15
from PIL.Image import Image
16
from transformers import AutoProcessor, AutoTokenizer
17

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.lora.request import LoRARequest
20
21
22
23
24
25
26
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

# Default question asked about the supplied images.
QUESTION = "What is the content of each image?"

# Pool of sample image URLs (all hosted on Wikimedia Commons).
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]


40
class ModelRequestData(NamedTuple):
    """Bundle returned by every ``load_*`` function in this file."""

    # Engine configuration built by the loader for its model.
    engine_args: EngineArgs
    # Fully formatted prompt text, including the model's image placeholders.
    prompt: str
    # Input images, in the same order as the URLs given to the loader.
    image_data: list[Image]
    # Stop token ids chosen by the loader; None when the loader sets none.
    stop_token_ids: Optional[list[int]] = None
    # Explicit chat template string (set by load_qwen_vl_chat); None otherwise.
    chat_template: Optional[str] = None
    # LoRA requests to apply (set by load_phi4mm); None otherwise.
    lora_requests: Optional[list[LoRARequest]] = None
47
48


49
50
51
52
53
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


54
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``rhymes-ai/Aria``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one placeholder per URL.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, the stop token ids, and one fetched image per URL.
    """
    model_name = "rhymes-ai/Aria"
    engine_args = EngineArgs(
        model=model_name,
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One placeholder line per image, emitted before the question.
    placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
    prompt = (
        f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    # NOTE(review): 93653 is listed twice; kept exactly as in the original.
    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
75

76

Jennifer Zhao's avatar
Jennifer Zhao committed
77
78
79
80
81
82
83
84
85
86
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``CohereForAI/aya-vision-8b``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image entry per URL, followed by the question as the last entry.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    # tokenize=False: the prompt is handed over as text, not token ids.
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


110
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``deepseek-ai/deepseek-vl2-tiny``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered placeholder each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, and one fetched image per URL.
    """
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Numbered from 1: "image_1:<image>\n", "image_2:<image>\n", ...
    placeholder = "".join(
        f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


133
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``google/gemma-3-4b-it``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image entry per URL, followed by the question as the last entry.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


167
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``h2oai/h2ovl-mississippi-800m``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered placeholder each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the tokenizer's chat template, the tokenizer's EOS id as the only
        stop token, and one fetched image per URL.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive entries.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )


200
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``HuggingFaceM4/Idefics3-8B-Llama3``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered placeholder each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, and one fetched image per URL.
    """
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 2 * 364},
        },
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive entries.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


228
229
230
231
232
233
234
235
236
237
238
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``HuggingFaceTB/SmolVLM2-2.2B-Instruct``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered placeholder each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, and one fetched image per URL.
    """
    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive entries.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = (
        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


256
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``OpenGVLab/InternVL2-2B``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered placeholder each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the tokenizer's chat template, the ids of the model's stop
        tokens, and one fetched image per URL.
    """
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive entries.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL.
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
290
291


292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """Build the multi-image request for HyperCLOVAX-SEED-Vision-Instruct-3B.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the tokenizer's chat template, no stop token ids, and one fetched
        image per URL.
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Every image entry carries the extra (empty) OCR / lens keyword fields.
    image_entries = [
        {
            "type": "image",
            "image": url,
            "ocr": "",
            "lens_keywords": "",
            "lens_local_keywords": "",
        }
        for url in image_urls
    ]
    user_message = {
        "role": "user",
        "content": [*image_entries, {"type": "text", "text": question}],
    }

    rendered_prompt = tokenizer.apply_chat_template(
        [user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
    )


339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-1.5-7b-hf``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    # NOTE: CAUTION! The original LLaVA models weren't really trained on
    # multi-image inputs, so responses for multi-image inputs will be poor!
    model_name = "llava-hf/llava-1.5-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries first, then the question as the final text entry.
    image_entries = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_entries + [{"type": "text", "text": question}],
    }

    rendered_prompt = AutoProcessor.from_pretrained(
        model_name
    ).apply_chat_template([user_turn], tokenize=False, add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=images,
    )


def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-v1.6-mistral-7b-hf``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries first, then the question as the final text entry.
    image_entries = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_entries + [{"type": "text", "text": question}],
    }

    rendered_prompt = AutoProcessor.from_pretrained(
        model_name
    ).apply_chat_template([user_turn], tokenize=False, add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=images,
    )


def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-onevision-qwen2-7b-ov-hf``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries first, then the question as the final text entry.
    image_entries = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_entries + [{"type": "text", "text": question}],
    }

    rendered_prompt = AutoProcessor.from_pretrained(
        model_name
    ).apply_chat_template([user_turn], tokenize=False, add_generation_prompt=True)

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=images,
    )


439
440
441
442
443
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``meta-llama/Llama-4-Scout-17B-16E-Instruct``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image entry per URL, followed by the question as the last entry.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Kwai-Keye/Keye-VL-8B-Preview``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Image entries first, then the question as the final text entry.
    image_entries = [{"type": "image", "image": url} for url in image_urls]
    user_turn = {
        "role": "user",
        "content": image_entries + [{"type": "text", "text": question}],
    }

    rendered_prompt = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True
    ).apply_chat_template([user_turn], tokenize=False, add_generation_prompt=True)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


510
511
512
513
514
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``moonshotai/Kimi-VL-A3B-Instruct``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one fetched image per URL.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image entry per URL, followed by the question as the last entry.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


545
546
547
548
549
550
551
552
553
554
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``mistralai/Mistral-Small-3.1-24B-Instruct-2503``.

    Args:
        question: Text question placed before the image placeholders.
        image_urls: URLs of the input images; one ``[IMG]`` per URL.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, and one fetched image per URL.
    """
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        ignore_patterns=["consolidated.safetensors"],
    )

    # Here the placeholders follow the question rather than precede it.
    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


568
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``meta-llama/Llama-3.2-11B-Vision-Instruct``.

    The prompt contains one ``<|image|>`` placeholder per entry in
    ``image_urls``, so the placeholder count always matches ``image_data``
    and ``limit_mm_per_prompt``. The original two-image wording is kept
    verbatim when exactly two URLs are given.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, and one fetched image per URL.
    """
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    num_images = len(image_urls)
    if num_images == 2:
        # Original wording, preserved byte-for-byte for the two-image case.
        img_prompt = "Given the first image <|image|> and the second image<|image|>"
    else:
        # Previously the prompt always had two placeholders, whatever the
        # number of images; emit exactly one placeholder per image instead.
        img_prompt = "Given the images " + "<|image|>" * num_images
    # NOTE(review): a "?" is appended even when `question` already ends with
    # one; left unchanged so the existing prompt text is not altered.
    prompt = f"<|begin_of_text|>{img_prompt}, {question}?"
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


588
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``nvidia/NVLM-D-72B``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered placeholder each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the tokenizer's chat template, and one fetched image per URL.
    """
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive entries.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


618
619
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``AIDC-AI/Ovis2-1B``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered placeholder each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the tokenizer's chat template, and one fetched image per URL.
    """
    model_name = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Each entry already ends with "\n", so joining with "\n" leaves a blank
    # line between consecutive entries.
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


648
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``mistral-community/pixtral-12b``.

    Args:
        question: Text question placed before the image placeholders.
        image_urls: URLs of the input images; one ``[IMG]`` per URL.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, and one fetched image per URL.
    """
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Here the placeholders follow the question rather than precede it.
    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


670
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``microsoft/Phi-3.5-vision-instruct``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one ``<|image_i|>`` tag each,
            numbered from 1.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, and one fetched image per URL.
    """
    model_name = "microsoft/Phi-3.5-vision-instruct"

    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"num_crops": 4},
    )
    placeholders = "\n".join(
        f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


703
704
705
706
707
708
709
710
711
712
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``microsoft/Phi-4-multimodal-instruct``.

    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process multi-image inputs.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one ``<|image_i|>`` tag each,
            numbered from 1.

    Returns:
        ``ModelRequestData`` with the engine arguments, the hand-built
        prompt, one fetched image per URL, and a LoRA request pointing at
        the ``vision-lora`` directory of the downloaded snapshot.
    """

    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    placeholders = "".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )


736
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen-VL-Chat``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one numbered
            ``Picture i: <img></img>`` line each.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        with an explicit ChatML template, the ids of the stop tokens, one
        fetched image per URL, and the chat template itself.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = "".join(
        f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1)
    )

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )


779
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen2-VL-7B-Instruct``.

    When ``qwen_vl_utils`` is importable, every fetched image is resized to
    the dimensions returned by ``smart_resize`` (capped at
    ``1024 * 28 * 28`` pixels) and ``max_model_len`` is 4096. Otherwise a
    warning is printed, images are used as fetched, and ``max_model_len``
    is 32768.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and one image per URL.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        # Sentinel: None means "no resizing available" for the code below.
        smart_resize = None

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        # Without resizing, a much larger context length is configured.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image entry per URL, followed by the question as the last entry.
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if smart_resize is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:

        def post_process_image(image: Image) -> Image:
            # PIL reports (width, height); smart_resize takes and returns
            # (height, width), hence the swapped order on both sides.
            width, height = image.size
            resized_height, resized_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            return image.resize((resized_width, resized_height))

        image_data = [post_process_image(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
836
837


838
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen2.5-VL-3B-Instruct``.

    The prompt is rendered with the model's own chat template. When the
    optional ``qwen_vl_utils`` package is importable, every fetched image is
    resized to the dimensions returned by ``smart_resize`` and a 4096-token
    context is requested; otherwise the images are passed through unchanged
    and the context is raised to 32768 tokens.

    Args:
        question: Text appended after the image entries in the user turn.
        image_urls: URLs of the images to fetch and attach, in order.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the rendered
        prompt and the list of fetched images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    can_resize = smart_resize is not None
    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        # Images that are not resized get the larger context length.
        max_model_len=4096 if can_resize else 32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # User turn: one image entry per URL, followed by the question text.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_content},
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_data = []
    for url in image_urls:
        image = fetch_image(url)
        if can_resize:
            # PIL reports (width, height); smart_resize takes and returns
            # (height, width).
            width, height = image.size
            new_height, new_width = smart_resize(
                height, width, max_pixels=1024 * 28 * 28
            )
            image = image.resize((new_width, new_height))
        image_data.append(image)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )


汪志鹏's avatar
汪志鹏 committed
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``omni-research/Tarsier-7b``.

    Args:
        question: Text placed after the image placeholders in the prompt.
        image_urls: URLs of the images to fetch and attach, in order.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt and
        the list of fetched images.
    """
    num_images = len(image_urls)
    # One `<image>` placeholder per input image, concatenated without
    # separators.
    image_tokens = "<image>" * num_images

    return ModelRequestData(
        engine_args=EngineArgs(
            model="omni-research/Tarsier-7b",
            trust_remote_code=True,
            max_model_len=4096,
            limit_mm_per_prompt={"image": num_images},
        ),
        prompt=f"USER: {image_tokens}\n{question}\n ASSISTANT:",
        image_data=[fetch_image(url) for url in image_urls],
    )


916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``omni-research/Tarsier2-Recap-7b``.

    The engine is created with an ``hf_overrides`` entry that sets the
    architecture list to ``Tarsier2ForConditionalGeneration``.

    Args:
        question: Text placed after the vision section of the user turn.
        image_urls: URLs of the images to fetch and attach, in order.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt and
        the list of fetched images.
    """
    num_images = len(image_urls)
    # One `<|image_pad|>` placeholder per input image, wrapped in a single
    # vision start/end pair.
    image_pads = "<|image_pad|>" * num_images

    engine_args = EngineArgs(
        model="omni-research/Tarsier2-Recap-7b",
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": num_images},
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    )

    prompt = "".join(
        [
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
            f"<|im_start|>user\n<|vision_start|>{image_pads}",
            f"<|vision_end|>{question}<|im_end|>\n",
            "<|im_start|>assistant\n",
        ]
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


942
# Maps each value accepted by `--model-type` to the loader that builds the
# request data for that model. Each loader is called as
# `loader(question, image_urls)`.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "keye_vl": load_keye_vl,
    "kimi_vl": load_kimi_vl,
    "llava": load_llava,
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "llama4": load_llama4,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
}


973
def run_generate(
    model: str, question: str, image_urls: list[str], seed: Optional[int]
):
    """Run the example through `LLM.generate` and print the completions.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Question passed to the selected loader.
        image_urls: URLs of the images passed to the selected loader.
        seed: Seed forwarded to the `LLM` constructor; may be `None`.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the `seed` argument rather than the module-level `args` namespace,
    # which only exists when this file is executed as a script.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
997
998


999
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run the example through `LLM.chat` and print the completions.

    Only the engine arguments, stop tokens, chat template and LoRA requests
    of the selected loader are used here; the user message is rebuilt from
    `question` and `image_urls`.

    Args:
        model: Key into `model_example_map` selecting the loader to use.
        question: Text part of the user message.
        image_urls: URLs sent as `image_url` parts of the user message.
        seed: Seed forwarded to the `LLM` constructor; may be `None`.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory; limits set by the loader
    # take precedence over these zero defaults.
    mm_limits = {"image": 0, "video": 0, "audio": 0}
    mm_limits.update(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = mm_limits

    llm_kwargs = asdict(req_data.engine_args)
    llm_kwargs["seed"] = seed
    llm = LLM(**llm_kwargs)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    # The text part comes first, followed by one image part per URL.
    user_content = [{"type": "text", "text": question}]
    user_content.extend(
        {"type": "image_url", "image_url": {"url": url}} for url in image_urls
    )

    outputs = llm.chat(
        [{"role": "user", "content": user_content}],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    separator = "-" * 50
    print(separator)
    for output in outputs:
        print(output.outputs[0].text)
        print(separator)
1043
1044


1045
def parse_args():
    """Parse the command-line options of this example.

    Returns:
        The parsed namespace with `model_type`, `method`, `seed` and
        `num_images` attributes.
    """
    description = (
        "Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    parser = FlexibleArgumentParser(description=description)

    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    # The number of bundled image URLs bounds how many images can be chosen.
    max_images = len(IMAGE_URLS)
    parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=list(range(1, max_images + 1)),
        default=2,
        help="Number of images to use for the demo.",
    )

    return parser.parse_args()

1082

1083
1084
1085
1086
1087
def main(args: Namespace):
    """Run the demo with the selected model type and inference method.

    Args:
        args: Parsed options providing `model_type`, `method`, `seed` and
            `num_images`.

    Raises:
        ValueError: If `args.method` is neither "generate" nor "chat".
    """
    model, method, seed = args.model_type, args.method, args.seed
    # Use only the first `num_images` of the bundled image URLs.
    image_urls = IMAGE_URLS[: args.num_images]

    if method not in ("generate", "chat"):
        raise ValueError(f"Invalid method: {method}")

    runner = run_generate if method == "generate" else run_chat
    runner(model, QUESTION, image_urls, seed)


if __name__ == "__main__":
    # `args` is bound at module level here, so it is visible as a global to
    # the functions defined above.
    args = parse_args()
    main(args)