vision_language_multi_image.py 32.9 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
8

9
import os
10
from argparse import Namespace
11
from dataclasses import asdict
12
from typing import NamedTuple, Optional
13

14
from huggingface_hub import snapshot_download
15
from PIL.Image import Image
16
from transformers import AutoProcessor, AutoTokenizer
17

18
from vllm import LLM, EngineArgs, SamplingParams
19
from vllm.lora.request import LoRARequest
20
21
22
23
24
25
26
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser

# Default question asked about the images.
QUESTION = "What is the content of each image?"

# Sample images (all hosted on Wikimedia Commons) used as multi-image input.
IMAGE_URLS = [
    # Birds and wildlife
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
    # Domestic animals
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]


40
class ModelRequestData(NamedTuple):
    """Bundle returned by every ``load_*`` helper in this file.

    The three required fields are always set by the loaders; the optional
    ones default to ``None`` and are only filled in by loaders that need them.
    """

    # Engine configuration for the selected model.
    engine_args: EngineArgs
    # Fully formatted text prompt, including the image placeholders.
    prompt: str
    # Images fetched for the prompt, in the same order as the input URLs.
    image_data: list[Image]

    # Token ids at which generation should stop (model specific).
    stop_token_ids: Optional[list[int]] = None
    # Explicit chat template, for loaders that pass their own.
    chat_template: Optional[str] = None
    # LoRA adapters to use with the request.
    lora_requests: Optional[list[LoRARequest]] = None
47
48


49
50
51
52
53
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


54
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``rhymes-ai/Aria``.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the input images; one placeholder line is
            emitted per URL.

    Returns:
        Engine arguments, the hand-written prompt, the fetched images and
        the hard-coded stop token ids for this model.
    """
    image_count = len(image_urls)

    engine_args = EngineArgs(
        model="rhymes-ai/Aria",
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": image_count},
    )

    # One placeholder line per image, placed in front of the question.
    image_markers = "".join(
        "<fim_prefix><|img|><fim_suffix>\n" for _ in range(image_count)
    )
    prompt = (
        f"<|im_start|>user\n{image_markers}{question}<|im_end|>\n<|im_start|>assistant\n"
    )

    images = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=[93532, 93653, 944, 93421, 1019, 93653, 93519],
        image_data=images,
    )
75

76

Jennifer Zhao's avatar
Jennifer Zhao committed
77
78
79
80
81
82
83
84
85
86
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``CohereForAI/aya-vision-8b``.

    The prompt is rendered with the chat template of the model's processor.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "CohereForAI/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


110
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``deepseek-ai/deepseek-vl2-tiny``.

    Args:
        question: Text question placed after the numbered image lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the hand-written prompt and the fetched images.
    """
    image_count = len(image_urls)

    engine_args = EngineArgs(
        model="deepseek-ai/deepseek-vl2-tiny",
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": image_count},
    )

    # "image_1:<image>\nimage_2:<image>\n..." -- numbering starts at 1.
    numbered_images = "".join(
        f"image_{idx}:<image>\n" for idx in range(1, image_count + 1)
    )
    prompt = f"<|User|>: {numbered_images}{question}\n\n<|Assistant|>:"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


133
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``google/gemma-3-4b-it``.

    The prompt is rendered with the chat template of the model's processor.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


167
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``h2oai/h2ovl-mississippi-800m``.

    The prompt is rendered with the tokenizer's chat template and the
    tokenizer's EOS token id is used as the only stop token.

    Args:
        question: Text question placed after the numbered image lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt, stop token ids and the
        fetched images.
    """
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    # Each entry already ends with a newline and entries are joined with
    # another one, exactly as in the upstream prompt format.
    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)
    ]
    user_turn = {
        "role": "user",
        "content": "\n".join(image_lines) + f"\n{question}",
    }

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    eos_only = [tokenizer.eos_token_id]

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=eos_only,
        image_data=images,
    )


200
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``HuggingFaceM4/Idefics3-8B-Llama3``.

    Args:
        question: Text question placed after the numbered image lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the hand-written prompt and the fetched images.
    """
    # If you are running out of memory, you can reduce the "longest_edge".
    # See: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
    processor_kwargs = {"size": {"longest_edge": 2 * 364}}

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs=processor_kwargs,
    )

    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)
    ]
    placeholders = "\n".join(image_lines)
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


228
229
230
231
232
233
234
235
236
237
238
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``HuggingFaceTB/SmolVLM2-2.2B-Instruct``.

    Args:
        question: Text question placed after the numbered image lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the hand-written prompt and the fetched images.
    """
    processor_kwargs = {"max_image_size": {"longest_edge": 384}}

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs=processor_kwargs,
    )

    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)
    ]
    placeholders = "\n".join(image_lines)
    prompt = (
        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


256
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``OpenGVLab/InternVL2-2B``.

    The prompt is rendered with the tokenizer's chat template.

    Args:
        question: Text question placed after the numbered image lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt, stop token ids and the
        fetched images.
    """
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    candidate_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in stop_tokens]
    # A token missing from this variant's vocabulary can map to None; drop it
    # so that only real ids are passed on as stop tokens.
    stop_token_ids = [tok_id for tok_id in candidate_ids if tok_id is not None]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
290
291


292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-1.5-7b-hf``.

    The prompt is rendered with the chat template of the model's processor.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    # NOTE: CAUTION! The original Llava models weren't really trained on
    # multi-image inputs, so they will generate poor responses for them!
    model_name = "llava-hf/llava-1.5-7b-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-v1.6-mistral-7b-hf``.

    The prompt is rendered with the chat template of the model's processor.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``llava-hf/llava-onevision-qwen2-7b-ov-hf``.

    The prompt is rendered with the chat template of the model's processor.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


392
393
394
395
396
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``meta-llama/Llama-4-Scout-17B-16E-Instruct``.

    The prompt is rendered with the chat template of the model's processor.
    The engine is configured with ``tensor_parallel_size=8``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Kwai-Keye/Keye-VL-8B-Preview``.

    The prompt is rendered with the chat template of the model's processor,
    which is loaded with ``trust_remote_code=True``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


463
464
465
466
467
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``moonshotai/Kimi-VL-A3B-Instruct``.

    The prompt is rendered with the chat template of the model's processor,
    which is loaded with ``trust_remote_code=True``.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``mistralai/Mistral-Small-3.1-24B-Instruct-2503``.

    Args:
        question: Text question placed before the image placeholders.
        image_urls: URLs of the input images; one ``[IMG]`` is emitted per URL.

    Returns:
        Engine arguments, the hand-written prompt and the fetched images.
    """
    image_count = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": image_count},
    )

    image_markers = "".join("[IMG]" for _ in range(image_count))
    prompt = f"<s>[INST]{question}\n{image_markers}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


520
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``meta-llama/Llama-3.2-11B-Vision-Instruct``.

    The prompt contains one ``<|image|>`` placeholder per entry in
    ``image_urls``, so the number of placeholders always matches both
    ``limit_mm_per_prompt`` and the number of fetched images. For exactly
    two images the original hard-coded wording is kept unchanged.

    Args:
        question: Text question placed after the image preamble. A ``?`` is
            appended to it, as in the original prompt.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the hand-written prompt and the fetched images.
    """
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    if len(image_urls) == 2:
        # Original two-image wording, kept byte-for-byte.
        img_prompt = "Given the first image <|image|> and the second image<|image|>"
    else:
        # Any other count: enumerate the images so that every one of them
        # gets its own placeholder.
        img_prompt = "Given " + " and ".join(
            f"image {i} <|image|>" for i, _ in enumerate(image_urls, start=1)
        )
    prompt = f"<|begin_of_text|>{img_prompt}, {question}?"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )


540
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``nvidia/NVLM-D-72B``.

    The prompt is rendered with the tokenizer's chat template.

    Args:
        question: Text question placed after the numbered image lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)
    ]
    user_turn = {
        "role": "user",
        "content": "\n".join(image_lines) + f"\n{question}",
    }

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


570
571
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``AIDC-AI/Ovis2-1B``.

    The prompt is rendered with the tokenizer's chat template.

    Args:
        question: Text question placed after the numbered image lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the fetched images.
    """
    model_name = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)
    ]
    user_turn = {
        "role": "user",
        "content": "\n".join(image_lines) + f"\n{question}",
    }

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


600
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``mistral-community/pixtral-12b``.

    Args:
        question: Text question placed before the image placeholders.
        image_urls: URLs of the input images; one ``[IMG]`` is emitted per URL.

    Returns:
        Engine arguments, the hand-written prompt and the fetched images.
    """
    image_count = len(image_urls)

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model="mistral-community/pixtral-12b",
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": image_count},
    )

    image_markers = "".join("[IMG]" for _ in range(image_count))
    prompt = f"<s>[INST]{question}\n{image_markers}[/INST]"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


622
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``microsoft/Phi-3.5-vision-instruct``.

    Args:
        question: Text question placed after the image tags.
        image_urls: URLs of the input images; tags are numbered from 1.

    Returns:
        Engine arguments, the hand-written prompt and the fetched images.
    """
    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    processor_kwargs = {"num_crops": 4}

    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs=processor_kwargs,
    )

    # "<|image_1|>\n<|image_2|>\n..." -- one tag per image, newline separated.
    image_tags = [f"<|image_{idx}|>" for idx in range(1, len(image_urls) + 1)]
    placeholders = "\n".join(image_tags)
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )


655
656
657
658
659
660
661
662
663
664
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``microsoft/Phi-4-multimodal-instruct``.

    Phi-4-multimodal-instruct supports both image and audio inputs; this
    helper shows how to process multi-image inputs. The model snapshot is
    downloaded first so the vision LoRA inside it can be referenced by path.

    Args:
        question: Text question placed after the image tags.
        image_urls: URLs of the input images; tags are numbered from 1.

    Returns:
        Engine arguments, the hand-written prompt, the fetched images and
        the vision LoRA request.
    """
    snapshot_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")

    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(snapshot_dir, "vision-lora")

    engine_args = EngineArgs(
        model=snapshot_dir,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    # "<|image_1|><|image_2|>..." -- one tag per image, no separator.
    image_tags = [f"<|image_{idx}|>" for idx in range(1, len(image_urls) + 1)]
    placeholders = "".join(image_tags)
    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

    images = [fetch_image(url) for url in image_urls]
    vision_lora = LoRARequest("vision", 1, vision_lora_path)

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
        lora_requests=[vision_lora],
    )


688
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen-VL-Chat``.

    The tokenizer of this model ships without a chat template, so a ChatML
    template is passed explicitly and also returned in the result.

    Args:
        question: Text question placed after the numbered picture lines.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt, stop token ids, the fetched
        images and the chat template that was used.
    """
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    placeholders = "".join(
        f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1)
    )

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    candidate_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in stop_tokens]
    # A token missing from the vocabulary can map to None; drop it so that
    # only real ids are passed on as stop tokens.
    stop_token_ids = [tok_id for tok_id in candidate_ids if tok_id is not None]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )


731
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen2-VL-7B-Instruct``.

    When the optional ``qwen_vl_utils`` package is importable, every image
    is resized with ``smart_resize`` and a 4096-token context is requested.
    Otherwise a warning is printed, images are used as fetched and the
    context length is 32768.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the (possibly resized)
        images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    can_resize = smart_resize is not None
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096 if can_resize else 32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # System turn, then one user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_content},
    ]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    def _load_image(url: str) -> Image:
        """Fetch one image and resize it when ``smart_resize`` is available."""
        image = fetch_image(url)
        if not can_resize:
            return image
        width, height = image.size
        # smart_resize takes and returns (height, width); PIL wants (w, h).
        new_height, new_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((new_width, new_height))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[_load_image(url) for url in image_urls],
    )
788
789


790
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for ``Qwen/Qwen2.5-VL-3B-Instruct``.

    When the optional ``qwen_vl_utils`` package is importable, every image
    is resized with ``smart_resize`` and a 4096-token context is requested.
    Otherwise a warning is printed, images are used as fetched and the
    context length is 32768.

    Args:
        question: Text question appended after the image entries.
        image_urls: URLs of the input images.

    Returns:
        Engine arguments, the rendered prompt and the (possibly resized)
        images.
    """
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    can_resize = smart_resize is not None
    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096 if can_resize else 32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # System turn, then one user turn: every image first, the question last.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_content},
    ]

    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    def _load_image(url: str) -> Image:
        """Fetch one image and resize it when ``smart_resize`` is available."""
        image = fetch_image(url)
        if not can_resize:
            return image
        width, height = image.size
        # smart_resize takes and returns (height, width); PIL wants (w, h).
        new_height, new_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((new_width, new_height))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[_load_image(url) for url in image_urls],
    )


汪志鹏's avatar
汪志鹏 committed
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for the Tarsier-7b multi-image example.

    The prompt is assembled by hand: one ``<image>`` placeholder per URL,
    followed by the question, in a ``USER: ... ASSISTANT:`` layout.

    Args:
        question: Text question appended after the image placeholders.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The engine arguments, prompt string and fetched images bundled in a
        ``ModelRequestData``.
    """
    num_images = len(image_urls)
    checkpoint = "omni-research/Tarsier-7b"

    tarsier_engine_args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=4096,
        # Allow exactly as many images per prompt as were requested.
        limit_mm_per_prompt={"image": num_images},
    )

    image_slots = "<image>" * num_images
    prompt_text = f"USER: {image_slots}\n{question}\n ASSISTANT:"

    fetched_images = [fetch_image(image_url) for image_url in image_urls]

    return ModelRequestData(
        engine_args=tarsier_engine_args,
        prompt=prompt_text,
        image_data=fetched_images,
    )


868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the request data for the Tarsier2-Recap-7b multi-image example.

    The prompt is a hand-written ``<|im_start|>``/``<|im_end|>`` chat layout
    with one ``<|image_pad|>`` token per image inside a single
    ``<|vision_start|>``/``<|vision_end|>`` pair.

    Args:
        question: Text question placed after the vision tokens.
        image_urls: URLs of the images to fetch and attach to the prompt.

    Returns:
        The engine arguments, prompt string and fetched images bundled in a
        ``ModelRequestData``.
    """
    num_images = len(image_urls)
    checkpoint = "omni-research/Tarsier2-Recap-7b"

    tarsier2_engine_args = EngineArgs(
        model=checkpoint,
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": num_images},
        # Override the architecture name reported by the checkpoint config.
        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
    )

    image_pads = "<|image_pad|>" * num_images
    system_turn = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_turn = (
        f"<|im_start|>user\n<|vision_start|>{image_pads}"
        f"<|vision_end|>{question}<|im_end|>\n"
    )
    prompt_text = system_turn + user_turn + "<|im_start|>assistant\n"

    fetched_images = [fetch_image(image_url) for image_url in image_urls]

    return ModelRequestData(
        engine_args=tarsier2_engine_args,
        prompt=prompt_text,
        image_data=fetched_images,
    )


894
# Registry of the supported examples. Each key is a value accepted by the
# ``--model-type`` CLI option; each value is a loader that is called as
# ``loader(question, image_urls)`` to build the request data for that model.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "keye_vl": load_keye_vl,
    "kimi_vl": load_kimi_vl,
    "llava": load_llava,
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "llama4": load_llama4,
    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "smolvlm": load_smolvlm,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
}


924
def run_generate(
    model: str, question: str, image_urls: list[str], seed: Optional[int]
):
    """Run the example through ``LLM.generate`` with a pre-built prompt.

    The loader registered for ``model`` builds the engine arguments, the
    prompt string and the image data; the generated text of every output is
    printed, separated by dashed lines.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Text question passed to the loader.
        image_urls: Image URLs passed to the loader.
        seed: Seed forwarded to the ``LLM`` constructor (may be ``None``).

    Raises:
        KeyError: If ``model`` is not a key of ``model_example_map``.
    """
    req_data = model_example_map[model](question, image_urls)

    # Use the ``seed`` parameter, not the module-level ``args`` namespace:
    # that global only exists when this file is executed as a script, so
    # relying on it ignores the caller's seed and breaks imported use.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    # Greedy decoding with a bounded answer length.
    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
948
949


950
def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
    """Run the example through ``LLM.chat`` using image URLs in the message.

    Only the engine arguments, stop tokens, chat template and LoRA requests
    produced by the loader are used here; the conversation itself is built
    from ``question`` and ``image_urls`` as a single user message.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Text part of the user message.
        image_urls: URLs sent as ``image_url`` parts of the user message.
        seed: Seed forwarded to the ``LLM`` constructor (may be ``None``).
    """
    request = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
    loader_limits = request.engine_args.limit_mm_per_prompt or {}
    request.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
        **dict(loader_limits),
    }

    llm_kwargs = {**asdict(request.engine_args), "seed": seed}
    llm = LLM(**llm_kwargs)

    greedy_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=request.stop_token_ids
    )

    # One user turn: the question first, then one entry per image URL.
    text_part = {"type": "text", "text": question}
    image_parts = [
        {"type": "image_url", "image_url": {"url": url}} for url in image_urls
    ]
    conversation = [{"role": "user", "content": [text_part, *image_parts]}]

    outputs = llm.chat(
        conversation,
        sampling_params=greedy_params,
        chat_template=request.chat_template,
        lora_request=request.lora_requests,
    )

    separator = "-" * 50
    print(separator)
    for result in outputs:
        print(result.outputs[0].text)
        print(separator)
994
995


996
def parse_args():
    """Parse the command-line options of the multi-image demo.

    Returns:
        The parsed namespace with ``model_type``, ``method``, ``seed`` and
        ``num_images`` attributes.
    """
    description = (
        "Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    parser = FlexibleArgumentParser(description=description)

    # (flags, keyword options) pairs, registered in declaration order.
    option_specs = [
        (
            ("--model-type", "-m"),
            {
                "type": str,
                "default": "phi3_v",
                "choices": model_example_map.keys(),
                "help": 'Huggingface "model_type".',
            },
        ),
        (
            ("--method",),
            {
                "type": str,
                "default": "generate",
                "choices": ["generate", "chat"],
                "help": "The method to run in `vllm.LLM`.",
            },
        ),
        (
            ("--seed",),
            {
                "type": int,
                "default": None,
                "help": "Set the seed when initializing `vllm.LLM`.",
            },
        ),
        (
            ("--num-images", "-n"),
            {
                "type": int,
                # the max number of images
                "choices": list(range(1, len(IMAGE_URLS) + 1)),
                "default": 2,
                "help": "Number of images to use for the demo.",
            },
        ),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()

1033

1034
1035
1036
1037
1038
def main(args: Namespace):
    """Run the demo selected by the parsed command-line options.

    Args:
        args: Namespace providing ``model_type``, ``method``, ``seed`` and
            ``num_images``.

    Raises:
        ValueError: If ``args.method`` is neither ``"generate"`` nor
            ``"chat"``.
    """
    model_type, method, seed = args.model_type, args.method, args.seed

    # Use only the first ``num_images`` entries of the predefined URL list.
    selected_urls = IMAGE_URLS[: args.num_images]

    if method not in ("generate", "chat"):
        raise ValueError(f"Invalid method: {method}")

    runner = run_generate if method == "generate" else run_chat
    runner(model_type, QUESTION, selected_urls, seed)


if __name__ == "__main__":
    # Script entry point: parse the CLI options, then run the selected demo.
    args = parse_args()
    main(args)